# House pricing-Kaggle 
## by: Guillermo Campollo
### 5/16/2020

## Importing our libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler



## Importing our datasets and splitting X and Y

In [2]:
data=pd.read_csv('../data/raw/train.csv')
data_predict=pd.read_csv('../data/raw/test.csv')
#data.columns = data.columns.str.replace(' ', '')
X=data.drop(["SalePrice"],1)
y=data.SalePrice.values
#Dropping the 5 lines with NaNs before appending our test set
X=X.dropna(subset=["MSZoning", "SaleType"],axis=0)
sep=len(X) #this should give us our separator
X=X.append(data_predict)
X

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


# Data Preprocessing and Feature Scaling

In [3]:
#Here we might take drop NaN columns to see if it improves
X=X.drop(columns=["Alley","PoolQC","Fence","MiscFeature"])

### Dummies for categorical

In [4]:
#Initial number of NaNs
columns=list(X.columns) #All column names
objects=X.select_dtypes(include='object').columns
numbers=X.select_dtypes(exclude='object').columns#All the object columns needed to be encoded
for i in objects:
    dummy=pd.get_dummies(X[i])
    X=pd.concat([X,dummy],axis=1)
X=X.drop(columns=objects,axis=1)


### Dealing with numeric variables

In [5]:
#Now we take care of our numerical varaibles by using imputescaler
for i in numbers[1:]:
    sc=SimpleImputer(missing_values=np.nan, strategy='mean')
    X[i]=sc.fit_transform(X[i].values.reshape(-1,1))
sum(X.isnull().sum()) #Now we have no missing values

0

In [6]:
#We drop de Id class
X=X.drop(columns="Id")

### Feature Scaling

In [7]:
for i in numbers[1:]:
    sc=StandardScaler()
    X[i]=sc.fit_transform(X[i].values.reshape(-1,1))
X

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,ConLw,New,Oth,WD,Abnorml,AdjLand,Alloca,Family,Normal,Partial
0,0.067331,-0.202068,-0.217879,0.646183,-0.507284,1.046258,0.896833,0.525202,0.580907,-0.29313,...,0,0,0,1,0,0,0,0,1,0
1,-0.873616,0.501870,-0.072044,-0.063185,2.188279,0.154764,-0.395604,-0.572250,1.178112,-0.29313,...,0,0,0,1,0,0,0,0,1,0
2,0.067331,-0.061280,0.137197,0.646183,-0.507284,0.980221,0.848965,0.334828,0.097873,-0.29313,...,0,0,0,1,0,0,0,0,1,0
3,0.302568,-0.436714,-0.078385,0.646183,-0.507284,-1.859351,-0.682812,-0.572250,-0.494941,-0.29313,...,0,0,0,1,1,0,0,0,0,0
4,0.067331,0.689587,0.518903,1.355551,-0.507284,0.947203,0.753229,1.387486,0.468931,-0.29313,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.419700,-2.266952,-1.043937,-1.481920,1.289758,-0.043346,-0.682812,-0.572250,-0.969192,-0.29313,...,0,0,0,1,0,0,0,0,1,0
1455,2.419700,-2.266952,-1.049263,-1.481920,-0.507284,-0.043346,-0.682812,-0.572250,-0.415899,-0.29313,...,0,0,0,1,1,0,0,0,0,0
1456,-0.873616,4.256207,1.246808,-0.772552,1.289758,-0.373528,0.561757,-0.572250,1.718232,-0.29313,...,0,0,0,1,1,0,0,0,0,0
1457,0.655424,-0.342855,0.034605,-0.772552,-0.507284,0.683057,0.370284,-0.572250,-0.229272,-0.29313,...,0,0,0,1,0,0,0,0,1,0


In [8]:
#Feature scaling our y variable
sc_y=StandardScaler()
y=sc_y.fit_transform(y.reshape(-1,1))
y

array([[ 0.34727322],
       [ 0.00728832],
       [ 0.53615372],
       ...,
       [ 1.07761115],
       [-0.48852299],
       [-0.42084081]])

### Now we split again our data test and separate our train data

In [9]:
data_train=X.iloc[:1460,:]
data_test=X.iloc[1460:,:]

In [10]:
X_train,X_test,y_train,y_test=train_test_split(data_train, y, test_size = 0.2, random_state = 0)

# Creating our SVM Regressor model

In [11]:
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [12]:
y_pred=sc_y.inverse_transform(regressor.predict(X_test))

In [13]:
regressor.score(X_test,y_test)

0.8241294752526499

#### We got 82% Accuracy... Not bad

# Here we create our predictions for submitting to kaggle

In [17]:
y_pred=sc_y.inverse_transform(regressor.predict(data_test)) #Predictions created

In [18]:
results=pd.DataFrame({"Id":data_predict.Id.values,"SalePrice":y_pred}) #Format for csv

In [20]:
results.to_csv('../predictions/house_pricing.csv') #CSV export