**The idea is:**

 - Feature reduction with PCA
 - Data transformation (log, hot encoding, nan)
 - Test different regression models

**Things found:**

- Applying log transformation really increases the accuracy.
- Using PCA with 36 components makes the learning and testing much (much much) faster.
- Removing columns with more than 1000 NaNs gives better result than applying "mean" to them.
- There are outliers. Instead of removing them, using Huber seems to provide a good result. Huber is a model robust to outliers.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.metrics import make_scorer
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.metrics import r2_score
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
#import tflearn
#import tensorflow as tf
import seaborn
import warnings
import subprocess
warnings.filterwarnings('ignore')

from subprocess import check_output
#subprocess.check_output(["ls","C:/Users/yuvalanavim/Desktop/test_regrission"])

In [2]:
data_path='/home/barlesh/Projects/Data_Mining_Project/data/'
train_file='train.csv'
test_file='test.csv'
train_file_new='train_new.csv'
test_file_new='test_new.csv'
result_path='results/'
result_file='res.csv'

## Data Load ##

I mix data and test to manipulate all the data just once. SalePrice is extracted to its own variable "labels". Finally, SalesPrice is remove from data.

In [3]:
train = pd.read_csv(data_path+train_file_new)
labels=train["SalePrice"]
test = pd.read_csv(data_path+test_file_new)
data = pd.concat([train,test],ignore_index=True)
data = data.drop("SalePrice", 1)
ids = test["Id"]

In [4]:
train.head()

Unnamed: 0.1,Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0,1,60,RL,65.0,8450,Pave,0,Reg,Lvl,...,0,,,,0,2,2008,WD,Normal,208500
1,1,2,20,RL,80.0,9600,Pave,0,Reg,Lvl,...,0,,,,0,5,2007,WD,Normal,181500
2,2,3,60,RL,68.0,11250,Pave,0,IR1,Lvl,...,0,,,,0,9,2008,WD,Normal,223500
3,3,4,70,RL,60.0,9550,Pave,0,IR1,Lvl,...,0,,,,0,2,2006,WD,Abnorml,140000
4,4,5,60,RL,84.0,14260,Pave,0,IR1,Lvl,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Count the number of rows in train
train.shape[0]

1460

In [6]:
# Count the number of rows in total
data.shape[0]

2919

In [7]:
# Count the number of NaNs each column has.
nans=pd.isnull(data).sum()
nans[nans>0]

Series([], dtype: int64)

In [7]:
# Remove id and columns with more than a thousand missing values
data=data.drop("Id", 1)
data=data.drop("Alley", 1)
data=data.drop("Fence", 1)
data=data.drop("MiscFeature", 1)
data=data.drop("PoolQC", 1)
data=data.drop("FireplaceQu", 1)

In [8]:
# Count the column types
data.dtypes.value_counts()

object     38
int64      25
float64    11
dtype: int64

## Data Manipulation ##

- Apply hot encoding, convert categorical variable into dummy/indicator variables.
- Fill NaN with median for that column.
- Log transformation.
- Change -inf to 0.

In [9]:
all_columns = data.columns.values
non_categorical = ["LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", 
                   "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", 
                   "2ndFlrSF", "LowQualFinSF", "GrLivArea", "GarageArea", 
                   "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", 
                   "ScreenPorch","PoolArea", "MiscVal"]

categorical = [value for value in all_columns if value not in non_categorical]

In [10]:
# One Hot Encoding and nan transformation
data = pd.get_dummies(data)

imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
data = imp.fit_transform(data)

# Log transformation
data = np.log(data)
labels = np.log(labels)

# Change -inf to 0 again
data[data==-np.inf]=0

## Feature reduction ##

There are many features, so I am going to use PCA to reduce them. The idea is to start with n_components = number of columns. Then select the number of components that add up to 1 variance_ratio.

In [11]:
pca = PCA(whiten=True)
pca.fit(data)
variance = pd.DataFrame(pca.explained_variance_ratio_)
np.cumsum(pca.explained_variance_ratio_)

array([ 0.2248857 ,  0.40281429,  0.52425789,  0.62418823,  0.69580422,
        0.75944463,  0.8116806 ,  0.85647038,  0.89178708,  0.92273755,
        0.94898868,  0.95842727,  0.96637545,  0.97380464,  0.97971901,
        0.98501952,  0.98918839,  0.99199181,  0.99386559,  0.99520919,
        0.99611479,  0.99695667,  0.99771023,  0.99842564,  0.9989402 ,
        0.99933882,  0.99959949,  0.99978254,  0.99988174,  0.99993998,
        0.99998599,  0.99999658,  0.99999871,  0.99999943,  0.99999999,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  1.        ,  1.        ,  1.  

In [12]:
pca = PCA(n_components=36,whiten=True)
pca = pca.fit(data)
dataPCA = pca.transform(data)

## Data Model Selection ##

Simple test to run multiple models against our data. First, with raw features. No PCA.

In [13]:
# Split traing and test
train = data[:1460]
test = data[1460:]

In [14]:
# R2 Score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


def lets_try(train,labels):
    results={}
    def test_model(clf):
        
        cv = KFold(n_splits=5,shuffle=True,random_state=45)
        mse0 = make_scorer(mean_squared_error)
        mse0_val_score = cross_val_score(clf, train, labels, cv=cv,scoring=mse0)
        scores=[mse0_val_score.mean()]
        return scores

    clf = linear_model.LinearRegression()
    results["Linear"]=test_model(clf)
    
    clf = linear_model.Ridge()
    results["Ridge"]=test_model(clf)
    
    clf = linear_model.BayesianRidge()
    results["Bayesian Ridge"]=test_model(clf)
    
    clf = linear_model.HuberRegressor()
    results["Hubber"]=test_model(clf)
    
    clf = linear_model.Lasso(alpha=1e-4)
    results["Lasso"]=test_model(clf)
    
    clf = BaggingRegressor()
    results["Bagging"]=test_model(clf)
    
    clf = RandomForestRegressor()
    results["RandomForest"]=test_model(clf)
    
    clf = AdaBoostRegressor()
    results["AdaBoost"]=test_model(clf)
    
    clf = svm.SVR()
    results["SVM RBF"]=test_model(clf)
    
    clf = svm.SVR(kernel="linear")
    results["SVM Linear"]=test_model(clf)
    
    results = pd.DataFrame.from_dict(results,orient='index')
    results.columns=["mean_squared_error"] 
    results=results.sort(columns=["mean_squared_error"],ascending=False)
    results.plot(kind="bar",title="Model Scores")
    axes = plt.gca()
    axes.set_ylim([0.5,1])
    return results

lets_try(train,labels)

Unnamed: 0,mean_squared_error
AdaBoost,0.03208
SVM RBF,0.029442
RandomForest,0.026025
Hubber,0.025425
Ridge,0.025091
Bagging,0.024431
SVM Linear,0.023461
Bayesian Ridge,0.022895
Linear,0.022695
Lasso,0.022695


Now, let's try the same but using data with PCA applied.

In [23]:
# Split traing and test
train = dataPCA[:1460]
test = dataPCA[1460:]

lets_try(train,labels)

Unnamed: 0,mean_squared_error
AdaBoost,0.052733
Bagging,0.046139
RandomForest,0.043924
SVM RBF,0.033351
Linear,0.022695
Ridge,0.022691
Lasso,0.022684
Bayesian Ridge,0.022683
SVM Linear,0.02198
Hubber,0.021777


In [24]:
cv = KFold(n_splits=5,shuffle=True,random_state=45)

parameters = {'alpha': [1000,200,100,10],
              'epsilon' : [1.2,1.25,1.50,1.7],
              'tol' : [1e-10]}

clf = linear_model.HuberRegressor()
r2 = make_scorer(r2_score)
grid_obj = GridSearchCV(clf, parameters, cv=cv,scoring=r2)
grid_fit = grid_obj.fit(train, labels)
best_clf = grid_fit.best_estimator_ 

best_clf.fit(train,labels)

HuberRegressor(alpha=100, epsilon=1.25, fit_intercept=True, max_iter=100,
        tol=1e-10, warm_start=False)

In [25]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
mse1 = make_scorer(mean_squared_error)
scores = cross_val_score(clf, train, labels, cv=10, scoring=mse1)
print(scores)
print(np.mean(scores))

[ 0.01816323  0.01446544  0.01732366  0.03132449  0.02611438  0.01709237
  0.01553854  0.01677424  0.03745242  0.01977055]
0.0214019324452


In [26]:

predictions_huber = best_clf.predict(test)
predictions_huber = np.exp(predictions_huber)
sub = pd.DataFrame({
        "Id": ids,
        "SalePrice": predictions_huber
    })
sub.to_csv("prices_huber1.csv", index=False)
print(sub)

        Id      SalePrice
0     1461  125634.332715
1     1462  156328.599906
2     1463  188741.597383
3     1464  205770.311280
4     1465  189372.909898
5     1466  178681.501287
6     1467  179472.043456
7     1468  165370.798502
8     1469  197880.321555
9     1470  112413.323545
10    1471  194284.990891
11    1472  100752.687283
12    1473   97447.400774
13    1474  145476.638901
14    1475  107890.142830
15    1476  298584.983669
16    1477  237563.835783
17    1478  273118.370511
18    1479  266711.924184
19    1480  397868.372156
20    1481  295304.416582
21    1482  211222.230065
22    1483  190166.235252
23    1484  164809.067137
24    1485  194596.520682
25    1486  200355.441281
26    1487  284752.908219
27    1488  237995.529341
28    1489  187415.228172
29    1490  224288.820580
...    ...            ...
1429  2890   73369.977831
1430  2891  140937.919956
1431  2892   69287.343706
1432  2893  104920.418637
1433  2894   69616.560743
1434  2895  293983.782180
1435  2896  

SVM tranning and predict

In [18]:
def rmse_cv(model):
    rmse= np.sqrt(-cross_val_score(model, train, labels, scoring="neg_mean_squared_error", cv = 5))
    return(rmse)

In [19]:

print('Training on all data: ')
model_svm = svm.SVR(kernel="linear", C=3,)
model_svm.fit(train, labels)



cv_rmse_svm = rmse_cv(model_svm).mean()
print (cv_rmse_svm)






Training on all data: 
0.148731554646


In [20]:
predictions = model_svm.predict(test)
predictions_svm = np.exp(predictions)
sub = pd.DataFrame({
        "Id": ids,
        "SalePrice": predictions_svm
    })
sub.to_csv("prices_svm1.csv", index=False)

In [21]:
mse1 = make_scorer(mean_squared_error)
scores = cross_val_score(clf, train, labels, cv=10, scoring=mse1)
print(scores)
print(np.mean(scores))

[ 0.02345859  0.01640437  0.02317852  0.03338032  0.029027    0.0226658
  0.02265506  0.02255614  0.03490317  0.02274871]
0.0250977686718


Simple Neural Network
---------------------

Now I am going to try a simple neural network, to see if i can improve the result.

In [None]:
# Shape the labels
labels_nl = labels
labels_nl = labels_nl.reshape(-1,1)

In [None]:
tf.reset_default_graph()
r2 = tflearn.R2()
net = tflearn.input_data(shape=[None, train.shape[1]])
net = tflearn.fully_connected(net, 30, activation='linear')
net = tflearn.fully_connected(net, 10, activation='linear')
net = tflearn.fully_connected(net, 1, activation='linear')
sgd = tflearn.SGD(learning_rate=0.1, lr_decay=0.01, decay_step=100)
net = tflearn.regression(net, optimizer=sgd,loss='mean_square',metric=r2)
model = tflearn.DNN(net)

In [None]:
model.fit(train, labels_nl,show_metric=True,validation_set=0.2,shuffle=True,n_epoch=50)

In [None]:
# Make predictions

predictions_huber = best_clf.predict(test)
predictions_DNN = model.predict(test)
predictions_huber = np.exp(predictions_huber)
predictions_DNN = np.exp(predictions_DNN)
predictions_DNN = predictions_DNN.reshape(-1,)

sub = pd.DataFrame({
        "Id": ids,
        "SalePrice": predictions_DNN
    })

sub.to_csv("prices_submission.csv", index=False)
#print(sub)