**The idea is:**

 - Feature reduction with PCA
 - Data transformation (log, hot encoding, nan)
 - Test different regression models

**Things found:**

- Applying log transformation really increases the accuracy.
- Using PCA with 36 components makes the learning and testing much (much much) faster.
- Removing columns with more than 1000 NaNs gives better result than applying "mean" to them.
- There are outliers. Instead of removing them, using Huber seems to provide a good result. Huber is a model robust to outliers.

In [74]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.metrics import make_scorer
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
#import tflearn
#import tensorflow as tf
import seaborn
import warnings
warnings.filterwarnings('ignore')

train1="/home/barlesh/Projects/Hasaka/data/train_new.csv"
test1="/home/barlesh/Projects/Hasaka/data/test_new.csv"
out1="/home/barlesh/Projects/Hasaka/data/results.csv"

from subprocess import check_output
print(check_output(["ls", "/home/barlesh/Projects/Hasaka/data"]).decode("utf8"))

results.csv
test.csv
test_new.csv
train.csv
train_new.csv



## Data Load ##

I mix data and test to manipulate all the data just once. SalePrice is extracted to its own variable "labels". Finally, SalesPrice is remove from data.

In [75]:
train = pd.read_csv(train1)
labels=train["SalePrice"]
test = pd.read_csv(test1)
data = pd.concat([train,test],ignore_index=True)
data = data.drop("SalePrice", 1)
ids = test["Id"]

In [76]:
train.head()

Unnamed: 0.1,Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0,1,60,RL,65.0,8450,Pave,0,Reg,Lvl,...,0,,,,0,2,2008,WD,Normal,208500
1,1,2,20,RL,80.0,9600,Pave,0,Reg,Lvl,...,0,,,,0,5,2007,WD,Normal,181500
2,2,3,60,RL,68.0,11250,Pave,0,IR1,Lvl,...,0,,,,0,9,2008,WD,Normal,223500
3,3,4,70,RL,60.0,9550,Pave,0,IR1,Lvl,...,0,,,,0,2,2006,WD,Abnorml,140000
4,4,5,60,RL,84.0,14260,Pave,0,IR1,Lvl,...,0,,,,0,12,2008,WD,Normal,250000


In [77]:
# Count the number of rows in train
train.shape[0]

1460

In [78]:
# Count the number of rows in total
data.shape[0]

2919

In [79]:
# Count the number of NaNs each column has.
nans=pd.isnull(data).sum()
nans[nans>0]

Series([], dtype: int64)

In [80]:
# Remove id and columns with more than a thousand missing values
data=data.drop("Id", 1)
data=data.drop("Alley", 1)
data=data.drop("Fence", 1)
data=data.drop("MiscFeature", 1)
data=data.drop("PoolQC", 1)
data=data.drop("FireplaceQu", 1)

In [81]:
# Count the column types
data.dtypes.value_counts()

object     40
int64      26
float64     9
dtype: int64

## Data Manipulation ##

- Apply hot encoding, convert categorical variable into dummy/indicator variables.
- Fill NaN with median for that column.
- Log transformation.
- Change -inf to 0.

In [82]:
all_columns = data.columns.values
non_categorical = ["LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", 
                   "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", 
                   "2ndFlrSF", "LowQualFinSF", "GrLivArea", "GarageArea", 
                   "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", 
                   "ScreenPorch","PoolArea", "MiscVal"]

categorical = [value for value in all_columns if value not in non_categorical]

In [83]:
# One Hot Encoding and nan transformation
data = pd.get_dummies(data)

imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
data = imp.fit_transform(data)

# Log transformation
data = np.log(data)
labels = np.log(labels)

# Change -inf to 0 again
data[data==-np.inf]=0

## Feature reduction ##

There are many features, so I am going to use PCA to reduce them. The idea is to start with n_components = number of columns. Then select the number of components that add up to 1 variance_ratio.

In [84]:
pca = PCA(whiten=True)
pca.fit(data)
variance = pd.DataFrame(pca.explained_variance_ratio_)
np.cumsum(pca.explained_variance_ratio_)

array([ 0.22082383,  0.39898977,  0.51338389,  0.61063673,  0.68641341,
        0.75337537,  0.8081475 ,  0.8539118 ,  0.88815001,  0.91427362,
        0.93521893,  0.95212872,  0.96146401,  0.96877832,  0.97461264,
        0.9801822 ,  0.98528517,  0.989343  ,  0.99153157,  0.99360662,
        0.99516051,  0.99607108,  0.9969237 ,  0.99764987,  0.99832897,
        0.99883967,  0.99923139,  0.99949113,  0.99972032,  0.99989236,
        0.99995277,  0.9999978 ,  0.99999931,  0.99999999,  1.        ,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
        1.        ,  1.        ,  1.        ,  1.        ,  1.  

In [85]:
pca = PCA(n_components=36,whiten=True)
pca = pca.fit(data)
dataPCA = pca.transform(data)

## Data Model Selection ##

Simple test to run multiple models against our data. First, with raw features. No PCA.

In [86]:
# Split traing and test
train = data[:1460]
test = data[1460:]

In [87]:
# R2 Score

def lets_try(train,labels):
    results={}
    def test_model(clf):
        
        cv = KFold(n_splits=5,shuffle=True,random_state=45)
        r2 = make_scorer(r2_score)
        r2_val_score = cross_val_score(clf, train, labels, cv=cv,scoring=r2)
        scores=[r2_val_score.mean()]
        return scores

    clf = linear_model.LinearRegression()
    results["Linear"]=test_model(clf)
    
    clf = linear_model.Ridge()
    results["Ridge"]=test_model(clf)
    
    clf = linear_model.BayesianRidge()
    results["Bayesian Ridge"]=test_model(clf)
    
    clf = linear_model.HuberRegressor()
    results["Hubber"]=test_model(clf)
    
    clf = linear_model.Lasso(alpha=1e-4)
    results["Lasso"]=test_model(clf)
    
    clf = BaggingRegressor()
    results["Bagging"]=test_model(clf)
    
    clf = RandomForestRegressor()
    results["RandomForest"]=test_model(clf)
    
    clf = AdaBoostRegressor()
    results["AdaBoost"]=test_model(clf)
    
    clf = svm.SVR()
    results["SVM RBF"]=test_model(clf)
    
    clf = svm.SVR(kernel="linear")
    results["SVM Linear"]=test_model(clf)
    
    results = pd.DataFrame.from_dict(results,orient='index')
    results.columns=["R Square Score"] 
    results=results.sort(columns=["R Square Score"],ascending=False)
    results.plot(kind="bar",title="Model Scores")
    axes = plt.gca()
    axes.set_ylim([0.5,1])
    return results

lets_try(train,labels)

Unnamed: 0,R Square Score
Linear,0.856139
Lasso,0.855225
Bayesian Ridge,0.854962
SVM Linear,0.849812
RandomForest,0.846358
Bagging,0.846228
Ridge,0.839068
Hubber,0.83186
SVM RBF,0.810558
AdaBoost,0.798159


Now, let's try the same but using data with PCA applied.

In [98]:
# Split traing and test
train = dataPCA[:1460]
test = dataPCA[1460:]

lets_try(train,labels)

Unnamed: 0,R Square Score
Hubber,0.861132
SVM Linear,0.859394
Bayesian Ridge,0.856218
Lasso,0.85619
Ridge,0.856165
Linear,0.856139
SVM RBF,0.784373
RandomForest,0.710882
Bagging,0.699284
AdaBoost,0.660673


In [97]:
from sklearn.metrics import mean_squared_error

cv = KFold(n_splits=5,shuffle=True,random_state=45)

parameters = {'n_estimators': [100,200,300, ], 'max_features': [None,] ,'n_jobs': [5,], 'max_depth': [2,4,6,8,] }
mse1 = make_scorer(mean_squared_error)

clf = RandomForestRegressor()
r2 = make_scorer(r2_score)
grid_obj = GridSearchCV(clf, parameters, cv=cv,scoring=r2)
grid_fit = grid_obj.fit(train, labels)
best_clf = grid_fit.best_estimator_

best_clf.fit(train,labels)

In [100]:

best_clf.fit(train,labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features=None, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=300, n_jobs=5, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [71]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
mse1 = make_scorer(mean_squared_error)
scores = cross_val_score(clf, train, labels, cv=10, scoring=mse1)
print(scores)
print(np.mean(scores))

[ 0.03225796  0.03554197  0.04319547  0.05179107  0.0560194   0.03820767
  0.03859552  0.0404297   0.05255436  0.03922798]
0.0427821109159


In [90]:

predictions_forest = best_clf.predict(test)

# reverse log(x+1)
predictions_forest = np.exp(predictions_forest)



sub = pd.DataFrame({
        "Id": ids,
        "SalePrice": predictions_forest
    })

sub.to_csv(out1, index=False)











Simple Neural Network
---------------------

Now I am going to try a simple neural network, to see if i can improve the result.

In [None]:
# Shape the labels
labels_nl = labels
labels_nl = labels_nl.reshape(-1,1)

In [None]:
tf.reset_default_graph()
r2 = tflearn.R2()
net = tflearn.input_data(shape=[None, train.shape[1]])
net = tflearn.fully_connected(net, 30, activation='linear')
net = tflearn.fully_connected(net, 10, activation='linear')
net = tflearn.fully_connected(net, 1, activation='linear')
sgd = tflearn.SGD(learning_rate=0.1, lr_decay=0.01, decay_step=100)
net = tflearn.regression(net, optimizer=sgd,loss='mean_square',metric=r2)
model = tflearn.DNN(net)

In [None]:
model.fit(train, labels_nl,show_metric=True,validation_set=0.2,shuffle=True,n_epoch=50)

In [None]:
# Make predictions

predictions_huber = best_clf.predict(test)
predictions_DNN = model.predict(test)
predictions_huber = np.exp(predictions_huber)
predictions_DNN = np.exp(predictions_DNN)
predictions_DNN = predictions_DNN.reshape(-1,)

sub = pd.DataFrame({
        "Id": ids,
        "SalePrice": predictions_DNN
    })

sub.to_csv("prices_submission.csv", index=False)
#print(sub)