In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import *
from sklearn.linear_model import *
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE, SelectKBest, f_classif, f_regression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
%matplotlib inline

sns.set(style="white")

In [27]:
train = pd.read_csv('train.csv',index_col=0)
test = pd.read_csv('test.csv',index_col=0)
train.head()

Unnamed: 0_level_0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,533352170,60,RL,,13517,Pave,,IR1,Lvl,AllPub,...,0,,,,0,3,2010,WD,Normal,130500
544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,AllPub,...,0,,,,0,4,2009,WD,Normal,220000
153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2010,WD,Abnorml,109000
318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,174000
255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,AllPub,...,0,,,,0,3,2010,WD,Normal,138500


In [28]:
print(train.shape)

(2051, 81)


In [29]:
#Split data into feature matrix and target vector
X = train.drop(['SalePrice','Sale Condition'],axis=1)
y = train['SalePrice']
print(X.shape)
X = pd.concat([X,test],axis=0)
print(X.shape)

(2051, 79)
(2930, 79)


# Clean Data

In [30]:
#Print number of NAs in each column that has NAs
X.isnull().sum()[X.isnull().sum()>0]

Lot Frontage       490
Alley             2732
Mas Vnr Type        23
Mas Vnr Area        23
Bsmt Qual           80
Bsmt Cond           80
Bsmt Exposure       83
BsmtFin Type 1      80
BsmtFin SF 1         1
BsmtFin Type 2      81
BsmtFin SF 2         1
Bsmt Unf SF          1
Total Bsmt SF        1
Electrical           1
Bsmt Full Bath       2
Bsmt Half Bath       2
Fireplace Qu      1422
Garage Type        157
Garage Yr Blt      159
Garage Finish      159
Garage Cars          1
Garage Area          1
Garage Qual        159
Garage Cond        159
Pool QC           2917
Fence             2358
Misc Feature      2824
dtype: int64

In [31]:
#Convert ordered categorical columns using label encoding, being sure to interpret NA valuess correctly
#For example, an NA value in Bsmt Qual is because the house doesn't have a basement, not because the data is 
#actually missing.

#create dict for conversions
label_encoder_dict = {
    'Lot Shape' : ['IR3','IR2','IR1','Reg'],
    'Exter Qual' : ['Po','Fa','TA','Gd','Ex'],
    'Exter Cond' : ['Po','Fa','TA','Gd','Ex'],
    'Bsmt Qual' : [np.nan,'Po','Fa','TA','Gd','Ex'],
    'Bsmt Cond' : [np.nan,'Po','Fa','TA','Gd','Ex'],
    'Bsmt Exposure' : [np.nan,'No','Mn','Av','Gd'],
    'BsmtFin Type 1' : [np.nan,'Unf','LwQ','Rec','BLQ','ALQ','GLQ'],
    'BsmtFin Type 2' : [np.nan,'Unf','LwQ','Rec','BLQ','ALQ','GLQ'],
    'Heating QC' : ['Po','Fa','TA','Gd','Ex'],
    'Kitchen Qual' : ['Po','Fa','TA','Gd','Ex'],
    'Functional' : ['Sal','Sev','Maj2','Maj1','Mod','Min2','Min1','Typ'],
    'Fireplace Qu' : [np.nan,'Po','Fa','TA','Gd','Ex'],
    'Garage Finish' : [np.nan,'Unf','RFn','Fin'],
    'Garage Qual' : [np.nan,'Po','Fa','TA','Gd','Ex'],
    'Garage Cond' : [np.nan,'Po','Fa','TA','Gd','Ex'],
    'Paved Drive' : ['N','P','Y'],
    'Pool QC' : [np.nan,'Po','Fa','TA','Gd','Ex'],
    'Fence' : [np.nan,'MnWw','GdWo','MnPrv','GdPrv']
}

#apply conversion to appropriate columns
for key in label_encoder_dict.keys():
    X[key] = X[key].map(lambda x: label_encoder_dict[key].index(x))
print(X.shape)

(2930, 79)


In [32]:
X.isnull().sum()[X.isnull().sum()>0]

Lot Frontage       490
Alley             2732
Mas Vnr Type        23
Mas Vnr Area        23
BsmtFin SF 1         1
BsmtFin SF 2         1
Bsmt Unf SF          1
Total Bsmt SF        1
Electrical           1
Bsmt Full Bath       2
Bsmt Half Bath       2
Garage Type        157
Garage Yr Blt      159
Garage Cars          1
Garage Area          1
Misc Feature      2824
dtype: int64

In [33]:
#do 1-hot-encdoing for categorical columns
X = pd.get_dummies(X)
print(X.shape)
#have to add MS SubClass manually as it's a categorical feature stored as integers
X = pd.get_dummies(X,columns=['MS SubClass'])
#Convert Garage Yr Blt column to garage age and then drop Garage Yr Blt column as it's unnecessary now and drop
#Lot Frontage becuase it's correlate with Lot Area and has many missing values
X['Garage Age'] = 2017 - X['Garage Yr Blt']
X.drop(['Garage Yr Blt','Lot Frontage','PID'],axis=1,inplace=True)

(2930, 231)


In [34]:
#re-split into original train/test
X_test = X.iloc[2051:,:].copy()
X_train = X.iloc[:2051,:].copy()

In [35]:
#Create new train-test-split from train data
diff = 10000
thresh = 100
while diff > thresh:
    X_train_train, X_train_test, y_train,y_test = train_test_split(X_train,y,test_size=.1)
    test_mean = y_test.mean()
    train_mean = y_train.mean()
    diff = abs(test_mean - train_mean)
    print(diff,abs(y_train.std()-y_test.std()))

216.58601310284575 4376.122223162936
7944.399486936629 8106.023392510164
2343.583153103391 11072.693077446907
13556.450732759753 8718.155386478174
14433.420585681568 19401.1389459996
4065.140234693623 9544.213244202023
5217.454692556639 10360.854086801817
8227.97322335362 483.5512916148873
287.6350172336679 4197.859066123623
1894.097547820129 5981.848055550436
1967.5371457889269 6445.755835364966
8613.851764148712 10551.05312729346
2908.2762938406086 12427.774987302502
9918.819538506068 6970.062768112723
1264.7130107611883 714.2472135855933
889.0825242718565 3853.152989024704
5629.711050595943 2824.9220187215105
8258.840824058716 11384.548311836726
5355.683160996676 9425.677343013842
8449.149022548489 9399.467419784502
569.4819138579769 3139.8496498686436
1938.5316336464311 1717.7056070147228
8548.189020443591 2206.9283904159092
3072.5577788302035 8240.82772626805
9070.600889309862 5337.570206754463
3476.195787618053 2347.577969631573
5996.243249927618 14611.544990036695
6229.037782513

In [36]:
#impute impossible values to be the median of the train set
median_value = X_train_train['Garage Age'].median()
X_train_train['Garage Age'] = X_train_train['Garage Age'].map(lambda x: median_value if x < 0 else x)
X_train_test['Garage Age'] = X_train_test['Garage Age'].map(lambda x: median_value if x < 0 else x)
X_test['Garage Age'] = X_test['Garage Age'].map(lambda x: median_value if x < 0 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [37]:
#impute garage age with median
X_train_train['Garage Age'] = X_train_train['Garage Age'].map(lambda x: median_value if np.isnan(x) else x)
X_train_test['Garage Age'] = X_train_test['Garage Age'].map(lambda x: median_value if np.isnan(x) else x)
X_test['Garage Age'] = X_test['Garage Age'].map(lambda x: median_value if np.isnan(x) else x)
#impute other columns with 0 as they represent a garage of 0 sq. ft., basement of 0 sq. ft., etc.
X_train_train.fillna(0,inplace=True)
X_train_test.fillna(0,inplace=True)
X_test.fillna(0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


# Scale Training and Test Sets

In [38]:
#scale feature matrix
sc = StandardScaler()
X_train_train_sc = sc.fit_transform(X_train_train)
X_train_test_sc = sc.transform(X_train_test)
X_test_sc = sc.transform(X_test)

# Elastic Net

In [39]:
# features = {
#     'n_features':[],
#     'rms_mean':[],
#     'alpha':[],
#     'l1_ratio':[],
#     'rms_std':[],
#     'test_score':[]
# }

# for n_features in np.arange(5,60,5):
#     print('Performing grid search using {} features'.format(n_features))
#     #Select top N features
#     skb = SelectKBest(f_regression,n_features)
#     skb.fit(X_train_train_sc,y_train)

#     #Do grid search to find best model hyperparameters
#     params = {
#         'alpha' : np.linspace(1,20,5),
#         'l1_ratio' : [.1,.5,.9,.95,.97,.99,.995,.9975]
#     }

#     en_model = GridSearchCV(ElasticNet(),param_grid=params,cv=5)
#     en_model.fit(X_train_train_sc[:,skb.get_support()],y_train)
#     rms = np.sqrt(np.abs(cross_val_score(en_model.best_estimator_,X_train_train_sc[:,skb.get_support()],y_train,
#         cv=10,scoring='neg_mean_squared_error')))
#     features['n_features'].append(n_features)
#     features['alpha'].append(en_model.best_params_['alpha'])
#     features['l1_ratio'].append(en_model.best_params_['l1_ratio'])
#     features['rms_mean'].append(rms.mean())
#     features['rms_std'].append(rms.std())
#     y_pred = en_model.predict(X_train_test_sc[:,skb.get_support()])
#     features['test_score'].append(np.sqrt(mean_squared_error(y_test,y_pred)))
# features = pd.DataFrame(features)
# features['test-train_score'] = features.test_score - features.rms_mean
    
# #Create arrays to construct prediction file
# ids = test.index.values.reshape(-1,1)
# preds = en_model.predict(X_test_sc[:,rfe.support_]).reshape(-1,1)

# #Create a prediction DF and convert ID back to int
# pred_df = pd.DataFrame(np.hstack(([ids,preds])),columns=['Id','SalePrice'])
# pred_df.Id = pred_df.Id.astype(int)

# #Write to csv
# pred_df.to_csv('predictions.csv',index=False)

# Adaptive Boost Regressor

In [40]:
# features = {
#     'n_features':[],
#     'rms_mean':[],
#     'rms_std':[],
#     'max_depth':[],
#     'n_trees':[]
# }

# #Feature selection and grid search for AdaBoost Regressor
# for n_features in [50]:#np.arange(40,120,10):
#     #Select top N features
#     skb = SelectKBest(f_regression,n_features)
#     skb.fit(X_train_sc,y)
# #     rfe = RFE(AdaBoostRegressor(),n_features,step=10)
# #     rfe.fit(X_train_sc,y)

#     #Do grid search to find best model hyperparameters
#     params = {
#         'n_estimators' : np.arange(60,120,10),
# #         'learning_rate' : [.7,.8,.9,1],
#         'base_estimator__max_depth' : [10,20,50],
#         'base_estimator__max_features' : ['sqrt']
#     }

#     abr_model = GridSearchCV(AdaBoostRegressor(DecisionTreeRegressor()),param_grid=params,cv=5)
#     abr_model.fit(X_train_sc[:,skb.get_support()],y)
#     features['n_features'].append(n_features)
#     features['max_depth'].append(abr_model.best_params_['base_estimator__max_depth'])
#     features['n_trees'].append(abr_model.best_params_['n_estimators'])
#     rms = np.sqrt(np.abs(cross_val_score(abr_model.best_estimator_,X_train_sc[:,skb.get_support()],y,cv=5,
#         scoring='neg_mean_squared_error')))
#     features['rms_mean'].append(rms.mean())
#     features['rms_std'].append(rms.std())
# features = pd.DataFrame(features)

# Gradient Boost Regressor

In [53]:
# features = {
#     'n_features':[],
#     'max_depth':[],
#     'learning_rate':[],
#     'n_estimators':[],
#     'max_features':[],
#     'subsample':[],
#     'rms_mean':[],
#     'rms_std':[],
#     'test_score':[]
# }

# #Feature selection and grid search for Gradient Boost Regressor
# for n_features in np.arange(5,70,5):
#     print('Performing grid search using {} features'.format(n_features))
#     #Select top N features
#     skb = SelectKBest(f_regression,n_features)
#     skb.fit(X_train_train_sc,y_train)

#     #Do grid search to find best model hyperparameters
#     params = {
#         'n_estimators' : np.arange(60,160,20),
#          'learning_rate' : [.05,.1,.15],
#         'max_depth' : [3,6,9],
#         'subsample': [.7,.9,1],
#         'max_features' : ['sqrt']
#     }

#     gbr_model = GridSearchCV(GradientBoostingRegressor(),param_grid=params,cv=5)
#     gbr_model.fit(X_train_train_sc[:,skb.get_support()],y_train)
#     features['n_features'].append(n_features)
#     features['max_depth'].append(gbr_model.best_params_['max_depth'])
#     features['learning_rate'].append(gbr_model.best_params_['learning_rate'])
#     features['n_estimators'].append(gbr_model.best_params_['n_estimators'])
#     features['max_features'].append(gbr_model.best_params_['max_features'])
#     features['subsample'].append(gbr_model.best_params_['subsample'])
#     rms = np.sqrt(np.abs(cross_val_score(gbr_model.best_estimator_,X_train_train_sc[:,skb.get_support()],y_train,
#         cv=5,scoring='neg_mean_squared_error')))
#     features['rms_mean'].append(rms.mean())
#     features['rms_std'].append(rms.std())
#     y_pred = gbr_model.predict(X_train_test_sc[:,skb.get_support()])
#     features['test_score'].append(np.sqrt(mean_squared_error(y_test,y_pred)))
# features = pd.DataFrame(features)

Performing grid search using 5 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 10 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 15 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 20 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 25 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 30 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 35 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 40 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 45 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 50 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 55 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 60 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 65 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [57]:
# features['test-train_score'] = abs(features.test_score - features.rms_mean)
# features['max_features'] = 'sqrt'
# best_model = features.loc[features['test-train_score']==features['test-train_score'].min()]
# skb = SelectKBest(f_regression,best_model['n_features'].values[0])
# skb.fit(X_train_train_sc,y_train)
# model = GradientBoostingRegressor(
#     learning_rate = best_model['learning_rate'].values[0],
#     n_estimators = best_model['n_estimators'].values[0],
#     subsample = best_model['subsample'].values[0],
#     max_features = best_model['max_features'].values[0],
#     max_depth = best_model['max_depth'].values[0]
# )
# model.fit(X_train_train_sc[:,skb.get_support()],y_train)
# #Create arrays to construct prediction file
# ids = test.index.values.reshape(-1,1)
# preds = model.predict(X_test_sc[:,skb.get_support()]).reshape(-1,1)

# #Create a prediction DF and convert ID back to int
# pred_df = pd.DataFrame(np.hstack(([ids,preds])),columns=['Id','SalePrice'])
# pred_df.Id = pred_df.Id.astype(int)

# #Write to csv
# pred_df.to_csv('gbr_predictions_1106_try2.csv',index=False)

# KNN Regressor

In [46]:
from sklearn.neighbors import KNeighborsRegressor

In [47]:
# features = {
#     'n_features':[],
#     'n_neighbors':[],
#     'rms_mean':[],
#     'rms_std':[],
#     'test_score':[]
# }

# for n_features in np.arange(5,60,5):
#     print('Performing grid search using {} features'.format(n_features))
#     #Select top N features
#     skb = SelectKBest(f_regression,n_features)
#     skb.fit(X_train_train_sc,y_train)
#     knr = GridSearchCV(KNeighborsRegressor(),param_grid={'n_neighbors':[3,5,7,11]})
#     knr.fit(X_train_train_sc[:,skb.get_support()],y_train)
#     features['n_features'].append(n_features)
#     features['n_neighbors'].append(knr.best_params_['n_neighbors'])
#     rms = np.sqrt(np.abs(cross_val_score(knr.best_estimator_,X_train_train_sc[:,skb.get_support()],y_train,
#         cv=5,scoring='neg_mean_squared_error')))
#     features['rms_mean'].append(rms.mean())
#     features['rms_std'].append(rms.std())
#     y_pred = knr.predict(X_train_test_sc[:,skb.get_support()])
#     features['test_score'].append(np.sqrt(mean_squared_error(y_test,y_pred)))
# features = pd.DataFrame(features)
# features['test-train_score'] = features.test_score - features.rms_mean

Performing grid search using 5 features
Performing grid search using 10 features

  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)



Performing grid search using 15 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 20 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 25 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 30 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 35 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 40 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 45 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 50 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 55 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [48]:
# features

Unnamed: 0,n_features,n_neighbors,rms_mean,rms_std,test_score,test-train_score
0,5,11,32081.847961,985.925993,34318.725579,2236.877618
1,10,7,29327.162501,1142.836307,28904.359979,-422.802522
2,15,7,29449.019323,2015.93991,28163.71834,-1285.300984
3,20,7,29791.352163,1896.236593,26819.344417,-2972.007746
4,25,7,30565.684967,1559.872829,26636.394424,-3929.290543
5,30,11,32596.142457,1951.605213,31182.83901,-1413.303447
6,35,5,32752.853447,1781.995102,34288.494886,1535.64144
7,40,3,33029.196722,1808.844335,34200.602549,1171.405827
8,45,3,32888.997425,2076.722708,36069.831139,3180.833714
9,50,3,32554.758613,2007.712209,33012.609937,457.851324


# Random Forest Regressor

In [89]:
features = {
    'n_features':[],
    'rms_mean':[],
    'rms_std':[],
    'test_score':[]
}

#Feature selection and grid search for Gradient Boost Regressor
for n_features in np.arange(5,70,5):
    print('Performing grid search using {} features'.format(n_features))
    #Select top N features
    skb = SelectKBest(f_regression,n_features)
    skb.fit(X_train_train_sc,y_train)

    #Do grid search to find best model hyperparameters
    rf_model = RandomForestRegressor(n_estimators=60,max_depth=20,max_features='sqrt')
    rf_model.fit(X_train_train_sc[:,skb.get_support()],y_train)
    features['n_features'].append(n_features)
    rms = np.sqrt(np.abs(cross_val_score(rf_model,X_train_train_sc[:,skb.get_support()],y_train,
        cv=5,scoring='neg_mean_squared_error')))
    features['rms_mean'].append(rms.mean())
    features['rms_std'].append(rms.std())
    y_pred = rf_model.predict(X_train_test_sc[:,skb.get_support()])
    features['test_score'].append(np.sqrt(mean_squared_error(y_test,y_pred)))
features = pd.DataFrame(features)

Performing grid search using 5 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 10 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 15 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 20 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 25 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 30 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 35 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 40 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 45 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 50 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 55 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 60 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Performing grid search using 65 features


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [91]:
features['test-train_score'] = abs(features.test_score - features.rms_mean)
features
best_model = features.loc[features['test-train_score']==features['test-train_score'].min()]
skb = SelectKBest(f_regression,best_model['n_features'].values[0])
skb.fit(X_train_train_sc,y_train)
rf_model = RandomForestRegressor(n_estimators=60,max_depth=20,max_features='sqrt')
rf_model.fit(X_train_train_sc[:,skb.get_support()],y_train)
#Create arrays to construct prediction file
ids = test.index.values.reshape(-1,1)
preds = rf_model.predict(X_test_sc[:,skb.get_support()]).reshape(-1,1)

#Create a prediction DF and convert ID back to int
pred_df = pd.DataFrame(np.hstack(([ids,preds])),columns=['Id','SalePrice'])
pred_df.Id = pred_df.Id.astype(int)

#Write to csv
pred_df.to_csv('rf_predictions_1106.csv',index=False)

  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
