In [25]:
import pandas as pd
from sklearn import preprocessing
%matplotlib inline
from operator import itemgetter
from sklearn import model_selection


# Models
from sklearn import linear_model
from sklearn.feature_selection import SelectPercentile
from sklearn.ensemble import RandomForestRegressor

In [26]:
# Import Data
train_imported_data=pd.read_csv('train.csv',index_col=0)
# drop features because lot of NAs
train_imported_data=train_imported_data.drop(['PoolQC','MiscFeature','Alley','Fence','FireplaceQu'],axis=1)

# Prepare Data
# Numeric Columns
numericcol_df=train_imported_data[train_imported_data.describe().columns]
numericcol_df=numericcol_df.dropna()
numericcol_df['Age']=2018 - numericcol_df['YearBuilt']
numericcol_df['AgeafterRemod']=2018 - numericcol_df['YearRemodAdd']
numericcol_df=numericcol_df.drop(['YearBuilt','YearRemodAdd'],axis=1)

# non numeric columns - One Hot Encode
nonnumericcols=train_imported_data.describe(include=['O']).columns
nonnumeric_df=pd.get_dummies(train_imported_data[nonnumericcols],columns=nonnumericcols)

# Combine numerical and categorical columns in one dataframe
bigdf=numericcol_df.join(nonnumeric_df)

In [27]:
# Check model performance
bigdf_traincols=[col for col in bigdf.columns if col !='SalePrice']

lr=linear_model.LinearRegression()
rf=RandomForestRegressor()
scores_lr=model_selection.cross_val_score(X=bigdf[bigdf_traincols],y=bigdf.SalePrice,cv=5,estimator=lr)
scores_rf=model_selection.cross_val_score(X=bigdf[bigdf_traincols],y=bigdf.SalePrice,cv=5,estimator=rf)
print('LinearRegression:',round(scores_lr.mean(),2),'\n')
print('RandomForest:',round(scores_rf.mean(),2),'\n')

LinearRegression: 0.76 

RandomForest: 0.84 



In [28]:
newcolsforgrouping=[col.split('_')[0] for col in bigdf_traincols]

rf.fit(bigdf[bigdf_traincols],y=bigdf.SalePrice)
importance_df=pd.DataFrame({'colname':newcolsforgrouping,
             'importancescore':rf.feature_importances_*1000})
importance_df=importance_df.groupby('colname').sum().sort_values(by='importancescore',ascending=False)

impcolsonly=[col for col in bigdf_traincols if col.split('_')[0] in importance_df.index[:30]]

num_cols_array=range(10,100,10)
for num_cols in num_cols_array:

    impcolsonly=[col for col in bigdf_traincols if col.split('_')[0] in importance_df.index[:num_cols]]
    lr=linear_model.LinearRegression()
    rf=RandomForestRegressor()
    scores_lr=model_selection.cross_val_score(X=bigdf[impcolsonly],y=bigdf.SalePrice,cv=10,estimator=lr)
    scores_rf=model_selection.cross_val_score(X=bigdf[impcolsonly],y=bigdf.SalePrice,cv=10,estimator=rf)
    print('Number of columns: ',num_cols)
    print('LinearRegression:',round(scores_lr.mean(),3),'\n')
    print('RandomForest:',round(scores_rf.mean(),3),'\n')

Number of columns:  10
LinearRegression: 0.759 

RandomForest: 0.838 

Number of columns:  20
LinearRegression: 0.798 

RandomForest: 0.851 

Number of columns:  30
LinearRegression: 0.809 

RandomForest: 0.823 

Number of columns:  40
LinearRegression: 0.798 

RandomForest: 0.838 

Number of columns:  50
LinearRegression: 0.804 

RandomForest: 0.857 

Number of columns:  60
LinearRegression: 0.796 

RandomForest: 0.833 

Number of columns:  70
LinearRegression: -164022914.56 

RandomForest: 0.835 

Number of columns:  80
LinearRegression: -180854165.683 

RandomForest: 0.846 

Number of columns:  90
LinearRegression: -180854165.683 

RandomForest: 0.839 



In [29]:
# As we can see the performance is highest with 40 columns, so we will selct those 40 columns and futrther tune 
# them to improve the model
importance_df.index[:40]

Index(['OverallQual', 'GrLivArea', '2ndFlrSF', 'BsmtFinSF1', 'TotalBsmtSF',
       '1stFlrSF', 'GarageArea', 'GarageYrBlt', 'GarageFinish', 'LotArea',
       'FullBath', 'BsmtUnfSF', 'Age', 'AgeafterRemod', 'WoodDeckSF',
       'LotFrontage', 'Neighborhood', 'TotRmsAbvGrd', 'Condition1',
       'OverallCond', 'GarageType', 'MasVnrArea', 'ExterQual', 'SaleType',
       'OpenPorchSF', 'MoSold', 'KitchenQual', 'ScreenPorch', 'LotConfig',
       'BsmtQual', 'Exterior2nd', 'BsmtFinType1', 'CentralAir',
       'SaleCondition', 'HalfBath', 'BsmtExposure', 'Fireplaces',
       'Exterior1st', 'RoofStyle', 'YrSold'],
      dtype='object', name='colname')

In [95]:
## TRAINING THE MODEL

# Prepping the data
def preprocess_data(filename):
    df=pd.read_csv(filename,index_col=0)
    df['Age']=2018 - df['YearBuilt']
    df['AgeafterRemod']=2018 - df['YearRemodAdd']
    df['MonthSinceSold']=[round((datetime.date(2018,1,1) -datetime.date(item[0],item[1],1)).days/30,2) for item in list(zip(df.YrSold.values.tolist(),df.MoSold.values.tolist()))]
    df= df[importance_df.index[:40]]
    df=df.drop(['YrSold','MoSold'],axis=1)
    cat_cols=df.describe(include=['O']).columns
    df=pd.get_dummies(df)
    
#     df['LotFrontage']=df['LotFrontage'].fillna(df['LotFrontage'].mean())
#     df['GarageYrBlt']=df['GarageYrBlt'].fillna(df['GarageYrBlt'].mean())
#     df['MasVnrArea']=df['MasVnrArea'].fillna(df['MasVnrArea'].mean())
    
    for eachcol in df.columns:
        df[eachcol]=df[eachcol].fillna(df[eachcol].mean())
    return df


In [96]:
preppeddata=preprocess_data('train.csv')
traindata=pd.read_csv('train.csv',index_col=0)
preppeddata=preppeddata.join(traindata[['SalePrice']],how='inner')
traincols=[x for x in preppeddata if x !='SalePrice']

In [115]:
# Random Forest

rf_1=RandomForestRegressor()
# scores=model_selection.cross_val_score(X=preppeddata[traincols],y=preppeddata.SalePrice,cv=5,estimator=rf_1)

X_train, X_test, y_train, y_test=model_selection.train_test_split(preppeddata[traincols],
                                                                  preppeddata.SalePrice,
                                                                 test_size=0.4,
                                                                 )
rf_1.fit(X_train,y_train)
print(rf_1.score(X_test,y_test))

# Checking the score on Test
testdata=preprocess_data('test.csv')
testdata['Exterior2nd_Other']=0
testdata['Exterior1st_ImStucc']=0
testdata['Exterior1st_Stone']=0

testdata=testdata[traincols]

saleprice_test_rf=rf_1.predict(X=testdata)

submission_df_rf=pd.DataFrame({'Id':testdata.index,
                           'SalePrice':saleprice_test_rf})
submission_df_rf.to_csv('submission_rf.csv',index=None)

0.8373804197068248


In [118]:
# XGBOOST

import xgboost
xgmod=xgboost.XGBRegressor()
scores_xg=model_selection.cross_val_score(X=preppeddata[traincols],y=preppeddata.SalePrice,cv=10,estimator=xgmod)
print(scores_xg.mean())

xgmod.fit(X=preppeddata[traincols],y=preppeddata.SalePrice)
saleprice_test_xg=xgmod.predict(testdata)

submission_df_xg=pd.DataFrame({'Id':testdata.index,
                           'SalePrice':saleprice_test_xg})
submission_df_xg.to_csv('submission_xg.csv',index=None)

0.8891526420764224


In [113]:
cv_params = {'max_depth': [3,5,7], 'min_child_weight': [1,3,5],'n_estimators': [100,500,1000]}

gsmodel=model_selection.GridSearchCV(estimator=xgboost.XGBRegressor(),n_jobs=1,cv=2,param_grid=cv_params,verbose=1)
gsmodel.fit(X_train,y_train)
print(' Score:',gsmodel.score(X_test,y_test))
print(gsmodel.best_params_)

Fitting 2 folds for each of 27 candidates, totalling 54 fits


[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  1.4min finished


 Score: 0.848399359146077
{'max_depth': 3, 'n_estimators': 500, 'min_child_weight': 1}


In [121]:
corr_df=preppeddata[traincols].corr()

In [215]:
# From here we will remove correlated columns and see if that improves things, which we realize later doesnt improve things much

#Positively Correlated
for each in corr_df.columns:
    
    if (corr_df[(corr_df[each]>0.7) & (corr_df[each]<1)][[each]]).empty==False:
#         print('ColName: ',each)
        cordf_col=corr_df[(corr_df[each]>0.7) & (corr_df[each]<1)][[each]]
        print(cordf_col.index[0],cordf_col.columns[0],round(cordf_col[cordf_col.columns].values[0][0],4))
        print('\n')
        
# Negatively Correlated
for each in corr_df.columns:
    
    if (corr_df[(corr_df[each]<-0.7)][[each]]).empty==False:

        cordf_col=corr_df[(corr_df[each]<-0.7)][[each]]
        print(cordf_col)

        print('\n')

     GarageYrBlt
Age    -0.780555


                  Age
GarageYrBlt -0.780555


                   GarageType_Attchd
GarageType_Detchd          -0.729272


                   GarageType_Detchd
GarageType_Attchd          -0.729272


              ExterQual_Gd
ExterQual_TA     -0.906121


              ExterQual_TA
ExterQual_Gd     -0.906121


             SaleType_New
SaleType_WD      -0.77368


                       SaleType_WD
SaleType_New             -0.773680
SaleCondition_Partial    -0.769559


                KitchenQual_Gd
KitchenQual_TA       -0.824457


                KitchenQual_TA
KitchenQual_Gd       -0.824457


                  LotConfig_Corner
LotConfig_Inside         -0.752677


                  LotConfig_Inside
LotConfig_Corner         -0.752677


             BsmtQual_Gd
BsmtQual_TA    -0.766391


             BsmtQual_TA
BsmtQual_Gd    -0.766391


              CentralAir_N
CentralAir_Y          -1.0


              CentralAir_Y
CentralAir_N          -1.0


     

In [238]:
poscolstoremove=['TotRmsAbvGrd','1stFlrSF',
              'SaleCondition_Partial'] + [col for col in traincols if col[:11]=='Exterior2nd'] 
negcolstoremove=['GarageYrBlt','GarageType_Detchd','ExterQual_TA','SaleType_New',
              'KitchenQual_TA','LotConfig_Inside','BsmtQual_TA','CentralAir_N','RoofStyle_Gable']
allcols=poscolstoremove + negcolstoremove
newtraincols=traincols.copy()
for eachcol in allcols:
    newtraincols.remove(eachcol)


In [253]:
# XGBOOST

import xgboost
xgmod=xgboost.XGBRegressor()
scores_xg=model_selection.cross_val_score(X=preppeddata[newtraincols],y=preppeddata.SalePrice,cv=10,estimator=xgmod)
print(scores_xg.mean())

xgmod.fit(X=preppeddata[traincols],y=preppeddata.SalePrice)
saleprice_test_xg=xgmod.predict(testdata)

submission_df_xg=pd.DataFrame({'Id':testdata.index,
                           'SalePrice':saleprice_test_xg})
submission_df_xg.to_csv('submission_xg.csv',index=None)

0.88234572331423


In [251]:
# Random Forest

rf_1=RandomForestRegressor()
# scores=model_selection.cross_val_score(X=preppeddata[traincols],y=preppeddata.SalePrice,cv=5,estimator=rf_1)

X_train, X_test, y_train, y_test=model_selection.train_test_split(preppeddata[newtraincols],
                                                                  preppeddata.SalePrice,
                                                                 test_size=0.4,
                                                                 )
rf_1.fit(X_train,y_train)
print(rf_1.score(X_test,y_test))

0.8447475487036408
