In [1]:
import pandas as pd
from sklearn import preprocessing
%matplotlib inline
from operator import itemgetter
from sklearn import model_selection

# Models
from sklearn import linear_model
from sklearn.feature_selection import SelectPercentile
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Import Data
# Columns listed in the drop are removed up front because they contain a lot of NAs.
# Reading and dropping in one expression keeps this cell idempotent on re-run.
train_imported_data = (
    pd.read_csv('train.csv', index_col=0)
    .drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'])
)

# Prepare Data
# Numeric Columns
# select_dtypes picks the numeric columns directly; describe() would compute
# summary statistics for every column only to have them thrown away.
numericcol_df = (
    train_imported_data
    .select_dtypes(include=['number'])
    # Rows with any missing numeric value are discarded rather than imputed.
    .dropna()
    # Replace the raw years with ages relative to 2018 (hard-coded reference year).
    .assign(
        Age=lambda frame: 2018 - frame['YearBuilt'],
        AgeafterRemod=lambda frame: 2018 - frame['YearRemodAdd'],
    )
    .drop(columns=['YearBuilt', 'YearRemodAdd'])
)

# non numeric columns - One Hot Encode
nonnumericcols = train_imported_data.select_dtypes(include=['object']).columns
nonnumeric_df = pd.get_dummies(train_imported_data[nonnumericcols], columns=nonnumericcols)

# Combine numerical and categorical columns in one dataframe.
# join() is a left join on the index, so rows removed by dropna() above stay removed.
bigdf = numericcol_df.join(nonnumeric_df)

In [4]:
# Check model performance
# Baseline: 5-fold cross-validation of both models on every available feature.
bigdf_traincols = [col for col in bigdf.columns if col != 'SalePrice']

lr = linear_model.LinearRegression()
# A fixed random_state makes the forest (and therefore the printed score) reproducible.
rf = RandomForestRegressor(random_state=0)
scores_lr = model_selection.cross_val_score(
    estimator=lr, X=bigdf[bigdf_traincols], y=bigdf.SalePrice, cv=5
)
scores_rf = model_selection.cross_val_score(
    estimator=rf, X=bigdf[bigdf_traincols], y=bigdf.SalePrice, cv=5
)
print('LinearRegression:', round(scores_lr.mean(), 2), '\n')
print('RandomForest:', round(scores_rf.mean(), 2), '\n')

LinearRegression: 0.76 

RandomForest: 0.84 



In [7]:
# Map every one-hot column back to the feature it came from: get_dummies names
# columns '<feature>_<category>', so the text before the first '_' is the feature.
newcolsforgrouping = [col.split('_')[0] for col in bigdf_traincols]

# Fit a dedicated, seeded forest for the ranking so this cell does not depend on
# whichever `rf` object an earlier cell happened to leave behind, and so the
# ranking is the same on every run.
rf = RandomForestRegressor(random_state=0)
rf.fit(bigdf[bigdf_traincols], y=bigdf.SalePrice)

# Sum the per-dummy importances per source feature (scaled by 1000 for readability)
# and rank the features from most to least important.
importance_df = pd.DataFrame({'colname': newcolsforgrouping,
                              'importancescore': rf.feature_importances_ * 1000})
importance_df = (
    importance_df
    .groupby('colname')
    .sum()
    .sort_values(by='importancescore', ascending=False)
)

# NOTE(review): the ranking above is fitted on all rows, including the rows each
# CV fold later validates on, so the scores printed below are likely optimistic.
num_cols_array = range(10, 100, 10)
for num_cols in num_cols_array:
    # Slicing past the number of ranked features simply selects all of them.
    top_features = set(importance_df.index[:num_cols])
    impcolsonly = [
        col
        for col, feature in zip(bigdf_traincols, newcolsforgrouping)
        if feature in top_features
    ]

    lr = linear_model.LinearRegression()
    rf = RandomForestRegressor(random_state=0)
    scores_lr = model_selection.cross_val_score(
        estimator=lr, X=bigdf[impcolsonly], y=bigdf.SalePrice, cv=10
    )
    scores_rf = model_selection.cross_val_score(
        estimator=rf, X=bigdf[impcolsonly], y=bigdf.SalePrice, cv=10
    )
    print('Number of columns: ', num_cols)
    print('LinearRegression:', round(scores_lr.mean(), 3), '\n')
    print('RandomForest:', round(scores_rf.mean(), 3), '\n')

Number of columns:  10
LinearRegression: 0.752 

RandomForest: 0.83 

Number of columns:  20
LinearRegression: 0.772 

RandomForest: 0.838 

Number of columns:  30
LinearRegression: 0.771 

RandomForest: 0.842 

Number of columns:  40
LinearRegression: 0.799 

RandomForest: 0.852 

Number of columns:  50
LinearRegression: 0.788 

RandomForest: 0.849 

Number of columns:  60
LinearRegression: 0.781 

RandomForest: 0.847 

Number of columns:  70
LinearRegression: 0.785 

RandomForest: 0.844 

Number of columns:  80
LinearRegression: 0.786 

RandomForest: 0.85 

Number of columns:  90
LinearRegression: 0.786 

RandomForest: 0.833 



In [12]:
# As the sweep output above shows, both models score highest with 40 grouped features,
# so we will select those 40 features and further tune them to improve the model.
# NOTE(review): the gaps between neighbouring settings are small (e.g. RandomForest
# 0.852 at 40 vs 0.85 at 80) - confirm the choice is stable across re-runs.
importance_df.index[:40]

Index(['OverallQual', 'GrLivArea', '2ndFlrSF', 'BsmtFinSF1', 'TotalBsmtSF',
       'FullBath', '1stFlrSF', 'LotArea', 'TotRmsAbvGrd', 'GarageArea',
       'AgeafterRemod', 'OpenPorchSF', 'LotFrontage', 'BsmtUnfSF', 'BsmtQual',
       'Exterior2nd', 'GarageType', 'WoodDeckSF', 'OverallCond', 'GarageCars',
       'MoSold', 'Age', 'BsmtExposure', 'CentralAir', 'GarageFinish',
       'LotShape', 'GarageYrBlt', 'MasVnrArea', 'LotConfig', 'RoofStyle',
       'Neighborhood', 'SaleCondition', 'MasVnrType', 'ExterQual',
       'KitchenQual', 'Fireplaces', 'MSZoning', 'HalfBath', 'BedroomAbvGr',
       'ScreenPorch'],
      dtype='object', name='colname')

In [71]:
## TRAINING THE MODEL

# Prepping the data
def preprocess_data(filename, feature_names=None, reference_year=2018):
    """Load a housing CSV and turn it into a model-ready feature frame.

    Steps: read the file (first column becomes the index), derive ``Age`` and
    ``AgeafterRemod`` from ``YearBuilt`` / ``YearRemodAdd``, keep only the
    requested features, one-hot encode the non-numeric ones and fill missing
    values in the numeric ones with that column's mean.

    Parameters
    ----------
    filename : str or path-like
        CSV file to read; must contain ``YearBuilt`` and ``YearRemodAdd``.
    feature_names : sequence of str, optional
        Columns to keep (``Age`` and ``AgeafterRemod`` may be requested too).
        When omitted, the top 40 entries of the notebook-level
        ``importance_df`` index are used, so that variable must already exist.
    reference_year : int, default 2018
        Year the two age features are measured from.

    Returns
    -------
    pandas.DataFrame
        Selected numeric columns followed by the dummy columns, indexed like
        the input file.

    Raises
    ------
    KeyError
        If a requested feature (or one of the two year columns) is missing
        from the file.
    """
    if feature_names is None:
        # Backward-compatible default: rely on the ranking computed earlier in the notebook.
        feature_names = importance_df.index[:40]

    raw = pd.read_csv(filename, index_col=0)
    raw['Age'] = reference_year - raw['YearBuilt']
    raw['AgeafterRemod'] = reference_year - raw['YearRemodAdd']

    # Copy so later operations never write into a slice of `raw`.
    selected = raw[list(feature_names)].copy()

    # Mean-impute every selected numeric column instead of a hard-coded trio:
    # the fixed list raised KeyError whenever one of those columns was not
    # among the selected features, and left NaNs in any other numeric column.
    numeric_means = selected.select_dtypes(include='number').mean().dropna()

    encoded = pd.get_dummies(selected)
    if not numeric_means.empty:
        # A Series passed to fillna is matched by column label, so only the
        # numeric source columns are touched; dummy columns contain no NaNs.
        encoded = encoded.fillna(numeric_means)

    return encoded


In [72]:
# Assemble the modelling frame: engineered features plus the target column.
# The raw file is read again only to pick up SalePrice, which preprocess_data drops.
traindata = pd.read_csv('train.csv', index_col=0)
preppeddata = preprocess_data('train.csv').join(traindata[['SalePrice']], how='inner')

# Every column except the target is used as a predictor.
traincols = [name for name in preppeddata.columns if name != 'SalePrice']

In [73]:
# 5-fold cross-validation of a random forest on the selected features.
# random_state pins the forest's internal randomness so the score is reproducible.
rf_1 = RandomForestRegressor(random_state=0)
scores = model_selection.cross_val_score(
    estimator=rf_1, X=preppeddata[traincols], y=preppeddata.SalePrice, cv=5
)

In [75]:
# Average of the 5 per-fold scores from the previous cell. No `scoring` argument was
# passed to cross_val_score, so this is the estimator's default score.
scores.mean()

0.8463032727135553