In [1]:
import pandas as pd 

import numpy as np 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
f_train=pd.read_csv("../Resources/thirtythree.csv")

## Created functions and pipeline 

In [3]:
#label drops month and year sold,and label encodes each column with string values 
def label (df):
    df.drop(columns=['MoSold','YrSold'], inplace=True)
   
    le=LabelEncoder()
    
    label_encode = df.select_dtypes(exclude=['number']).columns.tolist()
    

    for x in label_encode:
        col = df[x]
        label = le.fit_transform(col)
        df[x]=label
      
    return df  

In [4]:
#one hot is designed to pass through each column with ordinal data and apply pd.get_dummies 

def onehot(df):
    onehot = ['MSSubClass',
 'OverallQual',
 'OverallCond',
 'Neighborhood',
 'HouseStyle',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'Foundation',
 'BsmtQual',
 'BsmtFinType1',
 'HeatingQC',
 'KitchenQual',
 'FireplaceQu',
 'GarageType',
 'GarageFinish']
    for x in onehot: 
        
        df[x]=df[x].astype('object')
        one_hot = pd.get_dummies(df[x])
        df.drop(columns=[x], inplace=True)
        one_hot = one_hot.add_prefix(x + '_')
        df=df.join(one_hot)
    return df 

In [5]:
#initializing pipeline
def pipeline(df):
    df = (df.pipe(label)
         .pipe(onehot))
    return df 

In [6]:
#after testing with different variotions I found that the Random Forest Regressor worked best without the one hot encoding. 
#Continued building model only applying the label encoder function. 
df = label(f_train)

In [7]:
df.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Neighborhood,HouseStyle,OverallQual,OverallCond,YearBuilt,Exterior1st,Exterior2nd,...,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,GarageArea,SalePrice,outdoorAreaSF
0,60,65.0,8450,5,5,7,5,2003,11,13,...,2,8,0,5,1,1,2,548,208500,61
1,20,80.0,9600,24,2,6,8,1976,7,8,...,3,6,1,4,1,1,2,460,181500,298
2,60,68.0,11250,5,5,7,5,2001,11,13,...,2,6,1,4,1,1,2,608,223500,42
3,70,60.0,9550,6,5,7,5,1915,12,15,...,2,7,1,2,5,2,3,642,140000,307
4,60,84.0,14260,15,5,8,5,2000,11,13,...,2,9,1,4,1,1,3,836,250000,276


## Creating model

In [8]:
# created X and y variable
X = df.drop('SalePrice', axis = 1)
y = df['SalePrice']

In [9]:
# import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()


In [10]:
#split the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
# applied GridSearchCV to test multiple n_estimators
from sklearn.model_selection import GridSearchCV 
param_grid = {'n_estimators':[100,200,300,500,600,700]}
grid = GridSearchCV(rf,param_grid, verbose=5)

In [12]:
#fit gridsearch to my training data 
grid.fit(X_train,y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ..................n_estimators=100;, score=0.890 total time=   0.6s
[CV 2/5] END ..................n_estimators=100;, score=0.867 total time=   0.6s
[CV 3/5] END ..................n_estimators=100;, score=0.848 total time=   0.6s
[CV 4/5] END ..................n_estimators=100;, score=0.740 total time=   0.6s
[CV 5/5] END ..................n_estimators=100;, score=0.872 total time=   0.6s
[CV 1/5] END ..................n_estimators=200;, score=0.887 total time=   1.3s
[CV 2/5] END ..................n_estimators=200;, score=0.874 total time=   1.2s
[CV 3/5] END ..................n_estimators=200;, score=0.845 total time=   1.3s
[CV 4/5] END ..................n_estimators=200;, score=0.736 total time=   1.3s
[CV 5/5] END ..................n_estimators=200;, score=0.871 total time=   1.3s
[CV 1/5] END ..................n_estimators=300;, score=0.888 total time=   2.0s
[CV 2/5] END ..................n_estimators=300;,

GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'n_estimators': [100, 200, 300, 500, 600, 700]},
             verbose=5)

In [13]:
#printed the results 
print(grid.best_params_)
print(grid.best_score_)

{'n_estimators': 500}
0.8441960813302014


In [14]:
#fit the model using the best parameters I found to my training data 
rf = RandomForestRegressor(n_estimators=500)
rf = rf.fit(X_train, y_train)
rf.score(X_train, y_train)

0.9784801394939797

In [15]:
#created a data frame to show my results 
predictions =rf.predict(X_test).round()
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test, "difference %": round((y_test - predictions)/y_test*100,2)})

In [16]:
results.head(10)

Unnamed: 0,Prediction,Actual,difference %
741,260135.0,270000,3.65
865,116666.0,122900,5.07
339,432768.0,437154,1.0
192,142781.0,104000,-37.29
203,148448.0,145000,-2.38
816,148824.0,140000,-6.3
777,121097.0,109900,-10.19
67,243790.0,225000,-8.35
1070,314454.0,280000,-12.31
51,338698.0,385000,12.03


In [17]:
#took a look at my most important features, this will come in handy later 
sorted(zip(rf.feature_importances_, df.columns), reverse=True)

[(0.4767066764350392, 'OverallQual'),
 (0.15137710669081414, 'GrLivArea'),
 (0.0706529603388898, 'TotalBsmtSF'),
 (0.05554263508933827, 'FullBath'),
 (0.036628474245495354, 'MasVnrArea'),
 (0.026115158050939186, 'GarageArea'),
 (0.0219512146502204, 'YearBuilt'),
 (0.019068315623213927, 'LotArea'),
 (0.017298618260624198, 'GarageCars'),
 (0.015590631071646035, 'TotRmsAbvGrd'),
 (0.014517282632167085, 'BsmtQual'),
 (0.012408405091854655, 'LotFrontage'),
 (0.01030004037673841, 'Neighborhood'),
 (0.009167784157396408, 'SalePrice'),
 (0.007027412699015388, 'OverallCond'),
 (0.006695231949663561, 'GarageFinish'),
 (0.006002700399593518, 'KitchenQual'),
 (0.005738580894182276, 'BsmtFullBath'),
 (0.005039179236091832, 'Fireplaces'),
 (0.004862781974813402, 'GarageType'),
 (0.004654939977292066, 'BsmtFinType1'),
 (0.004223710842405591, 'MSSubClass'),
 (0.003766986582269537, 'FireplaceQu'),
 (0.0032932720250800807, 'Exterior2nd'),
 (0.003094581692218591, 'Exterior1st'),
 (0.002445022236611701, '

In [18]:
#took a look at the results 
results['difference %'].describe()

count    353.000000
mean      -4.996799
std       22.127261
min     -179.520000
25%       -7.990000
50%       -0.480000
75%        4.650000
max       42.130000
Name: difference %, dtype: float64

In [19]:
#single worst result with a highly over valued prediction 
results[(results['difference %']==-179.520000)]

Unnamed: 0,Prediction,Actual,difference %
1253,447236.0,160000,-179.52


##### After evaluating the results it shows that my model can be reasonably accurate, but that their are some features that were possibly dropped that could skew my results heavily. 

### Saving the model 

In [21]:
#saved my model using joblib 
import joblib 
joblib.dump(rf, "random_forest.joblib")

['random_forest.joblib']

In [22]:
#loaded the model back in and tested it 
rf_load = joblib.load("random_forest.joblib")

In [24]:
rf_load.predict(X_test).round()

array([260135., 116666., 432768., 142781., 148448., 148824., 121097.,
       243790., 314454., 338698., 174772., 375512., 230494., 204960.,
       179201., 226178., 151819., 153037., 110444., 223821., 165027.,
       145756., 166034., 187597.,  93714., 255255., 166127., 252390.,
       150383., 366323., 239882., 226853., 218493.,  78132., 210645.,
       135605., 158492., 218670., 116704., 104222., 113344.,  89345.,
       301455., 128590., 281880., 116897., 120544., 171251., 339393.,
       151358., 155921., 127840., 153488., 196910., 144792.,  97211.,
       112699., 159290., 145359., 244632., 130144., 234849., 101085.,
       101642., 142545., 345015.,  89360., 156568., 182049., 256343.,
       108147., 194875., 180212., 213370., 194844., 237394., 188604.,
       140421., 129929., 190985., 142332., 377106., 128690.,  86303.,
       311422., 167033., 279624., 215054., 131927., 132875., 126613.,
       338808., 204532., 131328., 150364., 126030., 286521., 126829.,
       199722., 1886