# CIS400 (Machine Learning) Course Project

In [51]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

## Load in Train Data.

In [52]:
train_data = pd.read_csv('train.csv')
train_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.00,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.00,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.00,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.00,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.00,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.00,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.00,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.00,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.00,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


## Load in Test Data.

In [53]:
test_data = pd.read_csv('test.csv')
test_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.00,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.00,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.00,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.00,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.00,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.00,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.00,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.00,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.00,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [54]:
train_data = train_data.drop('Id', axis=1)

In [55]:
print(train_data['SalePrice'].describe())

count     1,460.00
mean    180,921.20
std      79,442.50
min      34,900.00
25%     129,975.00
50%     163,000.00
75%     214,000.00
max     755,000.00
Name: SalePrice, dtype: float64


## Combine training and testing data.

In [57]:
y_train = train_data['SalePrice']
test_id = test_data['Id']
all_data = pd.concat([train_data, test_data], axis=0, sort=False)
all_data = all_data.drop(['Id', 'SalePrice'], axis=1)
all_data.shape
all_data.columns

(2919, 79)

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [58]:
Total = all_data.isnull().sum().sort_values(ascending=False)
percent = (all_data.isnull().sum() / all_data.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([Total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(25)

Unnamed: 0,Total,Percent
PoolQC,2909,1.0
MiscFeature,2814,0.96
Alley,2721,0.93
Fence,2348,0.8
MasVnrType,1766,0.61
FireplaceQu,1420,0.49
LotFrontage,486,0.17
GarageYrBlt,159,0.05
GarageFinish,159,0.05
GarageQual,159,0.05


In [59]:
all_data.drop((missing_data[missing_data['Total'] > 5]).index, axis=1, inplace=True)
print(all_data.isnull().sum().max())

4


## Filling Missing Values (NaN).

In [60]:
numeric_missed = ['BsmtFinSF1',
                  'BsmtFinSF2',
                  'BsmtUnfSF',
                  'TotalBsmtSF',
                  'BsmtFullBath',
                  'BsmtHalfBath',
                  'GarageArea',
                  'GarageCars']

for feature in numeric_missed:
    all_data[feature] = all_data[feature].fillna(0)

In [61]:
categorical_missed = ['Exterior1st',
                  'Exterior2nd',
                  'SaleType',
                  'MSZoning',
                   'Electrical',
                     'KitchenQual']

for feature in categorical_missed:
    all_data[feature] = all_data[feature].fillna(all_data[feature].mode()[0])

In [62]:
all_data['Functional'] = all_data['Functional'].fillna('Typ')

In [63]:
all_data.drop(['Utilities'], axis=1, inplace=True)

In [64]:
all_data.isnull().sum().max()

0

In [65]:
all_data = pd.get_dummies(all_data)
all_data.head()
all_data.shape

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,8450,7,5,2003,2003,706.0,0.0,150.0,856.0,...,False,False,False,True,False,False,False,False,True,False
1,20,9600,6,8,1976,1976,978.0,0.0,284.0,1262.0,...,False,False,False,True,False,False,False,False,True,False
2,60,11250,7,5,2001,2002,486.0,0.0,434.0,920.0,...,False,False,False,True,False,False,False,False,True,False
3,70,9550,7,5,1915,1970,216.0,0.0,540.0,756.0,...,False,False,False,True,True,False,False,False,False,False
4,60,14260,8,5,2000,2000,655.0,0.0,490.0,1145.0,...,False,False,False,True,False,False,False,False,True,False


(2919, 218)

In [66]:
X_train =all_data[:len(y_train)]
X_train.shape
X_test = all_data[len(y_train):]
X_test.shape


(1460, 218)

(1459, 218)

## Linear Regression Model

In [67]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

linear_score = linear_model.score(X_train, y_train)

print(f"Baseline score for training data (linear model): {linear_score}")

sales_predictions_linear_model = linear_model.predict(X_test)


Baseline score for training data (linear model): 0.920500941501453


In [68]:
param_grid = {
    'fit_intercept': [True, False]
}

grid_search_linear = GridSearchCV(estimator=linear_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_linear.fit(X_train, y_train)

best_linear_model = grid_search_linear.best_estimator_
sales_predictions_linear_model = best_linear_model.predict(X_test)

best_linear_score = best_linear_model.score(X_train, y_train)
print("Calculated Model: ", best_linear_model)
print(f"Best score for training data (Linear model): {best_linear_score}")

Calculated Model:  LinearRegression(fit_intercept=False)
Best score for training data (Linear model): 0.9205011531740586


In [69]:
sub_linear = pd.DataFrame()
sub_linear['Id'] = test_id
sub_linear['SalePrice'] = sales_predictions_linear_model
sub_linear.head()
sub_linear.to_csv('linear_regression_submission.csv',index=False)

Unnamed: 0,Id,SalePrice
0,1461,118288.7
1,1462,160499.15
2,1463,187707.81
3,1464,195239.73
4,1465,220081.48


## Lasso Regression Model

In [70]:
lasso_model = Lasso(random_state=42)
lasso_model.fit(X_train, y_train)

lasso_score = lasso_model.score(X_train, y_train)

print(f"Baseline score for training data (Lasso model): {lasso_score}")

sales_predictions_lasso_model = lasso_model.predict(X_test)

  model = cd_fast.enet_coordinate_descent(


Baseline score for training data (Lasso model): 0.9204858563241928


In [71]:
param_grid = {
    'alpha': [0.1, 1.0, 10.0],
    'fit_intercept': [True, False] 
}

grid_search_lasso = GridSearchCV(estimator=lasso_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_lasso.fit(X_train, y_train)

best_lasso_model = grid_search_lasso.best_estimator_
sales_predictions_rf_model = best_lasso_model.predict(X_test)

best_lasso_score = best_lasso_model.score(X_train, y_train)
print("Calculated Model: ", best_lasso_model)
print(f"Best score for training data (Lasso model): {best_lasso_score}")

  model = cd_fast.enet_coordinate_descent(


Calculated Model:  Lasso(alpha=10.0, fit_intercept=False, random_state=42)
Best score for training data (Lasso model): 0.9181444993732942


In [72]:
sub_lasso = pd.DataFrame()
sub_lasso['Id'] = test_id
sub_lasso['SalePrice'] = sales_predictions_lasso_model
sub_lasso.head()
sub_lasso.to_csv('lasso_regression_submission.csv',index=False)

Unnamed: 0,Id,SalePrice
0,1461,118393.83
1,1462,159943.05
2,1463,187602.75
3,1464,195249.33
4,1465,220003.48


## Bagging Regressor Model

In [73]:
bagging_model = BaggingRegressor(base_estimator=LinearRegression(), random_state=42)
bagging_model.fit(X_train, y_train)

bagging_score = bagging_model.score(X_train, y_train)

print(f"Baseline score for training data (Bagging model): {bagging_score}")

sales_predictions_bagging_model = bagging_model.predict(X_test)




Baseline score for training data (Bagging model): -48177.15022627854


In [74]:
param_grid = {
    'base_estimator__fit_intercept': [True, False],
    'n_estimators': [50, 100, 200],
    'max_samples': [0.5, 0.7, 0.9],
    'max_features': [0.5, 0.7, 0.9]
}

grid_search_bagging = GridSearchCV(estimator=bagging_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_bagging.fit(X_train, y_train)

best_bagging_model = grid_search_bagging.best_estimator_
sales_predictions_bagging_model = best_bagging_model.predict(X_test)

best_bagging_score = best_bagging_model.score(X_train, y_train)
print("Calculated Model: ", best_bagging_model)
print(f"Best score for training data (Bagging model): {best_bagging_score}")



Calculated Model:  BaggingRegressor(base_estimator=LinearRegression(fit_intercept=False),
                 max_features=0.5, max_samples=0.5, n_estimators=100,
                 random_state=42)
Best score for training data (Bagging model): -77583701004.38718


In [75]:
sub_bagging = pd.DataFrame()
sub_bagging['Id'] = test_id
sub_bagging['SalePrice'] = sales_predictions_bagging_model
sub_bagging.head()
sub_bagging.to_csv('bagging_regression_submission.csv',index=False)

Unnamed: 0,Id,SalePrice
0,1461,113115.18
1,1462,166332.81
2,1463,184023.37
3,1464,197871.17
4,1465,211903.4


## Random Forest Regression Model

In [76]:
rf_model = RandomForestRegressor(random_state=42, max_features='sqrt')
rf_model.fit(X_train, y_train)

rf_score = rf_model.score(X_train, y_train)
print(f"Baseline score for training data (Random Forest model): {rf_score}")

Baseline score for training data (Random Forest model): 0.9793143499024441


In [77]:
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4]
}

grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

best_rf_model = grid_search_rf.best_estimator_
sales_predictions_rf_model = best_rf_model.predict(X_test)

best_rf_score = best_rf_model.score(X_train, y_train)
print("Calculated Model: ", best_rf_model)
print(f"Best score for training data (Random Forest model): {best_rf_score}")


Calculated Model:  RandomForestRegressor(max_features='sqrt', n_estimators=50, random_state=42)
Best score for training data (Random Forest model): 0.97737596433221


In [78]:
sub_rf = pd.DataFrame()
sub_rf['Id'] = test_id
sub_rf['SalePrice'] = sales_predictions_rf_model
sub_rf.head()
sub_rf.to_csv('rf_regression_submission.csv',index=False)

Unnamed: 0,Id,SalePrice
0,1461,125384.5
1,1462,155856.0
2,1463,181865.56
3,1464,191383.0
4,1465,195283.02


## Gradient Boosting Regression Model

In [79]:
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

gb_model_score = gb_model.score(X_train, y_train)

print(f"Baseline score for training data (Gradient Boosting model): {gb_model_score}")

sales_predictions_gb_model = gb_model.predict(X_test)

Baseline score for training data (Gradient Boosting model): 0.9649663536782351


In [80]:
param_grid = {
    'n_estimators': [10, 50, 100, 150],
    'max_depth': [None, 5, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4]
}

grid_search_gb = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_gb.fit(X_train, y_train)

best_gb_model = grid_search_gb.best_estimator_
sales_predictions_gb_model = best_gb_model.predict(X_test)

best_gb_score = best_gb_model.score(X_train, y_train)
print("Calculated Model: ", best_gb_model)
print(f"Best score for training data (Gradien Boosting model): {best_gb_score}")

Calculated Model:  GradientBoostingRegressor(learning_rate=0.2, max_depth=5, min_samples_split=6,
                          n_estimators=150, random_state=42)
Best score for training data (Gradien Boosting model): 0.9978600115531687


In [81]:
sub_gb = pd.DataFrame()
sub_gb['Id'] = test_id
sub_gb['SalePrice'] = sales_predictions_gb_model
sub_gb.head()
sub_gb.to_csv('gb_regression_submission.csv',index=False)

Unnamed: 0,Id,SalePrice
0,1461,127126.63
1,1462,155778.98
2,1463,189229.56
3,1464,196102.07
4,1465,185831.56


# FINAL MODEL (Gradient Boosting Regression)

In [82]:
gb_model_final = GradientBoostingRegressor(n_estimators=150, random_state=42, min_samples_split=10, min_samples_leaf=10, max_depth=5)
gb_model_final.fit(X_train, y_train)

gb_model_score_final = gb_model_final.score(X_train, y_train)

print(f"Final score for training data (Gradient Boosting model): {gb_model_score_final}")

sales_predictions_gb_model_final = gb_model_final.predict(X_test)

Final score for training data (Gradient Boosting model): 0.9850274131680193


In [83]:
sub_gb_final = pd.DataFrame()
sub_gb_final['Id'] = test_id
sub_gb_final['SalePrice'] = sales_predictions_gb_model_final
sub_gb_final.head()
sub_gb_final.to_csv('FINAL_gb_regression_submission.csv',index=False)

Unnamed: 0,Id,SalePrice
0,1461,119097.21
1,1462,154983.08
2,1463,181944.86
3,1464,196368.64
4,1465,194635.25
