# House prices

Following the kaggle machine learning tutorial to improve solution for 'Housing Prices Competition for Kaggle Learn Users'

In [510]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt
%matplotlib inline 



In [511]:
train_file = './train.csv'
test_file = './test.csv'
data = pd.read_csv(train_file)

In [512]:
# Create target object and call it y
y = data.SalePrice
# Create X
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = data[features]

In [513]:
# Random forest with only numerical values
pipeline1 = make_pipeline(SimpleImputer(), RandomForestRegressor(random_state=1, n_estimators=100))
scores1 = cross_val_score(pipeline1, X, y, scoring='neg_mean_absolute_error',cv=10)
print(abs(scores1.mean()))

22731.7721267


In [514]:
#features_cat = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd','LotShape']
#X_cat = data[features_cat]
#X_cat = pd.get_dummies(X_cat)


In [530]:
# Use all features here
X_cat = data.drop(['SalePrice'], axis=1)
X_cat = pd.get_dummies(X_cat)

In [531]:
pipeline2 = make_pipeline(SimpleImputer(), RandomForestRegressor(random_state=1, n_estimators=100))
scores2 = cross_val_score(pipeline2, X_cat, y, scoring='neg_mean_absolute_error', cv=10)
print(abs(scores2.mean()))

17561.3441986


In [532]:
#X_cat.head()

### Trying XGBoost

In [552]:
# need to find the number of estimators again after switching to the scikit-learn version 

xgmodel = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05)
pipeline3 = make_pipeline(SimpleImputer(), xgmodel)
scores3 = cross_val_score(pipeline3, X, y, scoring='neg_mean_absolute_error', cv=10)
print(abs(scores3.mean()))


23467.2210312


In [534]:
# use this function to figure out the number of estimators in the xgboost model
def get_score_model_xg(xgmodel, train_X, val_X, train_y, val_Y):

    tr_X, ts_X, tr_y, ts_y = train_test_split(train_X, train_y, test_size=0.33, random_state=1)
    xgmodel.fit(tr_X, tr_y, n_iter_no_change=5, 
             eval_set=[(ts_X, ts_y)], verbose=False)
    
    best_ntree.append(xgmodel.best_ntree_limit)
    
    
    # Make validation predictions and calculate mean absolute error
    val_predictions = xgmodel.predict(val_X)
    val_mae = mean_absolute_error(val_predictions, val_y)
    return val_mae

### Train the model on all the data

In [542]:
# Train the model I have so far with all the data
#modelxg = XGBRegressor(n_estimators=35, learning_rate=0.05)
#model.fit(X,y)


#xgbm = GradientBoostingRegressor(n_estimators=100,learning_rate=0.05)
#xgbm.fit(X,y)


pipl = make_pipeline(SimpleImputer(), RandomForestRegressor(random_state=1, n_estimators=100))
#pipl.fit(X_cat,y) # doing this later, since we need to select the common columns from train and test data


### Load the test data, process it, predict and output 

In [536]:
# load the test data
test_data = pd.read_csv(test_file)

In [537]:
# any processing needed on the data
#test_X = test_data[features]
test_X_cat = pd.get_dummies(test_data)
#test_X_cat.head()

In [538]:
# make sure we have the same columns in both sets, otherwise errors
final_train_X_cat, final_test_X_cat = X_cat.align(test_X_cat, join='left', axis=1)

In [539]:
# train and make predictions
pipl.fit(final_train_X_cat,y)
test_preds = pipl.predict(final_test_X_cat)

In [554]:
# Partial dependence plots for sanity checks
# only works for gradient boosted stuff
# doesn't seem to be working with the pipeline :(
pipeline3.fit(final_train_X_cat,y)
my_plots = plot_partial_dependence(pipeline3,       
                                   features=[6], # column numbers of plots we want to show
                                   X=X,            # raw predictors data.
                                   feature_names=['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd'], # labels on graphs
                                   grid_resolution=8) # number of values to plot on x axis

ValueError: gbrt has to be an instance of BaseGradientBoosting

In [555]:
# write to file
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)