In [22]:
# now read in the new function
from Helpers import read_dataframe, x_y_split
from Preprocessing import preprocessor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Loading Training Data

In [23]:
train = read_dataframe(path = 'train.csv')
x_train, y_train = x_y_split(df=train)
x_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


## Building a Pipeline 
- The pipline is made from the pre-processing object and the GradientBoosting model from the previous step.

In [24]:
cols_to_filter=['1stFlrSF','3SsnPorch','Alley','BldgType','BsmtFinSF2','BsmtFinType1','BsmtFinType2','BsmtHalfBath',
                'Condition2','Exterior1st','Exterior2nd', 'Fence', 'Functional', 'GarageCars', 'GarageCond',
                  'GarageType','GarageYrBlt','HeatingQC','HouseStyle','Id',  'KitchenAbvGr', 'LandContour',
                  'LandSlope',  'LowQualFinSF','MSSubClass', 'MiscFeature', 'MiscVal', 'MoSold', 'PavedDrive',
                   'PoolArea','PoolQC',  'RoofMatl','RoofStyle', 'TotRmsAbvGrd', 'Utilities', 'YrSold']

cols_to_impute_minus_1 = ['LotFrontage','FireplaceQu', 'GarageFinish', 'GarageQual', 'MasVnrType','Electrical']               

In [25]:
p = preprocessor(cols_to_filter, cols_to_impute_minus_1)        

In [26]:
gbm = GradientBoostingRegressor(n_estimators = 1000, learning_rate = 0.05, max_depth = 5, max_features = 50)

In [27]:
my_model = Pipeline([('preprocessor', p), ('regressor', gbm)])

In [28]:
my_model.fit(x_train, y_train)

Pipeline(steps=[('preprocessor',
                 <Preprocessing.preprocessor object at 0x0000020FBC4EB6A0>),
                ('regressor',
                 GradientBoostingRegressor(learning_rate=0.05, max_depth=5,
                                           max_features=50,
                                           n_estimators=1000))])

## Predict Test Dataset

In [29]:
x_test = pd.read_csv('test.csv')

In [30]:
predictions = my_model.predict(x_test)

## Create a Submission file to Upload to Kaggle

In [34]:
test_Id = x_test.loc[:,'Id']

In [36]:
submission_file = pd.DataFrame(test_Id, columns = ['Id'])
submission_file['SalePrice'] = predictions
submission_file

Unnamed: 0,Id,SalePrice
0,1461,129244.536095
1,1462,161658.243876
2,1463,183011.612097
3,1464,190559.171617
4,1465,186620.033344
...,...,...
1454,2915,79269.965503
1455,2916,82063.714533
1456,2917,183891.232953
1457,2918,110928.786663


In [40]:
submission_file.to_csv('submission_file.csv', index = False)