In [1]:
cd /Users/emilyvincett/Downloads/ThinkStats2-master/code

/Users/emilyvincett/Downloads/ThinkStats2-master/code


In [2]:
import pandas as pd
import numpy as np 
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

# Read test and train files into the program

In [3]:
train = pd.read_csv('/Users/emilyvincett/Downloads/home-data-for-ml-course/train.csv')
test = pd.read_csv('/Users/emilyvincett/Downloads/home-data-for-ml-course/test.csv')

In [4]:
# Combine both sets together for some preprocessing and create a copy
file = train.append(test)
file = file.reset_index(drop=True)
file2 = file.copy()

# Class and Function Definitions

In [5]:
# These functions will be used to convert categorical variables into number 
# based on their ranks corresponding to sale price
# It also addressed measurement errors associated with how the values have been recorded, 
# as ranks are objective.

def replace(original,new,df):
    file2[df].replace(original,new,inplace=True)

class convert:
    def __init__(self,col1,col2):
        self.col1 = col1
        self.col2 = col2
        arranged = file2.groupby(self.col1)[self.col2].median().sort_values(ascending=True)
        for a in enumerate(arranged.index.values):
            replace(a[1],a[0],self.col1) 

In [6]:
from sklearn.preprocessing import OneHotEncoder

# A list of the categorical columns that need to be changed into ranks.

recode = ['MSSubClass','MSZoning','LotShape','LandContour','Utilities','LotConfig','Neighborhood',
          'Condition1', 'Condition2','BldgType','HouseStyle','OverallQual','OverallCond','RoofStyle',
          'RoofMatl','Exterior1st','Exterior2nd','MasVnrType','ExterQual','ExterCond','Foundation',
          'BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','Heating','HeatingQC',
          'CentralAir','Electrical','KitchenQual','FireplaceQu','GarageType','Functional',
          'GarageFinish','GarageQual','GarageCond','PavedDrive','PoolQC','Fence','MiscFeature','MoSold',
          'YrSold','SaleType','SaleCondition','Street','LandSlope'
         ]

for a in recode:
    c = convert(a,'SalePrice')

# There are 36 other columns that haven't been recoded that need to be dealt with. Figure out how
leftovers = []
for a in file2.columns:
    if a not in recode:
        leftovers.append(a)
        
# Numerical columns (discrete & continuous)       
numerical_cols = file2.iloc[:,1:-1].select_dtypes(exclude='object').columns

# The code below fills the null values in the numerical columns by building a pair 
# with the column it has the highest correlation with

col1 = []
col2 = []
corr = file2[numerical_cols].corr(method='spearman')
for a in (corr.index):
    col1.append(a)
    col2.append(corr[a].nlargest(3).sort_values(ascending=True)[:1].index.values[0])
corr_pairs = list(zip(col1,col2))

# Then groups the dataframe by the column to filled and fills its null values based on it's median value 
# with the column of its strongest correlation
for a,b in corr_pairs:
    file2[a] = file2[a].fillna(file2.groupby(b)[a].transform('median'))
    
# still needs to be filled
for a in numerical_cols:
    nulls = (len(file2[file2[a].isna()]))
    if nulls>0:
        file2[a] = file2[a].fillna(file2.groupby(b)[a].transform('median'))

# Seperate the target variable (Sales Price) and saved it in variable y.
y = file2.iloc[0:1460,-1]
file2.drop(['SalePrice','Alley'],axis=1,inplace=True)

# Split the combined dataframe back into test and train sets

In [7]:
train_1 = file2.iloc[0:1460,1:]
test_1 = file2.iloc[1460:,1:]
test_id = file2['Id'].iloc[1460:]
test_1.shape

(1459, 78)

# Train an XGB Regressor

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
import xgboost as xgb
from sklearn.metrics import mean_squared_error

X_train,X_test,y_train,y_test = train_test_split(train_1,y,test_size = 0.3,random_state=0)

xg_reg = xgb.XGBRegressor(learning_rate=0.1,n_estimators=10000,max_depth=3,
                         subsample=0.6,colsample_bytree=0.5,gamma=0.5)
xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test,preds))
print('%4f'%(rmse))

26519.732007


# Pair the predicted sales prices with the requisite Id's for submission

In [9]:
test_pred = xg_reg.predict(test_1)
test_pred = list(test_pred)
test_pred = pd.DataFrame(list(zip(test_id,test_pred)))
test_pred.rename({0:'Id',1:'SalePrice'},axis=1,inplace=True)
test_pred

Unnamed: 0,Id,SalePrice
0,1461,115980.468750
1,1462,167492.031250
2,1463,185023.671875
3,1464,185061.671875
4,1465,184884.953125
...,...,...
1454,2915,78739.546875
1455,2916,74731.140625
1456,2917,178206.015625
1457,2918,119686.125000


# Save predictions as a CSV file

In [10]:
filename = 'Price Predictions 0.5.csv'
test_pred.to_csv(filename,index=False)
print('Saving '+filename)

Saving Price Predictions 0.5.csv


In [12]:
# Predictions are in the top 8% globally for all 63000 submissions