### Imports

In [137]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import pickle
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score

%matplotlib inline

In [138]:
train_df = pickle.load(open('./pickles/Clean_DataFrame_v3.pkl', 'rb'))
kaggle_df = pd.read_csv('./datasets/test.csv')
kaggle_df = kaggle_df.set_index('Id')


### Cleaning Test Data the same way Training Data was Cleaned

In [139]:
kaggle_df.columns = [column_name.lower().replace(' ','_') for column_name in kaggle_df.columns]

In [140]:
def impute_avg_of_neighborhood(df):
    
    #for later use in the function of math.isnan(x)
    #Return True if x is a NaN (not a number), and False otherwise.
    import math
    
    #returns a list of all the columns with any null values
    null_columns = df.columns[df.isna().any()].tolist()
    
    for i in null_columns:
        if df[i].dtype == 'object':
            df[i].replace(np.nan, 'none', inplace = True)
    
    for i in null_columns:
        if df[i].dtype == 'float64':
            
            #getting indexes of all null values in that column
            null_indexes = df[df[i].isnull()].index
            
            #creating a dictionary to store mean values of each neighborhood per the column variable
            key = list(df[['neighborhood', i]].groupby('neighborhood').mean().index)
            value = list(df[['neighborhood', i]].groupby('neighborhood').mean()[i].values)
            mean_dict = dict(zip(key, value))
            
            #of the null columns, if a relating categorical column says "none", then impute with 0.0's 
            for x in null_indexes:
                if 'mas_vnr' in i:
                    if df.at[ x,'mas_vnr_type'] == 'none':
                        df.at[x, i] = 0.0
                elif 'bsmt' in i:  
                    if df.at[ x,'bsmt_qual'] == 'none':
                        df.at[x, i] = 0.0
                elif 'pool' in i:  
                    if df.at[ x,'pool_qc'] == 'none':
                        df.at[x, i] = 0.0                 
                elif 'garage' in i:  
                    if df.at[ x,'garage_type'] == 'none':
                        df.at[x, i] = 0.0
                        
                    else:
                        mean_dict_key = df.at[x,'neighborhood']
                    
                        if math.isnan(mean_dict[mean_dict_key]):
                    
                            df.at[x, i] = 0
                        
                        else:
                            df.at[x, i] = mean_dict[mean_dict_key]
               
            #otherwise impute with the avg value of the house's neighborhood
                else:
                    mean_dict_key = df.at[x,'neighborhood']
                    
                    if math.isnan(mean_dict[mean_dict_key]):
                    
                        df.at[x, i] = 0
                        
                    else:
                        df.at[x, i] = mean_dict[mean_dict_key]
                    
    return df

In [141]:
impute_avg_of_neighborhood(kaggle_df).shape

(878, 79)

In [142]:
kaggle_df.isnull().sum().sort_values(ascending = True)

pid              0
fireplaces       0
functional       0
totrms_abvgrd    0
kitchen_qual     0
                ..
exterior_1st     0
roof_matl        0
roof_style       0
exter_cond       0
sale_type        0
Length: 79, dtype: int64

Creating Ordinal Variables

In [143]:
bsmt_qual_dict = dict(zip(['none', 'Po', 'Fa', 'TA','Gd', 'Ex'], range(6)))
kaggle_df['bsmt_qual'] = kaggle_df['bsmt_qual'].apply(lambda x:bsmt_qual_dict[x])

kitchen_qual_dict = dict(zip(['Po', 'Fa', 'TA','Gd', 'Ex'], range(1, 6)))
kaggle_df['kitchen_qual'] = kaggle_df['kitchen_qual'].apply(lambda x:kitchen_qual_dict[x])

exter_qual_dict = dict(zip(['Po', 'Fa', 'TA','Gd', 'Ex'], range(1, 6)))
kaggle_df['exter_qual'] = kaggle_df['exter_qual'].apply(lambda x:exter_qual_dict[x])

heating_qc_dict = dict(zip(['Po', 'Fa', 'TA','Gd', 'Ex'], range(1, 6)))
kaggle_df['heating_qc'] = kaggle_df['heating_qc'].apply(lambda x:heating_qc_dict[x])

fireplace_qu_dict = dict(zip(['none', 'Po', 'Fa', 'TA','Gd', 'Ex'], range(6)))
kaggle_df['fireplace_qu'] = kaggle_df['fireplace_qu'].apply(lambda x:fireplace_qu_dict[x])

garage_finish_dict = dict(zip(['none', 'Unf', 'RFn', 'Fin'], range(4)))
kaggle_df['garage_finish'] = kaggle_df['garage_finish'].apply(lambda x:garage_finish_dict[x])

Creating Categorical Dummy Variables

In [144]:
kaggle_df = pd.get_dummies(kaggle_df, columns = ['foundation', 'bsmtfin_type_1', 'neighborhood', 'mas_vnr_type'] )
kaggle_df.columns = [column_name.lower().replace(' ','_') for column_name in kaggle_df.columns]

In [145]:
kaggle_df['overall_livingarea'] = kaggle_df['overall_qual']* kaggle_df['gr_liv_area']

Evaluating My Model here:

In [146]:
y =  np.log(train_df['saleprice'])
X = train_df[['overall_qual', 'overall_cond', 'ms_subclass','enclosed_porch', 'gr_liv_area', 'garage_area', 
         '1st_flr_sf', 'total_bsmt_sf', 'year_built', 'year_remod/add','full_bath'
        ,'mas_vnr_area', 'fireplaces', 'bsmtfin_sf_1', 'bsmtfin_sf_2','lot_frontage', 
        'lot_area', 'screen_porch', 'open_porch_sf', 'mo_sold','bsmt_qual', 'kitchen_qual',
              'exter_qual', 'heating_qc', 'fireplace_qu', 'garage_finish', 'foundation_pconc', 'bsmtfin_type_1_glq', 
              'neighborhood_nridght','neighborhood_stonebr', 'mas_vnr_type_none', 'overall_livingarea']]

In [147]:
model = LinearRegression()
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [148]:
y_hat = model.predict(X)

In [149]:
np.sqrt(metrics.mean_squared_error(np.exp(y), np.exp(y_hat)))

21539.744899171856

In [150]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 31)
cross_val_score(model, X_train, y_train, cv = 5).mean()

0.8999358610428766

In [151]:
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

0.9053367485477202
0.896860893237959


Applying Model Features to Test Data

In [152]:
X_kaggle = kaggle_df[['overall_qual', 'overall_cond', 'ms_subclass','enclosed_porch', 'gr_liv_area', 'garage_area', 
         '1st_flr_sf', 'total_bsmt_sf', 'year_built', 'year_remod/add','full_bath'
        ,'mas_vnr_area', 'fireplaces', 'bsmtfin_sf_1', 'bsmtfin_sf_2','lot_frontage', 
        'lot_area', 'screen_porch', 'open_porch_sf', 'mo_sold','bsmt_qual', 'kitchen_qual',
              'exter_qual', 'heating_qc', 'fireplace_qu', 'garage_finish', 'foundation_pconc', 'bsmtfin_type_1_glq', 'neighborhood_nridght','neighborhood_stonebr', 'mas_vnr_type_none', 'overall_livingarea']]

In [153]:
kaggle_df['pred_price'] = np.exp(model.predict(X_kaggle))

kaggle_preds = kaggle_df['pred_price']

In [154]:
submission = pd.DataFrame(kaggle_preds, index = kaggle_df.index)
submission.columns = ['SalePrice']

In [155]:
submission

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,139860.945735
2718,161886.800279
2414,205302.162982
1989,106484.014806
625,173020.204220
...,...
1662,189030.286911
1234,219953.668105
1373,131048.857395
1672,113341.845450


Exporting Submission File

In [156]:
submission.to_csv('./datasets/submission9.csv')