In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

%matplotlib inline

In [None]:
# Load the cleaned training data from the local datasets folder.
train = pd.read_csv('./datasets/train_clean2.csv')

In [None]:
# Load the cleaned test data from the local datasets folder.
test = pd.read_csv('./datasets/test_clean2.csv')

In [None]:
# Total number of missing cells across the whole training frame.
train.isna().sum().sum()

In [None]:
# Total number of missing cells across the whole test frame.
test.isna().sum().sum()

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index saved by an earlier
# to_csv -- confirm). Reassigning with errors='ignore' keeps the cell idempotent:
# the in-place drop raised KeyError when the cell was run a second time.
train = train.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index saved by an earlier
# to_csv -- confirm). Reassigning with errors='ignore' keeps the cell idempotent:
# the in-place drop raised KeyError when the cell was run a second time.
test = test.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Names of the numeric columns currently in the training frame.
numeric_train = train._get_numeric_data().columns.tolist()

In [None]:
# Names of the numeric columns currently in the test frame.
numeric_test = test._get_numeric_data().columns.tolist()

In [None]:
# One-hot encode `ms_zoning` (first level dropped as the baseline) and append the
# 'ms_z_*' indicator columns. Train and test are encoded independently, so a level
# present in only one frame produces a column the other frame lacks.
zoning_train = pd.get_dummies(train['ms_zoning'], prefix='ms_z', drop_first=True)
zoning_test = pd.get_dummies(test['ms_zoning'], prefix='ms_z', drop_first=True)
train = train.join(zoning_train)
test = test.join(zoning_test)

In [None]:
# One-hot encode `lot_shape` (first level dropped) into 'lot_sh_*' columns.
lot_shape_train = pd.get_dummies(train['lot_shape'], prefix='lot_sh', drop_first=True)
lot_shape_test = pd.get_dummies(test['lot_shape'], prefix='lot_sh', drop_first=True)
train = train.join(lot_shape_train)
test = test.join(lot_shape_test)

In [None]:
# One-hot encode `lot_config` (first level dropped) into 'lot_conf_*' columns.
lot_config_train = pd.get_dummies(train['lot_config'], prefix='lot_conf', drop_first=True)
lot_config_test = pd.get_dummies(test['lot_config'], prefix='lot_conf', drop_first=True)
train = train.join(lot_config_train)
test = test.join(lot_config_test)

In [None]:
# One-hot encode `mas_vnr_type` (first level dropped) into 'mas_vnr_typ_*' columns.
mas_vnr_train = pd.get_dummies(train['mas_vnr_type'], prefix='mas_vnr_typ', drop_first=True)
mas_vnr_test = pd.get_dummies(test['mas_vnr_type'], prefix='mas_vnr_typ', drop_first=True)
train = train.join(mas_vnr_train)
test = test.join(mas_vnr_test)

In [None]:
# One-hot encode `exter_qual` (first level dropped) into 'ext_qu_*' columns.
exter_qual_train = pd.get_dummies(train['exter_qual'], prefix='ext_qu', drop_first=True)
exter_qual_test = pd.get_dummies(test['exter_qual'], prefix='ext_qu', drop_first=True)
train = train.join(exter_qual_train)
test = test.join(exter_qual_test)

In [None]:
# One-hot encode `bsmt_qual` (first level dropped) into 'bsm_q_*' columns.
bsmt_qual_train = pd.get_dummies(train['bsmt_qual'], prefix='bsm_q', drop_first=True)
bsmt_qual_test = pd.get_dummies(test['bsmt_qual'], prefix='bsm_q', drop_first=True)
train = train.join(bsmt_qual_train)
test = test.join(bsmt_qual_test)

In [None]:
# One-hot encode `bsmtfin_type_1` (first level dropped) into 'bsm_fin_*' columns.
bsmtfin_train = pd.get_dummies(train['bsmtfin_type_1'], prefix='bsm_fin', drop_first=True)
bsmtfin_test = pd.get_dummies(test['bsmtfin_type_1'], prefix='bsm_fin', drop_first=True)
train = train.join(bsmtfin_train)
test = test.join(bsmtfin_test)

In [None]:
# One-hot encode `heating_qc` (first level dropped).
# NOTE(review): the prefix 'bsm_fin' is the same one used for the `bsmtfin_type_1`
# dummies, so the heating columns cannot be told apart from the basement-finish
# columns by name. It is kept unchanged because the feature lists further down
# select 'bsm_fin_*' names; renaming it means updating those lists as well.
heating_qc_train = pd.get_dummies(train['heating_qc'], prefix='bsm_fin', drop_first=True)
heating_qc_test = pd.get_dummies(test['heating_qc'], prefix='bsm_fin', drop_first=True)
train = train.join(heating_qc_train)
test = test.join(heating_qc_test)

In [None]:
# One-hot encode `kitchen_qual` (first level dropped) into 'kitchen_*' columns.
kitchen_qual_train = pd.get_dummies(train['kitchen_qual'], prefix='kitchen', drop_first=True)
kitchen_qual_test = pd.get_dummies(test['kitchen_qual'], prefix='kitchen', drop_first=True)
train = train.join(kitchen_qual_train)
test = test.join(kitchen_qual_test)

In [None]:
# One-hot encode `garage_type` (first level dropped) into 'grg_typ_*' columns.
garage_type_train = pd.get_dummies(train['garage_type'], prefix='grg_typ', drop_first=True)
garage_type_test = pd.get_dummies(test['garage_type'], prefix='grg_typ', drop_first=True)
train = train.join(garage_type_train)
test = test.join(garage_type_test)

In [None]:
# One-hot encode `sale_type` (first level dropped) into 'sale_typ_*' columns.
sale_type_train = pd.get_dummies(train['sale_type'], prefix='sale_typ', drop_first=True)
sale_type_test = pd.get_dummies(test['sale_type'], prefix='sale_typ', drop_first=True)
train = train.join(sale_type_train)
test = test.join(sale_type_test)

In [None]:
# Integer feature built from the characters at positions 2-3 (zero-based) of the
# `pid` value's string form. Despite the column name, this is not the trailing part.
train['pid_suffix'] = train['pid'].astype(str).str[2:4].astype(int)
test['pid_suffix'] = test['pid'].astype(str).str[2:4].astype(int)

In [None]:
# Expand a hand-picked set of features into polynomial / interaction terms.
# The degree is left at the scikit-learn default; include_bias=False omits the
# constant column.
poly = PolynomialFeatures(include_bias=False)
poly_feat = ['overall_qual', 'year_built', 'fireplaces', 'totrms_abvgrd',
             'mas_vnr_area', 'open_porch_sf', 'wood_deck_sf']

poly_train = train[poly_feat]
poly_test = test[poly_feat]

# Fit on the training frame only and reuse the fitted transformer for the test
# frame, instead of re-fitting on test data.
X_poly_train = poly.fit_transform(poly_train)
X_poly_test = poly.transform(poly_test)


In [None]:
# Column labels for the expanded matrices. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() when the installed version
# has it and fall back to the old method otherwise.
if hasattr(poly, 'get_feature_names_out'):
    poly_cols = list(poly.get_feature_names_out(poly_feat))
else:
    poly_cols = poly.get_feature_names(poly_feat)

# The degree-1 columns (poly_feat) already exist in train/test, so they are
# dropped before joining to avoid overlapping names. The frame index is passed
# explicitly so the join aligns row-for-row even if the index is not 0..n-1.
train = train.join(
    pd.DataFrame(X_poly_train, columns=poly_cols, index=train.index).drop(columns=poly_feat)
)
test = test.join(
    pd.DataFrame(X_poly_test, columns=poly_cols, index=test.index).drop(columns=poly_feat)
)


In [None]:
# Three-way garage interaction: area x car capacity x year built.
# The 'garge' spelling is kept because the feature lists below select the column
# by this exact name.
for frame in (train, test):
    frame['garge_area*cars*yr'] = (
        frame['garage_area'] * frame['garage_cars'] * frame['garage_yr_blt']
    )

In [None]:
# Interaction of above-grade living area with overall quality.
train['sf*qual'] = train['gr_liv_area'].mul(train['overall_qual'])
test['sf*qual'] = test['gr_liv_area'].mul(test['overall_qual'])

In [None]:
# Interaction of full-bath count with total rooms above grade.
for frame in (train, test):
    frame['bath*totrms'] = frame['full_bath'] * frame['totrms_abvgrd']

In [None]:
# Target is the sale price; the candidate predictors are all remaining numeric columns.
y_train_ = train['saleprice']
X_train_ = train._get_numeric_data().drop(columns='saleprice')

In [None]:
# All numeric columns of the test frame. This value is replaced further down by
# an explicit column selection.
X_test_ = test._get_numeric_data()

In [116]:
# Add an intercept column, then fit ordinary least squares of sale price on
# every candidate predictor.
X_train_ = sm.add_constant(X_train_)
ols_spec = sm.OLS(endog=y_train_, exog=X_train_)
model_train = ols_spec.fit()

(2051, 122)

In [None]:
# The summary is built but its display is suppressed by the trailing semicolon.
model_train.summary();

# Keep the terms whose p-value is below 0.05, as a two-column table (col, pval).
significant_pvals = model_train.pvalues.loc[model_train.pvalues < 0.05]
p_train_df = (
    significant_pvals
    .to_frame(name='pval')
    .reset_index()
    .rename(columns={'index': 'col'})
)

In [None]:
# Names of the terms that passed the p-value filter, as a plain list.
p_train_list = p_train_df['col'].tolist()

In [None]:
# Keep 'id' alongside the selected terms. The membership check makes the cell
# safe to re-run: a bare append added a duplicate 'id' on every execution.
if 'id' not in p_train_list:
    p_train_list.append('id')

In [117]:
# Predictors kept for the final model (presumably the p < 0.05 terms from the
# OLS fit above, written out by hand -- confirm).
# NOTE(review): 'id' is part of this list, so it is fed to the model as a
# predictor; the submission cell later reads it back from X_test_.
# The trailing space in 'sale_typ_WD ' is part of the column name.
selected_features = [
    'ms_subclass',
    'lot_area',
    'overall_cond',
    'mas_vnr_area',
    'bsmtfin_sf_1',
    'bsmtfin_sf_2',
    'total_bsmt_sf',
    '1st_flr_sf',
    '2nd_flr_sf',
    'gr_liv_area',
    'bsmt_full_bath',
    'full_bath',
    'kitchen_abvgr',
    'fireplaces',
    'garage_area',
    'enclosed_porch',
    'screen_porch',
    'pool_area',
    'misc_val',
    'ms_z_FV',
    'ms_z_RH',
    'ms_z_RL',
    'ms_z_RM',
    'lot_sh_IR3',
    'lot_sh_Reg',
    'lot_conf_CulDSac',
    'mas_vnr_typ_None',
    'ext_qu_Fa',
    'ext_qu_Gd',
    'ext_qu_TA',
    'bsm_q_Fa',
    'bsm_q_Gd',
    'bsm_q_None',
    'bsm_q_TA',
    'bsm_fin_None',
    'bsm_fin_TA',
    'kitchen_Fa',
    'kitchen_Gd',
    'kitchen_TA',
    'grg_typ_Attchd',
    'grg_typ_BuiltIn',
    'grg_typ_Detchd',
    'sale_typ_Con',
    'sale_typ_New',
    'sale_typ_Oth',
    'sale_typ_WD ',
    'pid_suffix',
    'overall_qual^2',
    'overall_qual fireplaces',
    'overall_qual totrms_abvgrd',
    'year_built fireplaces',
    'year_built mas_vnr_area',
    'fireplaces totrms_abvgrd',
    'totrms_abvgrd^2',
    'totrms_abvgrd open_porch_sf',
    'mas_vnr_area^2',
    'mas_vnr_area open_porch_sf',
    'open_porch_sf^2',
    'garge_area*cars*yr',
    'sf*qual',
    'bath*totrms',
    'id',
]
X_train_ = train[selected_features]
y_train_ = train['saleprice']

In [118]:
# Sanity check: (rows, selected feature count) of the training matrix.
X_train_.shape

(2051, 62)

In [119]:
# Sanity check: the target must have one entry per training row.
y_train_.shape

(2051,)

In [120]:
# Select the same predictors, in the same order, as the training matrix.
# Deriving the names from X_train_.columns replaces a second hand-maintained
# copy of the 62-name list, so the two selections cannot drift apart. A column
# missing from `test` still raises KeyError, exactly as the literal list did.
X_test_ = test[list(X_train_.columns)]

In [121]:
# Hold out part of the labelled training data for validation (seeded for
# reproducibility). Note the naming: X_test / y_test are this hold-out split,
# while X_test_ (trailing underscore) is the matrix built from the `test` frame.
X_train, X_test, y_train, y_test = train_test_split(X_train_, y_train_, random_state=42)

In [122]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both the training and hold-out splits.
ss = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

In [123]:
# Plain least-squares linear model with default settings.
lr = LinearRegression()

In [124]:
# Mean 5-fold cross-validation score (the estimator's default scorer) on the
# scaled training split.
cv_scores = cross_val_score(lr, X_train_sc, y_train, cv=5)
cv_scores.mean()

0.8207079310453288

In [125]:
# Fit on the scaled training split and report the in-sample score.
# fit() returns the estimator itself, so the two calls can be chained.
lr.fit(X_train_sc, y_train).score(X_train_sc, y_train)

0.8896528860579465

In [126]:
# Score on the hold-out split, which the model was not fitted on.
lr.score(X_test_sc, y_test)

0.9036077599705857

In [127]:
# Training columns that the test matrix lacks, in training order; an empty list
# means the two matrices are aligned.
X_train_.columns[~X_train_.columns.isin(X_test_.columns)].tolist()

[]

In [128]:
# Scale the full test matrix with the already-fitted scaler (transform only, no refit).
X_test_final = ss.transform(X_test_)

In [129]:
# Model predictions for every row of the scaled test matrix.
predictions = lr.predict(X_test_final)

In [130]:
# No-op: the trailing semicolon suppresses the display of X_test_.
X_test_;

In [131]:
# Submission table: the test ids (relabelled 'Id') paired with the predicted prices.
sec_subm = (
    X_test_[['id']]
    .rename(columns={'id': 'Id'})
    .assign(SalePrice=predictions)
)

In [132]:
# Display the submission table for a visual check.
sec_subm

Unnamed: 0,Id,SalePrice
0,2658,134395.419892
1,2718,154251.833671
2,2414,223532.476201
3,1989,110723.923080
4,625,202462.350083
...,...,...
874,1662,197981.358549
875,1234,188804.895415
876,1373,135224.497909
877,1672,108030.999582


In [133]:
# Write the submission file; index=False keeps the row index out of the CSV.
sec_subm.to_csv('datasets/sec_subm.csv', index=False)

In [None]:
# Prints every column of X that X_test1 lacks.
# NOTE(review): neither `X` nor `X_test1` is defined in any visible cell above,
# so this cell raises NameError on a fresh Restart & Run All. It looks like a
# leftover from the first submission -- confirm, then remove it or restore the
# missing definitions.
for col in X.columns: 
    if col not in X_test1.columns:
        print(col)

In [None]:
# NOTE(review): this scales X_test (the hold-out split from train_test_split),
# not X_test1 as the result name suggests -- confirm which input was intended.
X_test1_final = ss.transform(X_test)

In [None]:
# NOTE(review): rebinds `predictions`, replacing the values that were used to
# build sec_subm above.
predictions = lr.predict(X_test1_final)

In [None]:
# Submission table for the first attempt.
# NOTE(review): `X_test1` is not defined in any visible cell, and the id column
# is spelled 'Id' here but 'id' in the second submission -- confirm.
first_subm = pd.DataFrame({'Id': X_test1['Id'], 
                                   'SalePrice': predictions})

In [None]:
# Display the first submission table for a visual check.
first_subm

In [None]:
first_subm.to_csv(path_or_buf='datasets/first_subm.csv', index=False)
# index=False keeps the row index out of the file, so no 'Unnamed: 0' column
# appears when the CSV is read back.