In [1]:
# standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
%matplotlib inline

#Stylistic Choice:
plt.style.use('bmh')
sns.set_palette("RdBu_r")

# modeling imports
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score,cross_val_predict
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
# other imports
import missingno as msno

In [2]:
train = pd.read_csv('data/train_cleaned.csv')

In [3]:
ordinal = ['extercond', 'bsmtcond', 'heatingqc', 'fireplacequ', 'garagequal',
       'garagecond', 'bsmtexposure', 'bsmtfintype1', 'bsmtfintype2',
       'garagefinish', 'fence', 'landslope', 'lotshape', 'overallqual',
       'overallcond']
numerical = ['lotarea', 'yearremod/add', 'masvnrarea', 'bsmtfinsf1', 'bsmtfinsf2',
       'bsmtunfsf', '1stflrsf', '2ndflrsf', 'grlivarea', 'bsmtfullbath',
       'bsmthalfbath', 'fullbath', 'halfbath', 'bedroomabvgr', 'kitchenabvgr',
       'fireplaces', 'garagearea', 'wooddecksf', 'saleprice', 'houseage']
nominal = ['mssubclass', 'mszoning', 'landcontour', 'lotconfig', 'neighborhood',
       'condition1', 'bldgtype', 'housestyle', 'roofstyle', 'exterior2nd',
       'masvnrtype', 'foundation', 'electrical', 'functional', 'garagetype',
       'paveddrive', 'saletype', 'pool', 'centralair', 'porch']

In [4]:
train_no = train[nominal]
train_num = train[numerical]
train_ord = train[ordinal]

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2047 entries, 0 to 2046
Data columns (total 56 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             2047 non-null   int64  
 1   mssubclass     2047 non-null   int64  
 2   mszoning       2047 non-null   object 
 3   lotarea        2047 non-null   int64  
 4   lotshape       2047 non-null   int64  
 5   landcontour    2047 non-null   object 
 6   lotconfig      2047 non-null   object 
 7   landslope      2047 non-null   int64  
 8   neighborhood   2047 non-null   object 
 9   condition1     2047 non-null   int64  
 10  bldgtype       2047 non-null   object 
 11  housestyle     2047 non-null   object 
 12  overallqual    2047 non-null   int64  
 13  overallcond    2047 non-null   int64  
 14  yearremod/add  2047 non-null   int64  
 15  roofstyle      2047 non-null   object 
 16  exterior2nd    2047 non-null   int64  
 17  masvnrtype     2047 non-null   object 
 18  masvnrar

In [6]:
# Correcting for wrongly imported data:
    # MSSubClass Datatype:
train['mssubclass']=train['mssubclass'].astype(str)
    # garagetype null values:
train['garagetype'].fillna(value="NA",inplace=True)

In [7]:
dummy = pd.get_dummies(train_no,drop_first=True)
dummy.shape

(2047, 86)

In [8]:
combined = train_num.join(train_ord).join(dummy)
combined = combined.reset_index()
combined.head()

Unnamed: 0,index,lotarea,yearremod/add,masvnrarea,bsmtfinsf1,bsmtfinsf2,bsmtunfsf,1stflrsf,2ndflrsf,grlivarea,...,functional_Sal,functional_Sev,functional_Typ,garagetype_Attchd,garagetype_Basment,garagetype_BuiltIn,garagetype_CarPort,garagetype_Detchd,paveddrive_P,paveddrive_Y
0,0,13517,2005,289.0,533.0,0.0,192.0,725,754,1479,...,0,0,1,1,0,0,0,0,0,1
1,1,11492,1997,132.0,637.0,0.0,276.0,913,1209,2122,...,0,0,1,1,0,0,0,0,0,1
2,2,7922,2007,0.0,731.0,0.0,326.0,1057,0,1057,...,0,0,1,0,0,0,0,1,0,1
3,3,9802,2007,0.0,0.0,0.0,384.0,744,700,1444,...,0,0,1,0,0,1,0,0,0,1
4,4,14235,1993,0.0,0.0,0.0,676.0,831,614,1445,...,0,0,1,0,0,0,0,1,0,0


### Analysis of Non-Chosen Variables

In [9]:
combined.shape

(2047, 122)

In [10]:
combined.columns

Index(['index', 'lotarea', 'yearremod/add', 'masvnrarea', 'bsmtfinsf1',
       'bsmtfinsf2', 'bsmtunfsf', '1stflrsf', '2ndflrsf', 'grlivarea',
       ...
       'functional_Sal', 'functional_Sev', 'functional_Typ',
       'garagetype_Attchd', 'garagetype_Basment', 'garagetype_BuiltIn',
       'garagetype_CarPort', 'garagetype_Detchd', 'paveddrive_P',
       'paveddrive_Y'],
      dtype='object', length=122)

In [24]:
combined_new = combined.drop(columns='saleprice')

In [25]:
combined_new.shape

(2047, 121)

In [26]:
poly_combined = PolynomialFeatures(include_bias=False)

In [27]:
combined_poly = poly_combined.fit_transform(combined_new)

In [28]:
len(combined_poly)

2047

In [29]:
poly_combined_cols=poly_combined.get_feature_names(combined_new.columns)
len(poly_combined_cols)

7502

In [30]:
poly_combined_df=pd.DataFrame(combined_poly,columns=poly_combined_cols)
poly_combined_df.shape

(2047, 7502)

In [31]:
poly_combined_corr = poly_combined_df.corr()[['saleprice']].sort_values('saleprice',ascending=False)
poly_combined_corr_filtered = poly_combined_corr[poly_combined_corr['saleprice']>0.8].index.tolist()

KeyboardInterrupt: 

In [None]:
poly_combined_corr_filtered

In [None]:
def subplot_regplot(dataframe,list_of_columns, suptitle, list_of_titles,datafigsize):
    fig, ax = plt.subplots((len(list_of_columns)//3)+1, 3,figsize= datafigsize) 
    fig.suptitle(suptitle,fontsize=30)
    ax = ax.ravel() # Ravel turns a matrix into a vector, which is easier to iterate
    plt.tight_layout(h_pad = 5,pad=5);
    for i, column in enumerate(list_of_columns): # Gives us an index value to get into all our lists
        sns.regplot(data= dataframe,x=dataframe[[column]],y='saleprice',ax=ax[i]) 
        ax[i].set_title(list_of_titles[i],fontdict={'fontsize': 15})
        ax[i].xaxis.set_visible(True)

In [None]:
subplot_regplot(poly_combined_df,poly_combined_corr_filtered,"Polynomial/Interaction Terms vs. Saleprice",poly_combined_corr_filtered,(20,65))