In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the train and test CSVs from the shared datasets folder.
train_clean, test_clean = (
    pd.read_csv(csv_path)
    for csv_path in ('../datasets/train_clean.csv', '../datasets/test_clean.csv')
)

## Preprocessing Categorical Features

In [3]:
# Columns examined (and later used as model features) in this notebook.
# 'neighborhood' and 'paved_drive' are object dtype in both CSVs; the
# remaining columns are already numeric (int64 / float64).
explore = [
    'neighborhood', 
    'paved_drive',
    'gr_liv_area',
    'overall_qual',
    'garage_area',
    'total_bsmt_sf',
    'year_built',
    'year_remod_add',
    'mas_vnr_area',
    'fireplaces',
    'bsmtfin_sf_1',
    'central_air']

In [4]:
def null_type(lis, df=None):
    """Return null counts and dtypes for the selected columns.

    Parameters
    ----------
    lis : list of str
        Column labels to inspect.
    df : pandas.DataFrame, optional
        Frame to inspect. Defaults to the notebook-level ``train_clean`` so
        existing ``null_type(cols)`` calls keep working; pass another frame
        (e.g. ``test_clean``) to run the same check on it.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        Null count per column and dtype per column, both indexed by ``lis``.
    """
    if df is None:
        df = train_clean
    # Select the columns first so nulls are only counted where needed.
    return df[lis].isnull().sum(), df.dtypes[lis]

In [5]:
# Null counts and dtypes of the explore columns in train_clean.
null_type(explore)

(neighborhood      0
 paved_drive       0
 gr_liv_area       0
 overall_qual      0
 garage_area       0
 total_bsmt_sf     0
 year_built        0
 year_remod_add    0
 mas_vnr_area      0
 fireplaces        0
 bsmtfin_sf_1      0
 central_air       0
 dtype: int64,
 neighborhood       object
 paved_drive        object
 gr_liv_area       float64
 overall_qual        int64
 garage_area       float64
 total_bsmt_sf     float64
 year_built          int64
 year_remod_add      int64
 mas_vnr_area      float64
 fireplaces          int64
 bsmtfin_sf_1      float64
 central_air         int64
 dtype: object)

In [6]:
# Same check on test_clean: null counts, then dtypes, for the explore columns.
test_clean.isnull().sum()[explore], test_clean.dtypes[explore]

(neighborhood      0
 paved_drive       0
 gr_liv_area       0
 overall_qual      0
 garage_area       0
 total_bsmt_sf     0
 year_built        0
 year_remod_add    0
 mas_vnr_area      0
 fireplaces        0
 bsmtfin_sf_1      0
 central_air       0
 dtype: int64,
 neighborhood       object
 paved_drive        object
 gr_liv_area       float64
 overall_qual        int64
 garage_area       float64
 total_bsmt_sf     float64
 year_built          int64
 year_remod_add      int64
 mas_vnr_area      float64
 fireplaces          int64
 bsmtfin_sf_1      float64
 central_air         int64
 dtype: object)

In [7]:
# Distinct paved_drive categories present in train_clean.
train_clean['paved_drive'].unique()

array(['Y', 'N', 'P'], dtype=object)

In [8]:
# Distinct paved_drive categories present in test_clean, for comparison with train_clean.
test_clean['paved_drive'].unique()

array(['Y', 'N', 'P'], dtype=object)

### Set up a train and test split to preprocess categorical data

In [9]:
# Feature matrix (explore columns) and target (sale price).
X, y = train_clean[explore], train_clean['saleprice']

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=6,
)

- Convert Paved Driveway to ordinal values

In [10]:
# Ordinal codes for paved_drive: 'Y' -> 2, 'P' -> 1, 'N' -> 0.
driveway_dict = dict(Y=2, P=1, N=0)

In [11]:
# Encode paved_drive as an ordinal feature. ``assign`` returns new frames
# instead of writing into the slices produced by train_test_split, which
# avoids pandas' SettingWithCopyWarning; column order is unchanged.
X_train = X_train.assign(paved_drive=X_train['paved_drive'].map(driveway_dict))
X_test = X_test.assign(paved_drive=X_test['paved_drive'].map(driveway_dict))

# ``map`` turns any value missing from driveway_dict into NaN (this also
# happens if the cell is run twice), so fail loudly rather than pass NaNs on.
assert X_train['paved_drive'].notna().all(), 'unmapped paved_drive value in X_train'
assert X_test['paved_drive'].notna().all(), 'unmapped paved_drive value in X_test'

In [12]:
# Confirm the mapped column is numeric.
X_train['paved_drive'].dtype

dtype('int64')

In [13]:
# Learn the neighborhood categories from the training split only;
# handle_unknown='ignore' keeps transform from raising on categories
# that were not seen during fit.
one_hot = OneHotEncoder(handle_unknown='ignore').fit(X_train[['neighborhood']])
one_hot

OneHotEncoder(handle_unknown='ignore')

In [14]:
# Names of the dummy columns produced by the fitted encoder (one per neighborhood seen in X_train).
one_hot.get_feature_names_out()

array(['neighborhood_Blmngtn', 'neighborhood_Blueste',
       'neighborhood_BrDale', 'neighborhood_BrkSide',
       'neighborhood_ClearCr', 'neighborhood_CollgCr',
       'neighborhood_Crawfor', 'neighborhood_Edwards',
       'neighborhood_Gilbert', 'neighborhood_Greens',
       'neighborhood_GrnHill', 'neighborhood_IDOTRR',
       'neighborhood_Landmrk', 'neighborhood_MeadowV',
       'neighborhood_Mitchel', 'neighborhood_NAmes',
       'neighborhood_NPkVill', 'neighborhood_NWAmes',
       'neighborhood_NoRidge', 'neighborhood_NridgHt',
       'neighborhood_OldTown', 'neighborhood_SWISU',
       'neighborhood_Sawyer', 'neighborhood_SawyerW',
       'neighborhood_Somerst', 'neighborhood_StoneBr',
       'neighborhood_Timber', 'neighborhood_Veenker'], dtype=object)

In [15]:
# One-hot encode neighborhood for the training split. ``toarray`` yields a
# plain ndarray (``todense`` returns the discouraged np.matrix type); the
# original row index is kept so the frame can be aligned with X_train.
train_one_hot = pd.DataFrame(
    one_hot.transform(X_train[['neighborhood']]).toarray(),
    columns = one_hot.get_feature_names_out(),
    index = X_train.index
)

train_one_hot.head(1)

Unnamed: 0,neighborhood_Blmngtn,neighborhood_Blueste,neighborhood_BrDale,neighborhood_BrkSide,neighborhood_ClearCr,neighborhood_CollgCr,neighborhood_Crawfor,neighborhood_Edwards,neighborhood_Gilbert,neighborhood_Greens,...,neighborhood_NoRidge,neighborhood_NridgHt,neighborhood_OldTown,neighborhood_SWISU,neighborhood_Sawyer,neighborhood_SawyerW,neighborhood_Somerst,neighborhood_StoneBr,neighborhood_Timber,neighborhood_Veenker
459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Apply the encoder fitted on X_train to the held-out split. ``toarray``
# yields a plain ndarray (``todense`` returns the discouraged np.matrix
# type); the original row index is kept for alignment with X_test.
test_one_hot = pd.DataFrame(
    one_hot.transform(X_test[['neighborhood']]).toarray(),
    columns = one_hot.get_feature_names_out(),
    index = X_test.index
)

In [17]:
# Swap the raw neighborhood column for its one-hot columns (aligned on index).
train_without_neighborhood = X_train.drop(columns=['neighborhood'])
X_train = pd.concat((train_without_neighborhood, train_one_hot), axis=1)

X_train.head(1)

Unnamed: 0,paved_drive,gr_liv_area,overall_qual,garage_area,total_bsmt_sf,year_built,year_remod_add,mas_vnr_area,fireplaces,bsmtfin_sf_1,...,neighborhood_NoRidge,neighborhood_NridgHt,neighborhood_OldTown,neighborhood_SWISU,neighborhood_Sawyer,neighborhood_SawyerW,neighborhood_Somerst,neighborhood_StoneBr,neighborhood_Timber,neighborhood_Veenker
459,2,1551.0,5,240.0,1058.0,1938,1950,0.0,0,930.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Swap the raw neighborhood column for its one-hot columns (aligned on index).
test_without_neighborhood = X_test.drop(columns=['neighborhood'])
X_test = pd.concat((test_without_neighborhood, test_one_hot), axis=1)

X_test.head(1)

Unnamed: 0,paved_drive,gr_liv_area,overall_qual,garage_area,total_bsmt_sf,year_built,year_remod_add,mas_vnr_area,fireplaces,bsmtfin_sf_1,...,neighborhood_NoRidge,neighborhood_NridgHt,neighborhood_OldTown,neighborhood_SWISU,neighborhood_Sawyer,neighborhood_SawyerW,neighborhood_Somerst,neighborhood_StoneBr,neighborhood_Timber,neighborhood_Veenker
534,2,1728.0,5,504.0,1728.0,1963,1963,336.0,0,1332.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


### Modeling

In [19]:
# Fit a linear regression on the encoded training features.
mlr = LinearRegression()

mlr.fit(X_train, y_train)

# R-squared on the training split and on the held-out split, plus the intercept.
print('Train set R-squared value: ', mlr.score(X_train, y_train), '\n')
print('Test set R-squared value: ', mlr.score(X_test, y_test), '\n')
print('Intercept: ', mlr.intercept_, '\n')

# One row per feature with its fitted coefficient; the columns are labelled
# so the table reads on its own instead of showing the default 0 / 1 headers.
pd.DataFrame({'feature': X_train.columns, 'coefficient': mlr.coef_})

Train set R-squared value:  0.8752067233853532 

Test set R-squared value:  0.8838115691876284 

Intercept:  -1200899.254877638 



Unnamed: 0,0,1
0,paved_drive,-2172.320007
1,gr_liv_area,50.161779
2,overall_qual,13616.39287
3,garage_area,37.272308
4,total_bsmt_sf,21.955231
5,year_built,241.602907
6,year_remod_add,351.290569
7,mas_vnr_area,37.141319
8,fireplaces,6995.422674
9,bsmtfin_sf_1,24.748471


In [21]:
# Write each split to its own CSV; the row index is not written.
for split_frame, csv_path in (
    (X_train, '../datasets/X_train.csv'),
    (y_train, '../datasets/y_train.csv'),
    (X_test, '../datasets/X_test.csv'),
    (y_test, '../datasets/y_test.csv'),
):
    split_frame.to_csv(csv_path, index=False)