In [1]:
# Import
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from math import sqrt

from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

import sys
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

# Data Cleaning

In [3]:
# Read the test split from disk.
# Per the original author's note, only the train split carries sale prices.
test_df = pd.read_csv(filepath_or_buffer='datasets/test.csv')
test_df.head(n=5)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [4]:
#Remove spaces and capitalization from column headers

def caps_spaces(df):
    """Lower-case every column header of ``df`` and swap spaces for underscores.

    The frame is modified in place (its ``columns`` attribute is reassigned
    to the cleaned list of names); nothing is returned.
    """
    df.columns = [header.lower().replace(' ', '_') for header in df.columns]

# Normalise the test frame's headers in place; caps_spaces returns nothing,
# so the call is made purely for its effect on test_df.columns.
caps_spaces(test_df)
# caps_spaces(test_df)

In [5]:
#identify the columns that are missing values and those that are not
#Can we clean the ones that are missing? If not, should those rows be dropped?

# Per-column count of missing values, restricted to columns that have any
nulls = test_df.isnull().sum().loc[lambda counts: counts.values > 0]
nulls

lot_frontage      160
alley             820
mas_vnr_type        1
mas_vnr_area        1
bsmt_qual          25
bsmt_cond          25
bsmt_exposure      25
bsmtfin_type_1     25
bsmtfin_type_2     25
electrical          1
fireplace_qu      422
garage_type        44
garage_yr_blt      45
garage_finish      45
garage_qual        45
garage_cond        45
pool_qc           874
fence             706
misc_feature      837
dtype: int64

In [6]:
# Preview only the columns that contain at least one missing value
test_df.loc[:, nulls.index].head()

Unnamed: 0,lot_frontage,alley,mas_vnr_type,mas_vnr_area,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_type_2,electrical,fireplace_qu,garage_type,garage_yr_blt,garage_finish,garage_qual,garage_cond,pool_qc,fence,misc_feature
0,69.0,Grvl,,0.0,Fa,TA,No,Unf,Unf,FuseP,,Detchd,1910.0,Unf,Po,Po,,,
1,,,,0.0,Gd,TA,No,Unf,Unf,SBrkr,,Attchd,1977.0,Fin,TA,TA,,,
2,58.0,,,0.0,Gd,Gd,Av,GLQ,Unf,SBrkr,Gd,Attchd,2006.0,RFn,TA,TA,,,
3,60.0,,,0.0,TA,TA,No,Unf,Unf,SBrkr,,Detchd,1935.0,Unf,Fa,TA,,,
4,,,BrkFace,247.0,Gd,TA,No,BLQ,Unf,SBrkr,Gd,Attchd,1963.0,RFn,TA,TA,,,


In [7]:
# Gather every non-object column as the candidate modelling features
numeric_cols = [name for name, dtype in test_df.dtypes.items() if dtype != 'object']
numeric_df = test_df[numeric_cols]

In [8]:
#Impute missing numerical values with the mean

# numeric_df was sliced out of test_df, so numeric_df[col].fillna(..., inplace=True)
# is chained assignment: pandas may fill a temporary copy (and always does under
# copy-on-write), leaving the NaNs untouched while the global warning filter
# hides the SettingWithCopyWarning. Build a new frame instead; fillna with the
# Series of column means fills each column with its own mean.
numeric_df = numeric_df.fillna(numeric_df.mean())

numeric_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 38 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               878 non-null    int64  
 1   pid              878 non-null    int64  
 2   ms_subclass      878 non-null    int64  
 3   lot_frontage     878 non-null    float64
 4   lot_area         878 non-null    int64  
 5   overall_qual     878 non-null    int64  
 6   overall_cond     878 non-null    int64  
 7   year_built       878 non-null    int64  
 8   year_remod/add   878 non-null    int64  
 9   mas_vnr_area     878 non-null    float64
 10  bsmtfin_sf_1     878 non-null    int64  
 11  bsmtfin_sf_2     878 non-null    int64  
 12  bsmt_unf_sf      878 non-null    int64  
 13  total_bsmt_sf    878 non-null    int64  
 14  1st_flr_sf       878 non-null    int64  
 15  2nd_flr_sf       878 non-null    int64  
 16  low_qual_fin_sf  878 non-null    int64  
 17  gr_liv_area     

In [9]:
# Replace missing numeric basement ('bsmt') values with the column mean.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# test_df[i]: the in-place form operates on an intermediate object and is not
# guaranteed to reach test_df (it never does under pandas copy-on-write).
for i in test_df.columns:
    if 'bsmt' in i and test_df.dtypes[i] != 'object':
        test_df[i] = test_df[i].fillna(test_df[i].mean())
#         test_df[i].astype(float)

In [10]:
# Collect string features in case any are worth dummying
object_cols = [name for name, dtype in test_df.dtypes.items() if dtype == 'object']
object_df = test_df[object_cols]
# object_df.info()

In [11]:
## Cell for checking the spread of individual feature value_counts
# object_df['garage_type'].value_counts()
# plt.scatter(train_df['saleprice'], train_df['garage_type'])

In [12]:
# bldg_type, exter_cond, central_air and kitchen_qual have 0 nulls.
# Dummy them and append the indicator columns to numeric_df for more precise modeling.
numeric_df = pd.concat(
    objs=[
        numeric_df,
        pd.get_dummies(
            data=test_df.loc[:, ['bldg_type', 'exter_cond', 'central_air', 'kitchen_qual']],
            drop_first=True,
        ),
    ],
    axis='columns',
)
# numeric_df.head()

### Correct or clean columns that are not numerical
# Keep only the columns that DataFrame.describe() reports on.
# NOTE(review): train_df is not created in any cell visible here — confirm it is
# loaded before this line runs, otherwise a fresh kernel raises NameError.
train_df = train_df.loc[:, list(train_df.describe().columns)]

### Linear Regression

In [14]:
# Feature matrix: every column of numeric_df except the 'saleprice' target
features = list(numeric_df.columns[numeric_df.columns != 'saleprice'])
X = numeric_df.loc[:, features]

In [None]:
# Train/test split.
# Hold out 20% for testing: test_size=.8 left only a fifth of the rows for
# training. A fixed random_state keeps the split, and every score derived from
# it, reproducible across kernel restarts.
# NOTE(review): `y` is not assigned in any cell visible here — confirm the
# target vector exists before this cell runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# instantiate the estimator
lr = LinearRegression(n_jobs = -1)

# fit the estimator on the training split
model = lr.fit(X_train, y_train)

# perform 5-fold cross validation; cv is spelled out so the comment and the
# call cannot drift apart
scores = cross_val_score(lr, X, y, cv = 5)

# Labels fixed: previously printed 'Cross-val cores' and 'inteveral'
print('Cross-val scores:', scores)
print(f'Mean with confidence interval: {round(scores.mean(), 3)} +- {round(2 * scores.std(), 2)} \n')
print(f'R^2 train score: {lr.score(X_train, y_train)}')
print(f'R^2 test score: {lr.score(X_test, y_test)}')

### Ridge

In [None]:
#Define features
# features = [col for col in numeric_df.columns if col != 'saleprice']
# y = numeric_df['saleprice']

# fit standard scaler on the feature columns
ss = StandardScaler()
ss.fit(numeric_df[features], y)

# save the standardized values (numeric_df is rebuilt from the scaled array)
numeric_df = pd.DataFrame(ss.transform(numeric_df[features]), columns = features)
numeric_df['saleprice'] = y
X = numeric_df[features]

# Train/test split: hold out 20% (test_size=.8 trained on only a fifth of the
# rows) with a fixed seed so the scores below are reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# This is the Ridge section, so fit Ridge (default alpha) on the standardized
# features. The cell previously refit the LinearRegression instance `lr`, and
# the imported Ridge estimator was never used.
ridge = Ridge()
model = ridge.fit(X_train, y_train)

# perform 5-fold cross validation
scores = cross_val_score(ridge, X, y, cv = 5)

print('Cross-val scores:', scores)
print(f'R^2 train score: {ridge.score(X_train, y_train)}')
print(f'R^2 test score: {ridge.score(X_test, y_test)}')

### Ridge (Reduced Features)

In [None]:
# Which features are 'irrelevant'? Sweep Lasso over a grid of penalties:
# 200 alphas starting at 0.01 in steps of 0.05.
l_alphas = np.arange(start=0.01, stop=10.0, step=0.05)

# This is the same as the ridge coefficient by alpha calculator
def lasso_coefs(X, Y, alphas):
    """Fit a Lasso to (X, Y) once per alpha and collect the coefficients.

    Returns a list holding one ``coef_`` array per entry of ``alphas``, in
    the same order as ``alphas``.
    """
    return [Lasso(alpha=strength).fit(X, Y).coef_ for strength in alphas]

# Coefficient path from lasso_coefs: one row per alpha, one column per feature
l_coefs = lasso_coefs(X, y, l_alphas)

saleprice_coefs_df = pd.DataFrame(l_coefs, columns = features).assign(alpha = l_alphas)
saleprice_coefs_df.head(3)

In [None]:
# Features whose Lasso coefficient is zero in the last row of the coefficient
# path, i.e. at the largest alpha in l_alphas.
# Series.iteritems() was removed in pandas 2.0; Series.items() is the
# supported equivalent and yields the same (label, value) pairs.
columns_to_filter_out = [
    col
    for col, coef in saleprice_coefs_df.iloc[saleprice_coefs_df.index.max()].items()
    if not coef
]

# Everything the Lasso did not zero out survives as a reduced feature list
new_features = [col for col in features if col not in columns_to_filter_out]

columns_to_filter_out

In [None]:
# Fit the estimator on the reduced feature set so the train/test R^2 printed
# below describe the same model that is being cross-validated. The cell
# previously fitted and scored on every column of X_train / X_test.
model = lr.fit(X_train[new_features], y_train)

scores_2 = cross_val_score(lr,
                         numeric_df[new_features],
                         y,
                         cv = 5, n_jobs = -1)
print('Cross-val scores:', scores_2)
print(f'R^2 train score: {lr.score(X_train[new_features], y_train)}')
print(f'R^2 test score: {lr.score(X_test[new_features], y_test)}')

In [None]:
# Which alpha is best?

from sklearn.linear_model import LassoCV


# Cross-validated (10 folds) search for the Lasso penalty strength
lasso_cv = LassoCV(cv = 10)
lasso_cv.fit(X, y)

print('best alpha:', lasso_cv.alpha_)
print('score:', lasso_cv.score(X, y))

## Lasso

In [None]:
# Refit a Lasso at the alpha chosen by LassoCV, then keep the features whose
# coefficient is non-zero
new_lasso = Lasso(alpha = lasso_cv.alpha_)
new_lasso.fit(X, y)
model = new_lasso
final_columns = X.columns[new_lasso.coef_ != 0].tolist()

In [None]:
# Cross-validate the tuned Lasso on the features it kept
scores_3 = cross_val_score(new_lasso, X[final_columns], y,
                          cv = 5, n_jobs = -1)

# new_lasso was fitted on all of X, which contains the X_test rows, so scoring
# it on X_test is not a held-out estimate. Score a copy with the same alpha
# that has only seen the training split; new_lasso itself is left untouched.
holdout_lasso = Lasso(alpha = new_lasso.alpha).fit(X_train, y_train)

print('Cross-val scores:', scores_3)
print(f'R^2 train score: {holdout_lasso.score(X_train, y_train)}')
print(f'R^2 test score: {holdout_lasso.score(X_test, y_test)}')

In [None]:
# Cross-validate a plain linear regression on the Lasso-selected features
lr = LinearRegression(n_jobs = -1)

scores_3 = cross_val_score(lr, X[final_columns], y,
                          cv = 5, n_jobs = -1)

# Predict with `model` and key each prediction by the house's real `id` from
# test_df. The default 0..n-1 positional index was previously written out
# under the name 'ID', which does not identify any house.
# NOTE(review): assumes X rows are still in test_df row order — confirm no
# rows were dropped or shuffled upstream.
predictions = model.predict(X)
submission = pd.DataFrame({'SalePrice': predictions},
                          index = pd.Index(test_df['id'], name = 'ID'))
submission.to_csv('submission.csv')

## Final Submission

In [None]:
# Load the sample submission file, then show the test frame's row count
sample_sub = pd.read_csv(filepath_or_buffer='datasets/sample_sub_reg.csv')
test_df.shape[0]
# test_df.info()