In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV



In [32]:
df = pd.read_csv('../datasets/train_clean_features.csv')

In [33]:
df.head()

Unnamed: 0,Id,Lot Frontage,Lot Area,Lot Shape,Utilities,Land Slope,Overall Qual,Overall Cond,Year Built,Year Remod/Add,...,Mo Sold_7,Mo Sold_8,Mo Sold_9,Mo Sold_10,Mo Sold_11,Mo Sold_12,Yr Sold_2007,Yr Sold_2008,Yr Sold_2009,Yr Sold_2010
0,109,0.0,13517,3,4,3,6,8,1976,2005,...,0,0,0,0,0,0,0,0,0,1
1,544,43.0,11492,3,4,3,7,5,1996,1997,...,0,0,0,0,0,0,0,0,1,0
2,153,68.0,7922,4,4,3,5,7,1953,2007,...,0,0,0,0,0,0,0,0,0,1
3,318,73.0,9802,4,4,3,5,5,2006,2007,...,0,0,0,0,0,0,0,0,0,1
4,255,82.0,14235,3,4,3,6,8,1900,1993,...,0,0,0,0,0,0,0,0,0,1


In [34]:
# Drop Id column
df.drop(columns='Id', inplace=True)

In [35]:
# Set X and y
X = df.drop(columns=['SalePrice'])
y = df['SalePrice']

In [36]:
X.head()

Unnamed: 0,Lot Frontage,Lot Area,Lot Shape,Utilities,Land Slope,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,...,Mo Sold_7,Mo Sold_8,Mo Sold_9,Mo Sold_10,Mo Sold_11,Mo Sold_12,Yr Sold_2007,Yr Sold_2008,Yr Sold_2009,Yr Sold_2010
0,0.0,13517,3,4,3,6,8,1976,2005,289.0,...,0,0,0,0,0,0,0,0,0,1
1,43.0,11492,3,4,3,7,5,1996,1997,132.0,...,0,0,0,0,0,0,0,0,1,0
2,68.0,7922,4,4,3,5,7,1953,2007,0.0,...,0,0,0,0,0,0,0,0,0,1
3,73.0,9802,4,4,3,5,5,2006,2007,0.0,...,0,0,0,0,0,0,0,0,0,1
4,82.0,14235,3,4,3,6,8,1900,1993,0.0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
# Store the names of all the dummy variables here
dummy = ['MS SubClass_150',
 'MS SubClass_160',
 'MS SubClass_180',
 'MS SubClass_190',
 'MS SubClass_20',
 'MS SubClass_30',
 'MS SubClass_40',
 'MS SubClass_45',
 'MS SubClass_50',
 'MS SubClass_60',
 'MS SubClass_70',
 'MS SubClass_75',
 'MS SubClass_80',
 'MS SubClass_85',
 'MS SubClass_90',
 'MS Zoning_C (all)',
 'MS Zoning_FV',
 'MS Zoning_I (all)',
 'MS Zoning_RH',
 'MS Zoning_RL',
 'MS Zoning_RM',
 'Street_Pave',
 'Alley_None',
 'Alley_Pave',
 'Land Contour_HLS',
 'Land Contour_Low',
 'Land Contour_Lvl',
 'Lot Config_CulDSac',
 'Lot Config_FR2',
 'Lot Config_FR3',
 'Lot Config_Inside',
 'Neighborhood_Blueste',
 'Neighborhood_BrDale',
 'Neighborhood_BrkSide',
 'Neighborhood_ClearCr',
 'Neighborhood_CollgCr',
 'Neighborhood_Crawfor',
 'Neighborhood_Edwards',
 'Neighborhood_Gilbert',
 'Neighborhood_Greens',
 'Neighborhood_GrnHill',
 'Neighborhood_IDOTRR',
 'Neighborhood_Landmrk',
 'Neighborhood_MeadowV',
 'Neighborhood_Mitchel',
 'Neighborhood_NAmes',
 'Neighborhood_NPkVill',
 'Neighborhood_NWAmes',
 'Neighborhood_NoRidge',
 'Neighborhood_NridgHt',
 'Neighborhood_OldTown',
 'Neighborhood_SWISU',
 'Neighborhood_Sawyer',
 'Neighborhood_SawyerW',
 'Neighborhood_Somerst',
 'Neighborhood_StoneBr',
 'Neighborhood_Timber',
 'Neighborhood_Veenker',
 'Condition 1_Feedr',
 'Condition 1_Norm',
 'Condition 1_PosA',
 'Condition 1_PosN',
 'Condition 1_RRAe',
 'Condition 1_RRAn',
 'Condition 1_RRNe',
 'Condition 1_RRNn',
 'Condition 2_Feedr',
 'Condition 2_Norm',
 'Condition 2_PosA',
 'Condition 2_PosN',
 'Condition 2_RRAe',
 'Condition 2_RRAn',
 'Condition 2_RRNn',
 'Bldg Type_2fmCon',
 'Bldg Type_Duplex',
 'Bldg Type_Twnhs',
 'Bldg Type_TwnhsE',
 'House Style_1.5Unf',
 'House Style_1Story',
 'House Style_2.5Fin',
 'House Style_2.5Unf',
 'House Style_2Story',
 'House Style_SFoyer',
 'House Style_SLvl',
 'Roof Style_Gable',
 'Roof Style_Gambrel',
 'Roof Style_Hip',
 'Roof Style_Mansard',
 'Roof Style_Shed',
 'Roof Matl_Membran',
 'Roof Matl_Metal',
 'Roof Matl_Roll',
 'Roof Matl_Tar&Grv',
 'Roof Matl_WdShake',
 'Roof Matl_WdShngl',
 'Exterior 1st_AsphShn',
 'Exterior 1st_BrkComm',
 'Exterior 1st_BrkFace',
 'Exterior 1st_CBlock',
 'Exterior 1st_CemntBd',
 'Exterior 1st_HdBoard',
 'Exterior 1st_ImStucc',
 'Exterior 1st_MetalSd',
 'Exterior 1st_Plywood',
 'Exterior 1st_PreCast',
 'Exterior 1st_Stone',
 'Exterior 1st_Stucco',
 'Exterior 1st_VinylSd',
 'Exterior 1st_Wd Sdng',
 'Exterior 1st_WdShing',
 'Exterior 2nd_AsphShn',
 'Exterior 2nd_Brk Cmn',
 'Exterior 2nd_BrkFace',
 'Exterior 2nd_CBlock',
 'Exterior 2nd_CmentBd',
 'Exterior 2nd_HdBoard',
 'Exterior 2nd_ImStucc',
 'Exterior 2nd_MetalSd',
 'Exterior 2nd_Other',
 'Exterior 2nd_Plywood',
 'Exterior 2nd_PreCast',
 'Exterior 2nd_Stone',
 'Exterior 2nd_Stucco',
 'Exterior 2nd_VinylSd',
 'Exterior 2nd_Wd Sdng',
 'Exterior 2nd_Wd Shng',
 'Mas Vnr Type_BrkFace',
 'Mas Vnr Type_CBlock',
 'Mas Vnr Type_None',
 'Mas Vnr Type_Stone',
 'Foundation_CBlock',
 'Foundation_PConc',
 'Foundation_Slab',
 'Foundation_Stone',
 'Foundation_Wood',
 'Heating_GasA',
 'Heating_GasW',
 'Heating_Grav',
 'Heating_OthW',
 'Heating_Wall',
 'Central Air_Y',
 'Garage Type_Attchd',
 'Garage Type_Basment',
 'Garage Type_BuiltIn',
 'Garage Type_CarPort',
 'Garage Type_Detchd',
 'Garage Type_None',
 'Misc Feature_None',
 'Misc Feature_Othr',
 'Misc Feature_Shed',
 'Misc Feature_TenC',
 'Sale Type_CWD',
 'Sale Type_Con',
 'Sale Type_ConLD',
 'Sale Type_ConLI',
 'Sale Type_ConLw',
 'Sale Type_New',
 'Sale Type_Oth',
 'Sale Type_VWD',
 'Sale Type_WD ']


In [14]:
# Drop dummy variables to perform polynomial transformations
#X_dummy = X[dummy]
#X.drop(columns=list(dummy), inplace = True)

In [37]:
# #Generates the full polynomial feature table.  
poly = PolynomialFeatures(include_bias=False)
X_poly = poly.fit_transform(X)
X_poly.shape

(2047, 27260)

In [38]:
#Adds appropriate feature names to all polynomial features
X_poly = pd.DataFrame(X_poly,columns=poly.get_feature_names(X.columns))

#Generates list of poly feature correlations
X_poly_corrs = X_poly.corrwith(y)
#Shows features most highly correlated (positively) with sale price, poly features included
X_poly_corrs.sort_values(ascending=False).head(20)

Overall Qual Gr Liv Area       0.872889
Exter Qual Gr Liv Area         0.854272
Gr Liv Area Kitchen Qual       0.843372
Bsmt Qual Gr Liv Area          0.842613
Overall Qual 1st Flr SF        0.840775
Overall Qual Kitchen Qual      0.837906
Overall Qual Exter Qual        0.833346
Overall Qual^2                 0.830444
Overall Qual Total Bsmt SF     0.828246
Overall Qual Garage Area       0.825224
Overall Qual Garage Cars       0.822140
Total Bsmt SF Gr Liv Area      0.819940
Overall Qual Total Bath        0.818214
Overall Qual Bsmt Qual         0.815917
Overall Qual Year Built        0.810035
Overall Qual TotRms AbvGrd     0.809770
Gr Liv Area Garage Area        0.808308
Overall Qual Year Remod/Add    0.807846
Gr Liv Area Garage Cars        0.806506
Overall Qual                   0.803287
dtype: float64

In [18]:
#Shows features most highly correlated (negatively) with sale price, poly features included
X_poly_corrs.sort_values().head(5)

Year Built Age when sold   -0.576737
Lot Shape Age when sold    -0.576363
Land Slope Age when sold   -0.572704
Age when sold              -0.572310
Utilities Age when sold    -0.571855
dtype: float64

In [19]:
# Merge dummy variables back into X
#X = X.merge(X_dummy, left_index = True, right_index = True)

In [594]:
# Remove features with low correlation with sale price
features = list(abs(X.corrwith(y)).sort_values(ascending=False)[:-100].index)
X = X[features]


In [39]:
X.shape

(2047, 232)

In [40]:
# Perform Train test split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=13)

#Standardize training and test set
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [41]:
# Log transform Y_train and y_test
y_train_log = np.log(y_train)
y_test_log = np.log(y_test)

In [42]:
# Import models
from sklearn.linear_model import RidgeCV

In [43]:
# Set up a list of ridge alphas to check.
# np.logspace generates 100 values equally between 0 and 5,
# then converts them to alphas between 10^0 and 10^5.

r_alphas = np.logspace(0, 5, 100)

# Cross-validate over our list of ridge alphas.

ridge_cv = RidgeCV(alphas=r_alphas, scoring='r2', cv=5) # 5-fold CV

# Fit model using best ridge alpha!
ridge_cv.fit(X_train_sc, y_train_log);

# This code is from 4.02-lesson-regularization

In [44]:
# Here is the optimal value of alpha
ridge_cv.alpha_

298.364724028334

In [45]:
print('Training score: ', ridge_cv.score(X_train_sc, y_train_log))
print('Test score: ', ridge_cv.score(X_test_sc, y_test_log))

Training score:  0.9297310555948453
Test score:  0.9238775717553867


In [46]:
np.sqrt(mean_squared_error(np.exp(y_train_log), ridge_cv.predict(X_train_sc)))

197256.3498092859

In [47]:
np.sqrt(mean_squared_error(np.exp(y_test_log), ridge_cv.predict(X_test_sc)))

200563.38374070192

In [500]:
# Set up a list of Lasso alphas to check.

l_alphas = np.logspace(-3, 0, 100)

# Cross-validate over our list of Lasso alphas.

lasso_cv = LassoCV(alphas=l_alphas, cv=5, max_iter=50000)

# Fit model using best ridge alpha!

lasso_cv.fit(X_train_sc, y_train_log)

LassoCV(alphas=array([0.001     , 0.00107227, 0.00114976, 0.00123285, 0.00132194,
       0.00141747, 0.00151991, 0.00162975, 0.00174753, 0.00187382,
       0.00200923, 0.00215443, 0.00231013, 0.00247708, 0.00265609,
       0.00284804, 0.00305386, 0.00327455, 0.00351119, 0.00376494,
       0.00403702, 0.00432876, 0.00464159, 0.00497702, 0.0053367 ,
       0.00572237, 0.00613591, 0.00657933, 0.0070548 , 0.00756463,
       0.008...
       0.09326033, 0.1       , 0.10722672, 0.1149757 , 0.12328467,
       0.13219411, 0.14174742, 0.15199111, 0.16297508, 0.17475284,
       0.18738174, 0.2009233 , 0.21544347, 0.23101297, 0.24770764,
       0.26560878, 0.28480359, 0.30538555, 0.32745492, 0.35111917,
       0.37649358, 0.40370173, 0.43287613, 0.46415888, 0.49770236,
       0.53366992, 0.57223677, 0.61359073, 0.65793322, 0.70548023,
       0.75646333, 0.81113083, 0.869749  , 0.93260335, 1.        ]),
        cv=5, max_iter=50000)

In [501]:
pd.DataFrame(zip(X.columns, lasso_cv.coef_), columns=['Feature', 'Coef']).sort_values('Coef')

Unnamed: 0,Feature,Coef
90,Bldg Type_Twnhs,-0.013067
33,MS SubClass_30,-0.013020
93,Heating_Grav,-0.012967
60,MS Zoning_C (all),-0.012292
51,Neighborhood_OldTown,-0.010501
...,...,...
10,Year Remod/Add,0.027253
64,Functional,0.030506
19,BsmtFin SF 1,0.031849
0,Overall Qual,0.095568


In [None]:
# fitted attribute
pd.DataFrame({
    'var': X.columns,
    'coef val': lasso.coef_
}).set_index('var').sort_values('coef val', ascending=False).head(5)

In [None]:
# From Caroline today 
coef_df = pd.DataFrame({
    'var': X.columns,
    'coef val': lasso.coef_
})
coef_df[coef_df['coef val'] != 0]['var'].values

In [502]:
# Here is the optimal value of alpha
lasso_cv.alpha_

0.002848035868435802

In [503]:
print('Training score: ', lasso_cv.score(X_train_sc, y_train_log))
print('Test score: ', lasso_cv.score(X_test_sc, y_test_log))

Training score:  0.9161002718912096
Test score:  0.9222363204616989


# Retrain on full dataset

In [630]:
# Retrain the regression on the whole dataset
X_sc = ss.fit_transform(X)
y_log = np.log(y)

# Set up a list of ridge alphas to check.
# np.logspace generates 100 values equally between 0 and 5,
# then converts them to alphas between 10^0 and 10^5.

r_alphas = np.logspace(0, 5, 100)

# Cross-validate over our list of ridge alphas.

ridge_full = RidgeCV(alphas=r_alphas, scoring='r2', cv=5) # 5-fold CV

# Fit model using best ridge alpha!
ridge_full.fit(X_sc, y_log);

# This code is from 4.02-lesson-regularization

In [631]:
# Here is the optimal value of alpha
ridge_full.alpha_

265.6087782946687

In [541]:
# Get the R2 for the full model
print('Full score: ', ridge_full.score(X_sc, y_log))

Full score:  0.918529550436441


# LOAD TEST DATA

In [632]:
# Load the clean test data
test_clean = pd.read_csv('../datasets/test_clean.csv')
test_clean.head()

Unnamed: 0,Id,Lot Frontage,Lot Area,Lot Shape,Utilities,Land Slope,Overall Qual,Overall Cond,Year Built,Year Remod/Add,...,Misc Feature_TenC,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD
0,2658,69.0,9142,4,4,3,6,8,1910,1950,...,0,0,0,0,0,0,0,0,0,1
1,2718,0.0,9662,3,4,3,5,4,1977,1977,...,0,0,0,0,0,0,0,0,0,1
2,2414,58.0,17104,3,4,3,7,5,2006,2006,...,0,0,0,0,0,0,1,0,0,0
3,1989,60.0,8520,4,4,3,5,6,1923,2006,...,0,0,0,0,0,0,0,0,0,1
4,625,0.0,9500,3,4,3,6,5,1963,1963,...,0,0,0,0,0,0,0,0,0,1


In [633]:
# Drop the Id column for now
test_clean.drop(columns='Id', inplace=True)

In [634]:
# Separate the dummy columns
#test_dummy = test_clean[dummy]
#test_clean.drop(columns=list(dummy), inplace = True)

In [635]:
test_clean.shape

(878, 216)

In [546]:
#Generates the full polynomial feature table.  
poly = PolynomialFeatures(include_bias=False)
test_poly = poly.fit_transform(test_clean)

#Adds appropriate feature names to all polynomial features
test_poly = pd.DataFrame(test_poly,columns=poly.get_feature_names(testset.columns))

test_poly.shape


IndexError: index 116 is out of bounds for axis 0 with size 116

In [609]:
# Merge dummys back in to test_clean
#test_clean = testset.merge(test_dummy, left_index = True, right_index = True)
test_clean.shape

(878, 116)

In [608]:
# Subset the test data to only include features used in regression
test_clean = test_clean[features]

In [636]:
# Scale the test data
test_sc = ss.transform(test_clean)

# Predict sale price for the test data
predictions = ridge_full.predict(test_sc)

# Square the predictions to create predictions that make sense
predictions = np.exp(predictions)

In [637]:
# Read test data back in to get ID column
id_file = pd.read_csv('../datasets/test_clean.csv')
id_file['SalePrice'] = predictions

# Create submission data and save
submission = id_file[['Id','SalePrice']]
submission.to_csv('../datasets/ridge_216_full.csv',index=False)



In [638]:
# View submission data
submission.head()

Unnamed: 0,Id,SalePrice
0,2658,125496.40526
1,2718,161244.088443
2,2414,218451.226317
3,1989,98013.132888
4,625,171917.193723
