In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV


# Notebook-wide plotting defaults (seaborn theme, palette and context).
sns.set_theme(style="ticks")
sns.set_palette("Set1", 8, .75)
sns.set_context("talk", rc={"lines.linewidth": 2})
# Set the default figure size through rcParams instead of calling
# plt.figure(figsize=...): that call only creates (and displays) a single empty
# figure and has no effect on figures created by later cells. Done after the
# seaborn calls so the theme setup cannot override it.
plt.rcParams["figure.figsize"] = (12, 9)

<Figure size 864x648 with 0 Axes>

Let's load the cleaned train and test data.

In [2]:
# Read the cleaned training and test sets produced by the earlier cleaning step.
TRAIN_CSV = './datasets/df_train_cleaned.csv'
TEST_CSV = './datasets/df_test_cleaned.csv'
feature1_train, feature1_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

In [3]:
# Sanity check: (rows, columns) of the training frame as loaded.
feature1_train.shape

(2038, 47)

In [4]:
# Sanity check: (rows, columns) of the test frame as loaded.
feature1_test.shape

(878, 47)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) per training column.
feature1_train.describe()

Unnamed: 0,ms_subclass,lot_frontage,lot_area,lot_shape,utilities,neighborhood,overall_qual,overall_cond,year_remod/add,mas_vnr_area,...,bldg_type_1Fam,bldg_type_2fmCon,bldg_type_Duplex,bldg_type_Twnhs,bldg_type_TwnhsE,exterior_3rd,bsmt_qual_cond_merged,bsmtfin_type_merged,bsmtfin_sf_merged,garage_qual_cond_merged
count,2038.0,2038.0,2038.0,2038.0,2038.0,2038.0,2038.0,2038.0,2038.0,2038.0,...,2038.0,2038.0,2038.0,2038.0,2038.0,2038.0,2038.0,2038.0,2038.0,2038.0
mean,57.127085,68.963796,9992.582434,3.599117,3.998528,2.439647,6.091757,5.565751,1984.052993,96.120859,...,0.827772,0.022571,0.036801,0.033857,0.078999,2.752699,6.409715,4.827772,484.682532,5.610402
std,42.905372,21.261635,6615.808437,0.56466,0.049522,0.921056,1.406877,1.1071,21.0305,167.53568,...,0.377671,0.148568,0.188319,0.180905,0.269803,1.049899,1.337295,2.418281,474.843706,1.423324
min,20.0,21.0,1300.0,1.0,2.0,1.0,1.0,1.0,1950.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,60.0,7500.0,3.0,4.0,2.0,5.0,5.0,1964.0,0.0,...,1.0,0.0,0.0,0.0,0.0,2.0,6.0,2.0,0.0,6.0
50%,50.0,69.0552,9378.5,4.0,4.0,2.0,6.0,5.0,1993.0,0.0,...,1.0,0.0,0.0,0.0,0.0,2.0,7.0,5.0,453.0,6.0
75%,70.0,78.0,11452.75,4.0,4.0,3.0,7.0,6.0,2004.0,150.0,...,1.0,0.0,0.0,0.0,0.0,4.0,7.0,7.0,788.75,6.0
max,190.0,313.0,159000.0,4.0,4.0,4.0,10.0,9.0,2010.0,1600.0,...,1.0,1.0,1.0,1.0,1.0,4.0,9.0,11.0,5644.0,10.0


In [6]:
# Same summary for the test frame, for a side-by-side comparison with the
# training distribution.
feature1_test.describe()

Unnamed: 0,id,ms_subclass,lot_frontage,lot_area,lot_shape,utilities,neighborhood,overall_qual,overall_cond,year_remod/add,...,bldg_type_1Fam,bldg_type_2fmCon,bldg_type_Duplex,bldg_type_Twnhs,bldg_type_TwnhsE,exterior_3rd,bsmt_qual_cond_merged,bsmtfin_type_merged,bsmtfin_sf_merged,garage_qual_cond_merged
count,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,...,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0,878.0
mean,1444.749431,58.268793,69.545961,10307.03303,3.602506,3.998861,2.451025,6.050114,5.566059,1984.417995,...,0.824601,0.018223,0.038724,0.036446,0.082005,2.719818,6.35877,4.785877,495.224374,5.605923
std,850.838263,42.235407,21.279162,10002.674602,0.579264,0.033748,0.925623,1.369065,1.128903,20.450725,...,0.380524,0.133834,0.193047,0.187505,0.274528,1.044656,1.347318,2.380974,463.012119,1.378136
min,2.0,20.0,21.0,1477.0,1.0,3.0,1.0,2.0,1.0,1950.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,692.25,20.0,60.0,7297.25,3.0,4.0,2.0,5.0,5.0,1967.0,...,1.0,0.0,0.0,0.0,0.0,2.0,6.0,2.0,0.0,6.0
50%,1433.0,50.0,69.545961,9446.0,4.0,4.0,3.0,6.0,5.0,1992.0,...,1.0,0.0,0.0,0.0,0.0,2.0,6.0,5.0,474.0,6.0
75%,2197.5,70.0,78.0,11589.0,4.0,4.0,3.0,7.0,6.0,2003.0,...,1.0,0.0,0.0,0.0,0.0,4.0,7.0,7.0,822.0,6.0
max,2928.0,190.0,182.0,215245.0,4.0,4.0,4.0,10.0,9.0,2010.0,...,1.0,1.0,1.0,1.0,1.0,4.0,9.0,11.0,2288.0,9.0


In [11]:
# Separate the training frame into the target vector and the predictor matrix.
target_col = 'saleprice'
y_train = feature1_train[target_col]
x_train = feature1_train.drop(columns=[target_col])

# The test frame is used as-is: x_test is another name for the same object as
# feature1_test, not a copy.
# NOTE(review): the describe() output above shows an `id` column in the test
# frame that the training frame does not have -- confirm it is handled before
# the frame reaches the model.
x_test = feature1_test

In [12]:
# Report the dimensions of the predictor matrix, target vector and test frame.
for part in (x_train, y_train, x_test):
    print(part.shape)

(2038, 46)
(2038,)
(878, 47)


In [13]:
# Add an intercept column to the training predictors. The arguments spell out
# the statsmodels defaults: the column goes first and is skipped if a constant
# column is already present.
# NOTE(review): only x_train receives this column here -- confirm how the test
# matrix is expected to line up with it.
x_train = sm.add_constant(x_train, prepend=True, has_constant='skip')

In [14]:
# Standardise the design matrices: learn the per-column mean/std on the training
# data only, then apply that same transformation to the test data.
sc = StandardScaler()
z_train = sc.fit_transform(x_train)

# Line the test columns up with the training columns BY NAME before scaling.
# x_train carries a leading `const` column (added by sm.add_constant) while
# x_test carries `id` instead -- and also `saleprice` once the prediction cell
# has run, because x_test is the same object as feature1_test. Both frames
# happen to be 47 columns wide, so a purely positional transform scaled `id` as
# if it were `const`; newer scikit-learn releases also validate feature names
# in transform() and complain about the mismatch.
train_cols = x_train.columns
x_test_aligned = x_test.copy()  # never mutate the shared test frame
if 'const' in train_cols and 'const' not in x_test_aligned.columns:
    x_test_aligned['const'] = 1.0  # same constant value sm.add_constant inserts
absent_cols = train_cols.difference(x_test_aligned.columns)
if len(absent_cols) > 0:
    # Fail loudly rather than feeding the model a misaligned matrix.
    raise KeyError(f"x_test is missing training columns: {list(absent_cols)}")
# Selecting by train_cols drops extras (`id`, `saleprice`) and fixes the order.
z_test = sc.transform(x_test_aligned[train_cols])

In [15]:
# Candidate ridge penalties: N_ALPHAS values spaced evenly on a log scale
# from 10**ALPHA_MIN_EXP to 10**ALPHA_MAX_EXP.
ALPHA_MIN_EXP, ALPHA_MAX_EXP, N_ALPHAS = 0, 5, 100
r_alphas = np.logspace(ALPHA_MIN_EXP, ALPHA_MAX_EXP, num=N_ALPHAS)

In [16]:
# Ridge regression whose penalty is chosen from `r_alphas` by 5-fold
# cross-validation, scored on R^2.
ridge_cv = RidgeCV(
    alphas=r_alphas,
    cv=5,
    scoring='r2',
)

In [17]:
# Fit on the standardised training matrix; the cell displays the fitted estimator.
ridge_cv.fit(X=z_train, y=y_train)

RidgeCV(alphas=array([1.00000000e+00, 1.12332403e+00, 1.26185688e+00, 1.41747416e+00,
       1.59228279e+00, 1.78864953e+00, 2.00923300e+00, 2.25701972e+00,
       2.53536449e+00, 2.84803587e+00, 3.19926714e+00, 3.59381366e+00,
       4.03701726e+00, 4.53487851e+00, 5.09413801e+00, 5.72236766e+00,
       6.42807312e+00, 7.22080902e+00, 8.11130831e+00, 9.11162756e+00,
       1.02353102e+01, 1.14975700e+0...
       6.89261210e+03, 7.74263683e+03, 8.69749003e+03, 9.77009957e+03,
       1.09749877e+04, 1.23284674e+04, 1.38488637e+04, 1.55567614e+04,
       1.74752840e+04, 1.96304065e+04, 2.20513074e+04, 2.47707636e+04,
       2.78255940e+04, 3.12571585e+04, 3.51119173e+04, 3.94420606e+04,
       4.43062146e+04, 4.97702356e+04, 5.59081018e+04, 6.28029144e+04,
       7.05480231e+04, 7.92482898e+04, 8.90215085e+04, 1.00000000e+05]),
        cv=5, scoring='r2')

In [18]:
# In-sample predictions on the standardised training matrix.
ridge_predict = ridge_cv.predict(X=z_train)

In [19]:
# Predicted sale prices for the standardised test matrix.
ridge_predict_test = ridge_cv.predict(X=z_test)

In [20]:
# Store the predictions on the test frame as a new `saleprice` column.
# NOTE(review): this mutates feature1_test in place, and x_test was bound to the
# same object in In [11], so x_test gains this column as well. Re-running the
# scaling cell after this one will therefore see a different set of columns.
feature1_test['saleprice'] = ridge_predict_test

In [21]:
# Inspect the test frame; the `saleprice` column shown here holds the model's
# predictions assigned in the previous cell, not observed prices.
feature1_test

Unnamed: 0,id,ms_subclass,lot_frontage,lot_area,lot_shape,utilities,neighborhood,overall_qual,overall_cond,year_remod/add,...,bldg_type_2fmCon,bldg_type_Duplex,bldg_type_Twnhs,bldg_type_TwnhsE,exterior_3rd,bsmt_qual_cond_merged,bsmtfin_type_merged,bsmtfin_sf_merged,garage_qual_cond_merged,saleprice
0,2658,190,69.000000,9142,4,4,1,6,8,1950,...,1,0,0,0,0,5,2,0,2,131444.662057
1,2718,90,69.545961,9662,3,4,2,5,4,1977,...,0,1,0,0,2,7,2,0,6,144703.071442
2,2414,60,58.000000,17104,3,4,3,7,5,2006,...,0,0,0,0,4,8,7,554,6,230631.927656
3,1989,30,60.000000,8520,4,4,1,5,6,2006,...,0,0,0,0,2,6,2,0,5,118261.715155
4,625,20,69.545961,9500,3,4,2,6,5,1963,...,0,0,0,0,2,7,5,609,6,178201.903247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,1662,60,80.000000,8000,4,4,3,6,6,1974,...,0,0,0,0,2,6,7,1084,6,200611.569243
874,1234,60,90.000000,14670,4,4,2,6,7,1999,...,0,0,0,0,4,6,5,575,6,227097.682724
875,1373,20,55.000000,8250,4,4,2,5,5,1968,...,0,0,0,0,2,6,6,742,6,131082.812506
876,1672,20,60.000000,9000,4,4,2,4,6,1971,...,0,0,0,0,2,6,6,616,6,107544.231695


In [22]:
# Keep only the identifier and the predicted price for the submission frame.
submission_cols = ['id', 'saleprice']
ridge_submission = feature1_test[submission_cols]

In [23]:
# Preview the two-column (id, saleprice) submission frame.
ridge_submission

Unnamed: 0,id,saleprice
0,2658,131444.662057
1,2718,144703.071442
2,2414,230631.927656
3,1989,118261.715155
4,625,178201.903247
...,...,...
873,1662,200611.569243
874,1234,227097.682724
875,1373,131082.812506
876,1672,107544.231695
