# First Submission

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
from sklearn import metrics
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler, PowerTransformer, PolynomialFeatures

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
train = pd.read_csv('./datasets/train.csv')
test = pd.read_csv('./datasets/test.csv')

In [3]:
train.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [4]:
test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [5]:
train.shape

(2051, 81)

In [6]:
test.shape

(879, 80)

In [7]:
train.isnull().sum()

Id                   0
PID                  0
MS SubClass          0
MS Zoning            0
Lot Frontage       330
Lot Area             0
Street               0
Alley             1911
Lot Shape            0
Land Contour         0
Utilities            0
Lot Config           0
Land Slope           0
Neighborhood         0
Condition 1          0
Condition 2          0
Bldg Type            0
House Style          0
Overall Qual         0
Overall Cond         0
Year Built           0
Year Remod/Add       0
Roof Style           0
Roof Matl            0
Exterior 1st         0
Exterior 2nd         0
Mas Vnr Type        22
Mas Vnr Area        22
Exter Qual           0
Exter Cond           0
                  ... 
Half Bath            0
Bedroom AbvGr        0
Kitchen AbvGr        0
Kitchen Qual         0
TotRms AbvGrd        0
Functional           0
Fireplaces           0
Fireplace Qu      1000
Garage Type        113
Garage Yr Blt      114
Garage Finish      114
Garage Cars          1
Garage Area

In [8]:
features = ['Lot Area', 'Overall Qual', 'Overall Cond', 'Year Built',
            'Year Remod/Add', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
            'Gr Liv Area', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 
            'Kitchen AbvGr', 'TotRms AbvGrd', 'Wood Deck SF', 'Open Porch SF', 
            'Pool Area']

In [9]:
y = train['SalePrice']
X = train[features]

In [10]:
mlr = LinearRegression()

In [11]:
mlr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [12]:
predictions = mlr.predict(X)

In [13]:
metrics.mean_squared_error(y, predictions)

rmse = np.sqrt(metrics.mean_squared_error(y, predictions))
rmse

35629.58430639573

In [14]:
first_submission = pd.DataFrame()

first_submission['Id'] = test['Id']
first_submission['SalePrice'] = mlr.predict(test[features])

In [15]:
first_submission.head()

Unnamed: 0,Id,SalePrice
0,2658,140668.912393
1,2718,168352.685129
2,2414,209495.347031
3,1989,109410.302331
4,625,166893.197599


In [16]:
first_submission.to_csv('./datasets/first_submission.csv', index = False)

# Second Submission

In [17]:
train.corr().sort_values('SalePrice', ascending = False)

Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,...,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,SalePrice
SalePrice,-0.051398,-0.255052,-0.087335,0.341842,0.296566,0.800207,-0.097019,0.571849,0.55037,0.51223,...,0.32649,0.333476,-0.135656,0.048732,0.134581,0.023106,-0.007375,0.032735,-0.015203,1.0
Overall Qual,-0.061483,-0.265863,0.035763,0.194808,0.105824,1.0,-0.08277,0.602964,0.584654,0.438685,...,0.257081,0.308855,-0.154554,0.031938,0.048752,0.006558,0.022099,0.019242,-0.011578,0.800207
Gr Liv Area,-0.023881,-0.112936,0.06821,0.383856,0.327427,0.566701,-0.109804,0.258838,0.322407,0.387833,...,0.270239,0.345008,-0.007777,0.024504,0.101926,0.113034,0.113673,0.049644,-0.015891,0.697038
Garage Area,-0.045201,-0.200891,-0.108228,0.358457,0.263296,0.563814,-0.137917,0.487177,0.398999,0.384718,...,0.245513,0.247222,-0.112353,0.043918,0.071067,0.045876,0.033641,0.009964,-0.003589,0.65027
Garage Cars,-0.048666,-0.228368,-0.049148,0.297143,0.214954,0.587423,-0.168513,0.542544,0.441796,0.361211,...,0.240721,0.215364,-0.139608,0.028393,0.053582,0.021566,0.003053,0.020801,-0.009996,0.64822
Total Bsmt SF,-0.038346,-0.203795,-0.226038,0.356584,0.277175,0.548742,-0.158643,0.410254,0.308545,0.408575,...,0.224659,0.257227,-0.088055,0.060283,0.076061,0.061503,0.1256,0.003915,-0.00428,0.628925
1st Flr SF,-0.02265,-0.145862,-0.246212,0.4639,0.381593,0.477136,-0.150938,0.323315,0.24419,0.391349,...,0.226243,0.259072,-0.044032,0.066991,0.08914,0.129995,0.139496,0.027038,-0.007432,0.618486
Year Built,-0.064444,-0.347039,0.035983,0.109504,0.036002,0.602964,-0.370988,1.0,0.629116,0.329741,...,0.216339,0.207798,-0.380082,0.016104,-0.037866,0.003728,0.000626,-0.007083,-0.003559,0.571849
Year Remod/Add,-0.09004,-0.176666,0.044836,0.085052,0.050771,0.584654,0.042614,0.629116,1.0,0.211443,...,0.216271,0.264476,-0.237523,0.040416,-0.041211,-0.022382,-0.001704,0.011568,0.042744,0.55037
Full Bath,-0.059086,-0.17937,0.142087,0.152812,0.125601,0.51508,-0.219189,0.480169,0.471555,0.240034,...,0.175502,0.244452,-0.125383,0.02787,-0.014296,0.010458,0.003293,0.04939,0.00714,0.537969


In [18]:
features_2 = ['Overall Qual', 'Gr Liv Area', 'Garage Area', 'Garage Cars', 'Total Bsmt SF', '1st Flr SF',
              'Year Built', 'Year Remod/Add', 'Full Bath', 'Garage Yr Blt', 'Mas Vnr Area', 'TotRms AbvGrd']

In [19]:
train[features_2].dtypes

Overall Qual        int64
Gr Liv Area         int64
Garage Area       float64
Garage Cars       float64
Total Bsmt SF     float64
1st Flr SF          int64
Year Built          int64
Year Remod/Add      int64
Full Bath           int64
Garage Yr Blt     float64
Mas Vnr Area      float64
TotRms AbvGrd       int64
dtype: object

In [20]:
train['Garage Area'].fillna(0, inplace = True)
test['Garage Area'].fillna(0, inplace = True)

In [21]:
train['Garage Cars'].fillna(0, inplace = True)
test['Garage Cars'].fillna(0, inplace = True)

In [22]:
train['Garage Yr Blt'].fillna(0, inplace = True)
test['Garage Yr Blt'].fillna(0, inplace = True)

In [23]:
train['Mas Vnr Area'].fillna(0, inplace = True)
test['Mas Vnr Area'].fillna(0, inplace = True)

In [24]:
train['Total Bsmt SF'].fillna(0, inplace = True)
test['Total Bsmt SF'].fillna(0, inplace = True)

In [25]:
X_2 = train[features_2]

In [26]:
mlr.fit(X_2, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [27]:
predictions_2 = mlr.predict(X_2)

In [28]:
metrics.mean_squared_error(y, predictions_2)

1260387350.3800466

In [29]:
rmse2 = np.sqrt(metrics.mean_squared_error(y, predictions_2))
rmse2

35501.934459688906

In [30]:
second_submission = pd.DataFrame()

In [31]:
second_submission['Id'] = test['Id']
second_submission['SalePrice'] = mlr.predict(test[features_2])

In [32]:
second_submission.to_csv('./datasets/second_submission.csv', index = False)

# Third Submission

In [33]:
train_clean = pd.read_csv('./datasets/clean_train.csv')
test_clean = pd.read_csv('./datasets/clean_test.csv')

In [34]:
train_clean.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,...,fence,misc_val,mo_sold,yr_sold,sale_type,saleprice,total_baths,total_sf,age,neighborhood_qual
0,109,533352170,60,RL,0.0,13517,Pave,IR1,Lvl,AllPub,...,No Fence,0,3,2010,WD,130500,2.5,2204.0,34,1
1,544,531379050,60,RL,43.0,11492,Pave,IR1,Lvl,AllPub,...,No Fence,0,4,2009,WD,220000,3.5,3035.0,13,2
2,153,535304180,20,RL,68.0,7922,Pave,Reg,Lvl,AllPub,...,No Fence,0,1,2010,WD,109000,2.0,2114.0,57,1
3,318,916386060,60,RL,73.0,9802,Pave,Reg,Lvl,AllPub,...,No Fence,0,4,2010,WD,174000,2.5,1828.0,4,2
4,255,906425045,50,RL,82.0,14235,Pave,IR1,Lvl,AllPub,...,No Fence,0,3,2010,WD,138500,2.0,2121.0,110,2


In [35]:
y_3 = train_clean['saleprice']

In [36]:
train_clean.columns

Index(['id', 'pid', 'ms_subclass', 'ms_zoning', 'lot_frontage', 'lot_area',
       'street', 'lot_shape', 'land_contour', 'utilities', 'lot_config',
       'land_slope', 'neighborhood', 'condition_1', 'condition_2', 'bldg_type',
       'house_style', 'overall_qual', 'overall_cond', 'year_built',
       'year_remod/add', 'roof_style', 'roof_matl', 'exterior_1st',
       'exterior_2nd', 'mas_vnr_type', 'mas_vnr_area', 'exter_qual',
       'exter_cond', 'foundation', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure',
       'bsmtfin_type_1', 'bsmtfin_type_2', 'total_bsmt_sf', 'heating',
       'heating_qc', 'central_air', 'electrical', 'gr_liv_area',
       'bedroom_abvgr', 'kitchen_abvgr', 'kitchen_qual', 'totrms_abvgrd',
       'functional', 'fireplaces', 'fireplace_qu', 'garage_type',
       'garage_yr_blt', 'garage_finish', 'garage_area', 'garage_qual',
       'garage_cond', 'paved_drive', 'wood_deck_sf', 'open_porch_sf',
       'enclosed_porch', '3ssn_porch', 'screen_porch', 'pool_area', 'fe

In [37]:
train_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2025 entries, 0 to 2024
Data columns (total 71 columns):
id                   2025 non-null int64
pid                  2025 non-null int64
ms_subclass          2025 non-null int64
ms_zoning            2025 non-null object
lot_frontage         2025 non-null float64
lot_area             2025 non-null int64
street               2025 non-null object
lot_shape            2025 non-null object
land_contour         2025 non-null object
utilities            2025 non-null object
lot_config           2025 non-null object
land_slope           2025 non-null object
neighborhood         2025 non-null object
condition_1          2025 non-null object
condition_2          2025 non-null object
bldg_type            2025 non-null object
house_style          2025 non-null object
overall_qual         2025 non-null int64
overall_cond         2025 non-null int64
year_built           2025 non-null int64
year_remod/add       2025 non-null int64
roof_style        

In [38]:
features_3 = ['overall_qual', 'total_sf', 'gr_liv_area', 'exter_qual', 'kitchen_qual',
              'total_baths', 'total_bsmt_sf', 'garage_area', 'bsmt_qual', 'year_built', 'garage_finish',
              'totrms_abvgrd']

In [39]:
X_3 = train_clean[features_3]

In [40]:
mlr.fit(X_3, y_3)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [41]:
predictions_3 = mlr.predict(X_3)

In [42]:
metrics.mean_squared_error(y_3, predictions_3)

968866934.9668229

In [43]:
np.sqrt(metrics.mean_squared_error(y_3, predictions_3))

31126.627426800078

In [44]:
test.columns = test.columns.str.lower().str.replace(' ', '_')

In [45]:
third_submission = pd.DataFrame()
third_submission['Id'] = test_clean['id']
third_submission['SalePrice'] = mlr.predict(test_clean[features_3])

In [46]:
test.columns

Index(['id', 'pid', 'ms_subclass', 'ms_zoning', 'lot_frontage', 'lot_area',
       'street', 'alley', 'lot_shape', 'land_contour', 'utilities',
       'lot_config', 'land_slope', 'neighborhood', 'condition_1',
       'condition_2', 'bldg_type', 'house_style', 'overall_qual',
       'overall_cond', 'year_built', 'year_remod/add', 'roof_style',
       'roof_matl', 'exterior_1st', 'exterior_2nd', 'mas_vnr_type',
       'mas_vnr_area', 'exter_qual', 'exter_cond', 'foundation', 'bsmt_qual',
       'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_sf_1',
       'bsmtfin_type_2', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf',
       'heating', 'heating_qc', 'central_air', 'electrical', '1st_flr_sf',
       '2nd_flr_sf', 'low_qual_fin_sf', 'gr_liv_area', 'bsmt_full_bath',
       'bsmt_half_bath', 'full_bath', 'half_bath', 'bedroom_abvgr',
       'kitchen_abvgr', 'kitchen_qual', 'totrms_abvgrd', 'functional',
       'fireplaces', 'fireplace_qu', 'garage_type', 'garage_yr_blt',
       'g

In [47]:
third_submission.to_csv('./datasets/third_submission.csv', index = False)

In [48]:
third_submission.shape

(879, 2)

# Fourth Submission

In [49]:
ss = StandardScaler()

In [50]:
X_scaled = ss.fit_transform(X_3)

In [51]:
y_4 = train_clean[['saleprice']]

In [52]:
mlr.fit(X_scaled, y_4)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [53]:
predictions_4 = mlr.predict(X_scaled)

In [54]:
mse = metrics.mean_squared_error(y_4, predictions_4)
mse

968866934.966823

In [55]:
rmse = np.sqrt(mse)
rmse

31126.62742680008

In [56]:
fourth_submission = pd.DataFrame()
fourth_submission['Id'] = test_clean['id']
fourth_submission['SalePrice'] = mlr.predict(test_clean[features_3])

In [57]:
fourth_submission.to_csv('./datasets/fourth_submission.csv', index = False)

In [58]:
fourth_submission.shape

(879, 2)

# Fifth Submission

In [59]:
train_with_dummies = pd.read_csv('./datasets/train_with_dummies.csv')
test_with_dummies = pd.read_csv('./datasets/test_with_dummies.csv')

In [60]:
lr = LinearRegression()

In [61]:
X = train_with_dummies.drop('saleprice', axis = 1)
y = train_with_dummies['saleprice']

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [63]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [64]:
lr_predictions = lr.predict(X)

In [65]:
metrics.mean_squared_error(y, lr_predictions)

rmse = np.sqrt(metrics.mean_squared_error(y, lr_predictions))
rmse

22147.128002086658

In [66]:
fifth_submission = pd.DataFrame()
fifth_submission['Id'] = test_with_dummies['id']
fifth_submission['SalePrice'] = lr.predict(test_with_dummies)

In [67]:
fifth_submission.head()

Unnamed: 0,Id,SalePrice
0,2658,106626.445902
1,2718,294678.384508
2,2414,298278.852027
3,1989,209077.47367
4,625,416830.885721


In [68]:
fifth_submission.to_csv('./datasets/fifth_submission.csv', index = False)

# Sixth Submission

In [69]:
ridge = RidgeCV(cv = 5)

In [70]:
ridge.fit(X_train, y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=5, fit_intercept=True,
        gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [71]:
r_preds = ridge.predict(X)

In [72]:
metrics.mean_squared_error(y, r_preds)

rmse = np.sqrt(metrics.mean_squared_error(y, r_preds))
rmse

22245.796864139345

In [73]:
sixth_submission = pd.DataFrame()
sixth_submission['Id'] = test_with_dummies['id']
sixth_submission['SalePrice'] = ridge.predict(test_with_dummies)

In [74]:
sixth_submission.head()

Unnamed: 0,Id,SalePrice
0,2658,150642.186229
1,2718,172920.428484
2,2414,217063.256195
3,1989,143864.355742
4,625,211512.998027


In [75]:
sixth_submission.to_csv('./datasets/sixth_submission.csv', index = False)

# Seventh Submission

In [76]:
lasso = LassoCV(n_alphas = 500, cv = 5, random_state = 42)

In [77]:
lasso.fit(X_train, y_train)

LassoCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=500, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=42, selection='cyclic',
        tol=0.0001, verbose=False)

In [78]:
l_preds = lasso.predict(X)

In [79]:
metrics.mean_squared_error(y, l_preds)

rmse = np.sqrt(metrics.mean_squared_error(y, l_preds))
rmse

76581.94144216384

In [80]:
seventh_submission = pd.DataFrame()
seventh_submission['Id'] = test_with_dummies['id']
seventh_submission['SalePrice'] = lasso.predict(test_with_dummies)

In [81]:
seventh_submission.head()

Unnamed: 0,Id,SalePrice
0,2658,162169.055511
1,2718,161913.459965
2,2414,196232.100752
3,1989,162177.61218
4,625,195604.990787


In [82]:
seventh_submission.to_csv('./datasets/seventh_submission.csv', index = False)

# Eighth Submission

In [83]:
train = pd.read_csv('./datasets/train_with_dummies.csv')
test = pd.read_csv('./datasets/test_with_dummies.csv')

In [84]:
ss = StandardScaler()
lasso_scaled = LassoCV(n_alphas = 500, cv = 5, random_state = 42)

In [85]:
features = [x for x in train.columns if x not in ['saleprice', 'id', 'pid']]

In [86]:
X = train[features]
y = train['saleprice']

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3)

In [88]:
X.shape

(2025, 225)

In [89]:
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)
test_sc = ss.transform(test[features])

In [90]:
lasso_scaled.fit(X_train_sc, y_train)

LassoCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=500, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=42, selection='cyclic',
        tol=0.0001, verbose=False)

In [91]:
lasso_scaled.score(X_train_sc, y_train)

0.9332108842747361

In [92]:
lasso_scaled.score(X_test_sc, y_test)

0.8889504494831244

In [93]:
las_sc_preds = lasso_scaled.predict(X)

In [94]:
metrics.mean_squared_error(y, las_sc_preds)

rmse = np.sqrt(metrics.mean_squared_error(y, las_sc_preds))
rmse

131676260.3627188

In [95]:
eighth_submission = pd.DataFrame()
eighth_submission['Id'] = test['id']
eighth_submission['SalePrice'] = lasso_scaled.predict(test_sc)

In [96]:
eighth_submission.head()

Unnamed: 0,Id,SalePrice
0,2658,141975.909045
1,2718,167574.275732
2,2414,232936.701901
3,1989,115394.761106
4,625,183342.131131


In [97]:
eighth_submission.to_csv('./datasets/eighth_submission.csv', index = False)

# Ninth Submission

In [98]:
ridge_scaled = RidgeCV(alphas = np.logspace(0, 5, 200),cv = 5)

In [99]:
ridge_scaled.fit(X_train_sc, y_train)

RidgeCV(alphas=array([1.00000000e+00, 1.05956018e+00, 1.12266777e+00, 1.18953407e+00,
       1.26038293e+00, 1.33545156e+00, 1.41499130e+00, 1.49926843e+00,
       1.58856513e+00, 1.68318035e+00, 1.78343088e+00, 1.88965234e+00,
       2.00220037e+00, 2.12145178e+00, 2.24780583e+00, 2.38168555e+00,
       2.52353917e+00, 2.67384162e+00, 2.83309610e+00, 3.00183581e+00,
       3.18062569e+00, 3.37006433e+0...
       3.33129479e+04, 3.52970730e+04, 3.73993730e+04, 3.96268864e+04,
       4.19870708e+04, 4.44878283e+04, 4.71375313e+04, 4.99450512e+04,
       5.29197874e+04, 5.60716994e+04, 5.94113398e+04, 6.29498899e+04,
       6.66991966e+04, 7.06718127e+04, 7.48810386e+04, 7.93409667e+04,
       8.40665289e+04, 8.90735464e+04, 9.43787828e+04, 1.00000000e+05]),
        cv=5, fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
        store_cv_values=False)

In [100]:
ridge_scaled.score(X_train_sc, y_train)

0.9357242564676271

In [101]:
ridge_scaled.score(X_test_sc, y_test)

0.8876572546228901

In [102]:
ridge_sc_preds = ridge_scaled.predict(X)

In [103]:
metrics.mean_squared_error(y, ridge_sc_preds)

rmse = np.sqrt(metrics.mean_squared_error(y, ridge_sc_preds))
rmse

113650386.40913734

In [104]:
ninth_submission = pd.DataFrame()
ninth_submission['Id'] = test['id']
ninth_submission['SalePrice'] = ridge_scaled.predict(test_sc)

In [105]:
ninth_submission.to_csv('./datasets/ninth_submission.csv', index = False)

# Tenth Submission

In [106]:
X = train[features]
y = np.log(train['saleprice'])

In [107]:
lr = LinearRegression()

In [108]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [109]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [110]:
cross_val_score(lr, X_train, y_train, cv = 5).mean()

0.8987007766195765

In [111]:
cross_val_score(lr, X_test, y_test, cv = 5).mean()

0.8507960815732613

In [112]:
lr_preds = lr.predict(X)

In [113]:
metrics.mean_squared_error(y, lr_preds)

rmse = np.sqrt(metrics.mean_squared_error(y, lr_preds))
rmse

0.10881074060259519

In [114]:
tenth_submission = pd.DataFrame()
tenth_submission['Id'] = test['id']
tenth_submission['SalePrice'] = np.exp(lr.predict(test[features]))

In [115]:
tenth_submission.head()

Unnamed: 0,Id,SalePrice
0,2658,123550.045844
1,2718,162347.014343
2,2414,224250.525122
3,1989,96328.314983
4,625,174306.831047


In [116]:
tenth_submission.to_csv('./datasets/tenth_submission.csv', index = False)

# Eleventh Submission
## This was my highest scoring submission on Kaggle

In [117]:
ridge = RidgeCV(cv = 5)

In [118]:
ridge.fit(X_train, y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=5, fit_intercept=True,
        gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [119]:
ridge.score(X_train, y_train)

0.9315752037627588

In [120]:
ridge.score(X_test, y_test)

0.9167625109306345

In [121]:
ridge_preds = ridge.predict(X)

In [122]:
metrics.mean_squared_error(y, ridge_preds)

rmse = np.sqrt(metrics.mean_squared_error(y, ridge_preds))
rmse

0.11061836325825875

In [123]:
eleventh_submission = pd.DataFrame()
eleventh_submission['Id'] = test['id']
eleventh_submission['SalePrice'] = np.exp(ridge.predict(test[features]))

In [124]:
eleventh_submission.head()

Unnamed: 0,Id,SalePrice
0,2658,124896.247013
1,2718,163402.297115
2,2414,225726.873663
3,1989,99125.36802
4,625,172586.970699


In [125]:
eleventh_submission.to_csv('./datasets/eleventh_submission.csv', index = False)

# Twelfth Submission

In [126]:
lasso = LassoCV(n_alphas = 500, cv = 5, random_state = 42)

In [127]:
lasso.fit(X_train, y_train)

LassoCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=500, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=42, selection='cyclic',
        tol=0.0001, verbose=False)

In [128]:
lasso.score(X_train, y_train)

0.8205639498300938

In [129]:
lasso.score(X_test, y_test)

0.8053035665830293

In [130]:
lass_preds = lasso.predict(X)

In [131]:
metrics.mean_squared_error(y, lass_preds)

rmse = np.sqrt(metrics.mean_squared_error(y, lass_preds))
rmse

0.1761396947500164

In [132]:
twelfth_submission = pd.DataFrame()
twelfth_submission['Id'] = test['id']
twelfth_submission['SalePrice'] = np.exp(lasso.predict(test[features]))

In [133]:
twelfth_submission.head()

Unnamed: 0,Id,SalePrice
0,2658,148021.82921
1,2718,256112.722351
2,2414,177665.048269
3,1989,124152.541032
4,625,171839.961132


In [134]:
twelfth_submission.to_csv('./datasets/twelfth_submission.csv', index = False)

# Thirteenth Submission

In [135]:
ridge_sc = RidgeCV(cv = 5)
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)
test_sc = ss.transform(test[features])

In [136]:
ridge_sc.fit(X_train_sc, y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=5, fit_intercept=True,
        gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [137]:
ridge_sc.score(X_train_sc, y_train)

0.9383816554193642

In [138]:
ridge_sc.score(X_test_sc, y_test)

0.9079236625513272

In [139]:
ridge_sc_preds = ridge_sc.predict(X)

In [140]:
metrics.mean_squared_error(y, ridge_sc_preds)

rmse = np.sqrt(metrics.mean_squared_error(y, ridge_sc_preds))
rmse

577.7157344551489

In [141]:
thirteenth_submission = pd.DataFrame()
thirteenth_submission['Id'] = test['id']
thirteenth_submission['SalePrice'] = np.exp(ridge_sc.predict(test_sc))

In [142]:
thirteenth_submission.head()

Unnamed: 0,Id,SalePrice
0,2658,126006.300442
1,2718,162712.631143
2,2414,224394.064275
3,1989,96560.063878
4,625,173956.97205


In [143]:
thirteenth_submission.to_csv('./datasets/thirteenth_submission.csv', index = False)

# Fourteenth Submission

In [144]:
lasso_sc = LassoCV(n_alphas = 500, cv = 5, random_state = 42)

In [145]:
lasso_sc.fit(X_train_sc, y_train)

LassoCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=500, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=42, selection='cyclic',
        tol=0.0001, verbose=False)

In [146]:
lasso_sc.score(X_train_sc, y_train)

0.9308118716342961

In [147]:
lasso_sc.score(X_test_sc, y_test)

0.9133396350241196

In [148]:
lasso_sc_preds = lasso_sc.predict(X)

In [149]:
metrics.mean_squared_error(y, lasso_sc_preds)

rmse = np.sqrt(metrics.mean_squared_error(y, lasso_sc_preds))
rmse

652.488178928581

In [150]:
fourteenth_submission = pd.DataFrame()
fourteenth_submission['Id'] = test['id']
fourteenth_submission['SalePrice'] = np.exp(lasso_sc.predict(test_sc))

In [151]:
fourteenth_submission.head()

Unnamed: 0,Id,SalePrice
0,2658,139908.922331
1,2718,161513.941239
2,2414,221609.363924
3,1989,99127.359232
4,625,170321.915931


In [152]:
lasso_coefs = pd.DataFrame({'variable':X.columns,
                            'coef':lasso_sc.coef_,
                            'abs_coef':np.abs(lasso_sc.coef_)})

lasso_coefs.sort_values('abs_coef', inplace=True, ascending=False)

lasso_coefs.head(25)

print('Percent variables zeroed out:', np.sum((lasso_sc.coef_ == 0))/float(X.shape[1]))

Percent variables zeroed out: 0.5377777777777778


In [153]:
fourteenth_submission.to_csv('./datasets/fourteenth_submission.csv', index = False)

# Fifteenth Submission

In [154]:
features = ['total_sf', 'overall_qual', 'gr_liv_area', 'exter_qual',
            'overall_cond', 'kitchen_qual', 'age', 'mas_vnr_area',
            'sale_type_New', 'lot_area', 'total_baths']
X = train[features]
y = np.log(train['saleprice'])

In [155]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [156]:
X_tr_sc = ss.fit_transform(X_train)
X_te_sc = ss.transform(X_test)
test_sc = ss.transform(test[features])

In [157]:
lr = LinearRegression()

In [158]:
lr.fit(X_tr_sc, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [159]:
lr.score(X_tr_sc, y_train)

0.8738104897299719

In [160]:
lr.score(X_te_sc, y_test)

0.899626914037308

In [161]:
lrpreds = lr.predict(X)

In [162]:
metrics.mean_squared_error(y, lrpreds)

rmse = np.sqrt(metrics.mean_squared_error(y, lrpreds))
rmse

896.954790627455

In [163]:
fifteenth_submission = pd.DataFrame()
fifteenth_submission['Id'] = test['id']
fifteenth_submission['SalePrice'] = np.exp(lr.predict(test_sc))

In [164]:
fifteenth_submission.head()

Unnamed: 0,Id,SalePrice
0,2658,158885.277412
1,2718,175552.660704
2,2414,219213.196461
3,1989,105728.072401
4,625,155506.451061


In [165]:
fifteenth_submission.to_csv('./datasets/fifteenth_submission.csv', index = False)