In [88]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.feature_selection import SelectKBest, SelectPercentile, SelectFromModel, mutual_info_regression, f_regression

In [3]:
train = pd.read_csv('./train.csv')

In [4]:
test = pd.read_csv('./test.csv')

In [5]:
train.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,,,,0,3,2010,WD,Normal,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,,,,0,4,2009,WD,Normal,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,,,,0,1,2010,WD,Abnorml,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,,,,0,3,2010,WD,Normal,138500


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 82 columns):
Id                 2051 non-null int64
PID                2051 non-null int64
MS SubClass        2051 non-null int64
MS Zoning          2051 non-null object
Lot Frontage       1721 non-null float64
Lot Area           2051 non-null int64
Street             2051 non-null object
Alley              140 non-null object
Lot Shape          2051 non-null object
Land Contour       2051 non-null object
Utilities          2051 non-null object
Lot Config         2051 non-null object
Land Slope         2051 non-null object
Neighborhood       2051 non-null object
Condition 1        2051 non-null object
Condition 2        2051 non-null object
Bldg Type          2051 non-null object
House Style        2051 non-null object
Overall Qual       2051 non-null int64
Overall Cond       2051 non-null int64
Year Built         2051 non-null int64
Year Remod/Add     2051 non-null int64
Roof Style         20

In [7]:
test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 80 columns):
Id                 879 non-null int64
PID                879 non-null int64
MS SubClass        879 non-null int64
MS Zoning          879 non-null object
Lot Frontage       719 non-null float64
Lot Area           879 non-null int64
Street             879 non-null object
Alley              58 non-null object
Lot Shape          879 non-null object
Land Contour       879 non-null object
Utilities          879 non-null object
Lot Config         879 non-null object
Land Slope         879 non-null object
Neighborhood       879 non-null object
Condition 1        879 non-null object
Condition 2        879 non-null object
Bldg Type          879 non-null object
House Style        879 non-null object
Overall Qual       879 non-null int64
Overall Cond       879 non-null int64
Year Built         879 non-null int64
Year Remod/Add     879 non-null int64
Roof Style         879 non-null object
Roof M

In [9]:
#checking our target variable
train['SalePrice'].describe()

count      2051.000000
mean     181469.701609
std       79258.659352
min       12789.000000
25%      129825.000000
50%      162500.000000
75%      214000.000000
max      611657.000000
Name: SalePrice, dtype: float64

In [10]:
#most of the data transformations were done in the other notebook first
#other than switching which target variable, nothing else should be different
#some commentary may not be copied - check classification notebook for that
#and some verification steps may be skipped, as I'm presuming everything still
#works the same as it does in the other notebook, saving me some steps
train.drop('Sale Condition', axis=1, inplace=True)
#don't need this anymore and Sale Price needs no further transformation

In [11]:
train.drop(['Id', 'PID'], axis = 1, inplace=True)
test.drop(['Id', 'PID'], axis = 1, inplace=True)
print(train.shape)
print(test.shape)

(2051, 79)
(879, 78)


In [12]:
def dummy_var(column):
    if train[column].dtype != str:
        train[column] = train[column].astype(str)
        test[column] = test[column].astype(str)
    #in case categorical data is not a str, as MS SubClass is
    dummy = pd.concat([train[column], test[column]])
    # concats train and test columns
    dummy_df = pd.DataFrame(dummy, columns=[column])
    # keeps column name
    dummy_df = pd.get_dummies(dummy_df)
    #gets dummy variables using pandas
    for dummycol in dummy_df.columns:
        #for some reason, still getting NaN dummy variables despite that being defaulted to false
        if 'nan' not in dummycol:
            train[dummycol] = dummy_df[dummycol].iloc[0:len(train)]
            test[dummycol] = dummy_df[dummycol].iloc[len(train):(len(dummy_df))]
        else:
            dummy_df.drop(dummycol, axis=1, inplace=True)
    train.drop(column, axis=1, inplace=True)
    test.drop(column, axis=1, inplace=True)
    #drops the original variables, as they're no longer needed for analysis
    print("Dummied " + column)
    return dummy_df
    #prints as a double-check, then returns
MSSub_dummy = dummy_var('MS SubClass')
#using MS SubClass as an example, as it's the first entry in the dataset and is categorical

Dummied MS SubClass


In [13]:
print(train.shape)
print(test.shape)
train.head()

(2051, 94)
(879, 93)


Unnamed: 0,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,...,MS SubClass_30,MS SubClass_40,MS SubClass_45,MS SubClass_50,MS SubClass_60,MS SubClass_70,MS SubClass_75,MS SubClass_80,MS SubClass_85,MS SubClass_90
0,RL,,13517,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,...,0,0,0,0,1,0,0,0,0,0
1,RL,43.0,11492,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,...,0,0,0,0,1,0,0,0,0,0
2,RL,68.0,7922,Pave,,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,0,0,0,0,0
3,RL,73.0,9802,Pave,,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,1,0,0,0,0,0
4,RL,82.0,14235,Pave,,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,1,0,0,0,0,0,0


In [14]:
test.head()

Unnamed: 0,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,...,MS SubClass_30,MS SubClass_40,MS SubClass_45,MS SubClass_50,MS SubClass_60,MS SubClass_70,MS SubClass_75,MS SubClass_80,MS SubClass_85,MS SubClass_90
0,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,0,0,0,0,0
1,RL,,9662,Pave,,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,0,0,0,0,1
2,RL,58.0,17104,Pave,,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,1,0,0,0,0,0
3,RM,60.0,8520,Pave,,Reg,Lvl,AllPub,Inside,Gtl,...,1,0,0,0,0,0,0,0,0,0
4,RL,,9500,Pave,,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,0,0,0,0,0


In [15]:
categories = ['MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config',
             'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style',
             'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual', 
             'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1',
             'BsmtFin Type 2', 'Heating', 'Heating QC', 'Central Air', 'Electrical', 'Kitchen Qual',
             'Functional', 'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual', 'Garage Cond',
             'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature', 'Sale Type']
#dummy them all out
for c in categories:
    dummy_var(c)
print(train.shape)
print(test.shape)

Dummied MS Zoning
Dummied Street
Dummied Alley
Dummied Lot Shape
Dummied Land Contour
Dummied Utilities
Dummied Lot Config
Dummied Land Slope
Dummied Neighborhood
Dummied Condition 1
Dummied Condition 2
Dummied Bldg Type
Dummied House Style
Dummied Roof Style
Dummied Roof Matl
Dummied Exterior 1st
Dummied Exterior 2nd
Dummied Mas Vnr Type
Dummied Exter Qual
Dummied Exter Cond
Dummied Foundation
Dummied Bsmt Qual
Dummied Bsmt Cond
Dummied Bsmt Exposure
Dummied BsmtFin Type 1
Dummied BsmtFin Type 2
Dummied Heating
Dummied Heating QC
Dummied Central Air
Dummied Electrical
Dummied Kitchen Qual
Dummied Functional
Dummied Fireplace Qu
Dummied Garage Type
Dummied Garage Finish
Dummied Garage Qual
Dummied Garage Cond
Dummied Paved Drive
Dummied Pool QC
Dummied Fence
Dummied Misc Feature
Dummied Sale Type
(2051, 314)
(879, 313)


In [16]:
#copying null replacements from other notebook
col = ['BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Bsmt Full Bath', 'Bsmt Half Bath']
for c in col:
    train.loc[1327, c] = 0

In [17]:
train.loc[616, 'Bsmt Full Bath'] = 0
train.loc[616, 'Bsmt Half Bath'] = 0

In [18]:
basement = ['BsmtFin SF 1','BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Bsmt Full Bath', 'Bsmt Half Bath']
for b in basement:
    train[basement] = train[basement].astype(int)
    test[basement] = test[basement].astype(int)

In [19]:
train.loc[1699, 'Garage Yr Blt'] = 2007

In [20]:
train.loc[1712, 'Garage Cars'] = 0
train.loc[1712, 'Garage Area'] = 0

In [21]:
train['Garage Cars'] = train['Garage Cars'].astype(int)
train['Garage Area'] = train['Garage Area'].astype(int)
test['Garage Cars'] = test['Garage Cars'].astype(int)
test['Garage Area'] = test['Garage Area'].astype(int)

In [22]:
for l in range(len(train)):
    if train['Garage Yr Blt'].isnull()[l]:
        train.loc[l, 'Garage Yr Blt'] = train.loc[l, 'Year Built']
for l in range(len(test)):
    if test['Garage Yr Blt'].isnull()[l]:
        test.loc[l, 'Garage Yr Blt'] = test.loc[l, 'Year Built']
train['Garage Yr Blt'] = train['Garage Yr Blt'].astype(int)
test['Garage Yr Blt'] = test['Garage Yr Blt'].astype(int)

In [23]:
for l in range(len(train)):
    if train['Lot Frontage'].isnull()[l]:
        train.loc[l, 'Lot Frontage'] = round(train.loc[l, 'Lot Area'] / 140)
for l in range(len(test)):
    if test['Lot Frontage'].isnull()[l]:
        test.loc[l, 'Lot Frontage'] = round(test.loc[l, 'Lot Area'] / 140)
train['Lot Frontage'] = train['Lot Frontage'].astype(int)
test['Lot Frontage'] = test['Lot Frontage'].astype(int)

In [24]:
train['Mas Vnr Area'].fillna(0, inplace = True)
test['Mas Vnr Area'].fillna(0, inplace = True)
#double checking nulls
train_has_nulls = {}
test_has_nulls = {}
for c in train.columns:
    if train[c].isnull().sum() > 0:
        train_has_nulls[c] = train[c].isnull().sum()
for c in test.columns:
    if test[c].isnull().sum() > 0:
        test_has_nulls[c] = test[c].isnull().sum()
print(train_has_nulls)
print(test_has_nulls)

{}
{}


In [25]:
train['Mas Vnr Area'] = train['Mas Vnr Area'].astype(int)
test['Mas Vnr Area'] = test['Mas Vnr Area'].astype(int)
train.drop('Mas Vnr Type_None', axis=1, inplace=True)
test.drop('Mas Vnr Type_None', axis=1, inplace=True)

In [26]:
#double checking everything
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Columns: 313 entries, Lot Frontage to Sale Type_WD 
dtypes: int64(36), uint8(277)
memory usage: 1.1 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Columns: 312 entries, Lot Frontage to Sale Type_WD 
dtypes: int64(35), uint8(277)
memory usage: 478.2 KB


In [27]:
#last bit of set up, then things should be different between the two
X = train.copy().drop('SalePrice', axis=1)
y = train['SalePrice']
X_train, X_hold, y_train, y_hold = train_test_split(X, y, random_state=777)
X_test = test.copy()

ss = StandardScaler()
ss.fit(X_train)
X_train_scaled = ss.transform(X_train)
X_hold_scaled = ss.transform(X_hold)
X_test_scaled = ss.transform(X_test)

In [28]:
#importing regressions and regularizations
lr = LinearRegression()
lasso = Lasso()
ridge = Ridge()
elasnet = ElasticNet()

In [86]:
#for giggles, I suppose we can start with a basic linear regression,
#even though I know it will be terrible with this many variables
lr.fit(X_train_scaled, y_train)
lr.score(X_hold_scaled, y_hold)

-8.2724036163552254e+22

In [110]:
#like I thought - so let's start making improvements with grid search and regularization, shall we?
params = {'alpha': np.arange(.001, .145, .012)}
l_gs = GridSearchCV(lasso, param_grid=params)
l_gs.fit(X_train, y_train)
print(l_gs.best_score_)
l_gs.best_params_





0.81099568721




{'alpha': 0.037000000000000005}

In [111]:
#this time avoiding the convergence warnings
print(l_gs.best_score_)
l_gs.best_params_

0.81099568721


{'alpha': 0.037000000000000005}

In [112]:
lasso_model = Lasso(alpha=l_gs.best_params_['alpha'])
lasso_model.fit(X_train_scaled, y_train)



Lasso(alpha=0.037000000000000005, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=False, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [92]:
lasso_model.score(X_hold_scaled, y_hold)

0.91048947966434302

In [93]:
#much better. Ridge?
params = {'alpha': np.logspace(0, 5, 200)}
r_gs = GridSearchCV(ridge, param_grid=params)
r_gs.fit(X_train, y_train)
print(r_gs.best_score_)
r_gs.best_params_

0.841019677368


{'alpha': 5.0526310653356807}

In [94]:
ridge_model = Ridge(alpha=r_gs.best_params_['alpha'])
ridge_model.fit(X_train_scaled, y_train)

Ridge(alpha=5.0526310653356807, copy_X=True, fit_intercept=True,
   max_iter=None, normalize=False, random_state=None, solver='auto',
   tol=0.001)

In [95]:
ridge_model.score(X_hold_scaled, y_hold)

0.91594899933439922

In [96]:
#let's try Elastic Net next
params = {'alpha': np.arange(.01, 1.0, .033), 'l1_ratio': [.1, .3, .5, .7, .9, .95, .99, 1]}
en_gs = GridSearchCV(elasnet, param_grid=params)
en_gs.fit(X_train, y_train)
print(en_gs.best_score_)
en_gs.best_params_













































0.841037044044




{'alpha': 0.505, 'l1_ratio': 0.99}

In [97]:
#again, without convergence warnings
print(en_gs.best_score_)
en_gs.best_params_

0.841037044044


{'alpha': 0.505, 'l1_ratio': 0.99}

In [98]:
elastic_model = ElasticNet(alpha=en_gs.best_params_['alpha'], l1_ratio=en_gs.best_params_['l1_ratio'])
elastic_model.fit(X_train_scaled, y_train)

ElasticNet(alpha=0.505, copy_X=True, fit_intercept=True, l1_ratio=0.99,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [99]:
elastic_model.score(X_hold_scaled, y_hold)

0.91685267008751314

In [48]:
predictions = pd.read_csv('./test.csv')

In [142]:
lasso_pred = pd.DataFrame(predictions['Id'], columns=['Id'])
lasso_pred['SalePrice'] = lasso_model.predict(X_test_scaled)
lasso_pred.head()

Unnamed: 0,Id,SalePrice
0,2658,131114.652722
1,2718,154385.666337
2,2414,213755.099615
3,1989,106538.665623
4,625,167951.586119


In [143]:
ridge_pred = pd.DataFrame(predictions['Id'], columns=['Id'])
ridge_pred['SalePrice'] = ridge_model.predict(X_test_scaled)
ridge_pred.head()

Unnamed: 0,Id,SalePrice
0,2658,129312.132784
1,2718,154347.325733
2,2414,214060.872274
3,1989,106580.147615
4,625,168310.147231


In [144]:
elastic_pred = pd.DataFrame(predictions['Id'], columns=['Id'])
elastic_pred['SalePrice'] = elastic_model.predict(X_test_scaled)
elastic_pred.head()

Unnamed: 0,Id,SalePrice
0,2658,128614.673989
1,2718,154401.226113
2,2414,214183.333721
3,1989,106601.6972
4,625,168516.606552


In [145]:
lasso_pred.to_csv('lasso-model.csv', index=False)
ridge_pred.to_csv('ridge-model.csv', index=False)
elastic_pred.to_csv('elastic-model.csv', index=False)

In [43]:
#now let's try with feature selection! We'll start with Select K Best
#grid score doesn't work with SKB, so we'll try good old fashioned iteration
k_score = {}
for k in range(1, len(X_train.columns)):
    skb = SelectKBest(score_func = f_regression, k = k)
    skb.fit(X_train_scaled, y_train)
    lr.fit(skb.transform(X_train_scaled), y_train)
    k_score[k] = lr.score(skb.transform(X_hold_scaled), y_hold)
max(k_score.values())

  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


0.88934653391632545

In [44]:
max(k_score, key=k_score.get)

149

In [51]:
skb = SelectKBest(score_func = f_regression, k = 149)
skb.fit(X_train_scaled, y_train)
lr.fit(skb.transform(X_train_scaled), y_train)
skb_pred = pd.DataFrame(predictions['Id'], columns=['Id'])
skb_pred['SalePrice'] = lr.predict(skb.transform(X_test_scaled))
skb_pred.head()

  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Unnamed: 0,Id,SalePrice
0,2658,163647.611705
1,2718,143228.302792
2,2414,223306.209393
3,1989,114771.671964
4,625,179650.947813


In [52]:
skb_pred.to_csv('skb-model.csv', index=False)

In [53]:
#previous model scored incredibly poorly on Kaggle, and ~150 features seems like
#a lot, so instead, I decided to try again, just picking a more reasonable value
#30 is roughly 10% of all variables, so that seems a reasonable decision
skb = SelectKBest(score_func = f_regression, k = 30)
skb.fit(X_train_scaled, y_train)
lr.fit(skb.transform(X_train_scaled), y_train)
lr.score(skb.transform(X_hold_scaled), y_hold)

  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


0.8595606749434338

In [54]:
skb30_pred = pd.DataFrame(predictions['Id'], columns=['Id'])
skb30_pred['SalePrice'] = lr.predict(skb.transform(X_test_scaled))
skb30_pred.head()

Unnamed: 0,Id,SalePrice
0,2658,147508.050881
1,2718,173415.304724
2,2414,216337.924508
3,1989,124800.199671
4,625,177779.713074


In [55]:
skb30_pred.to_csv('skb-model-2.csv', index=False)
#this scored way better - is it an overfitting problem?

In [61]:
#next, select percentile
SP = SelectPercentile(score_func=mutual_info_regression,percentile = 0.1)
SP.fit(X_train_scaled, y_train)



SelectPercentile(percentile=0.1,
         score_func=<function mutual_info_regression at 0x110743950>)

In [79]:
SP_scores = pd.DataFrame(data = SP.scores_, columns=['Score'])
SP_scores.sort_values('Score',ascending = False)
SP_scores[SP_scores['Score'] > 0.1].shape
#40 features seems about right

(40, 1)

In [82]:
SP_scores = SP_scores[SP_scores['Score'] > 0.1]
lr.fit(X_train_scaled[:,SP_scores.index], y_train)
lr.score(X_hold_scaled[:,SP_scores.index], y_hold)

0.86414574534615807

In [84]:
spct_pred = pd.DataFrame(predictions['Id'], columns=['Id'])
spct_pred['SalePrice'] = lr.predict(X_test_scaled[:,SP_scores.index])
spct_pred.head()

Unnamed: 0,Id,SalePrice
0,2658,146103.812927
1,2718,165961.09531
2,2414,224558.954716
3,1989,116546.199928
4,625,175242.74315


In [85]:
spct_pred.to_csv('select-percentile.csv', index=False)

In [103]:
#next, select from model, starting with our best scoring, Elastic
sfm = SelectFromModel(elastic_model)
sfm.fit(X_train_scaled, y_train)

SelectFromModel(estimator=ElasticNet(alpha=0.505, copy_X=True, fit_intercept=True, l1_ratio=0.99,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
        norm_order=1, prefit=False, threshold=None)

In [104]:
lr.fit(X_train_scaled[:,sfm.get_support(indices=True)], y_train)
lr.score(X_hold_scaled[:,sfm.get_support(indices=True)], y_hold)

0.91832798273196903

In [105]:
sfm_pred = pd.DataFrame(predictions['Id'], columns=['Id'])
sfm_pred['SalePrice'] = lr.predict(X_test_scaled[:,sfm.get_support(indices=True)])
sfm_pred.head()

Unnamed: 0,Id,SalePrice
0,2658,118878.638303
1,2718,159354.451419
2,2414,217629.159262
3,1989,114229.212237
4,625,171877.869684


In [106]:
sfm_pred.to_csv('selectfrommodel.csv', index=False)

In [113]:
#let's try the other models. Lasso?
sfm_l = SelectFromModel(lasso_model)
sfm_l.fit(X_train_scaled, y_train)
lr.fit(X_train_scaled[:,sfm_l.get_support(indices=True)], y_train)
lr.score(X_hold_scaled[:,sfm_l.get_support(indices=True)], y_hold)
#not even worth trying with that score



-1.0653077386901776e+21

In [114]:
#does ridge work better?
sfm_r = SelectFromModel(ridge_model)
sfm_r.fit(X_train_scaled, y_train)
lr.fit(X_train_scaled[:,sfm_r.get_support(indices=True)], y_train)
lr.score(X_hold_scaled[:,sfm_r.get_support(indices=True)], y_hold)
#much better, we'll try with this too

0.91787341637313113

In [115]:
sfmridge_pred = pd.DataFrame(predictions['Id'], columns=['Id'])
sfmridge_pred['SalePrice'] = lr.predict(X_test_scaled[:,sfm_r.get_support(indices=True)])
sfmridge_pred.head()

Unnamed: 0,Id,SalePrice
0,2658,122377.733818
1,2718,162750.273148
2,2414,217794.229137
3,1989,110959.434417
4,625,173832.510993


In [116]:
sfmridge_pred.to_csv('sfmridge.csv', index=False)