<center> 
    <h1> PACKAGE / DATA IMPORTS</h1> 
</center>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import statsmodels.api as sm # with this i can do a quick linear regression
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms

import random
import warnings
warnings.simplefilter(action='ignore')

%matplotlib inline

In [85]:
df = pd.read_csv('./datasets/train.csv')

<center> 
    <h1> DATA CLEANING </h1> 
</center>

In [86]:
df.head()

#reading in my training data and displaying the first 5 rows of my data frame

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [87]:
df.tail()

# displaying the last five rows of my data frame

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
2046,1587,921126030,20,RL,79.0,11449,Pave,,IR1,HLS,...,0,0,,,,0,1,2008,WD,298751
2047,785,905377130,30,RL,,12342,Pave,,IR1,Lvl,...,0,0,,,,0,3,2009,WD,82500
2048,916,909253010,50,RL,57.0,7558,Pave,,Reg,Bnk,...,0,0,,,,0,3,2009,WD,177000
2049,639,535179160,20,RL,80.0,10400,Pave,,Reg,Lvl,...,0,0,,,,0,11,2009,WD,144000
2050,10,527162130,60,RL,60.0,7500,Pave,,Reg,Lvl,...,0,0,,,,0,6,2010,WD,189000


In [88]:
df.info()

# getting info on my data frame such as shape, data types of each column, and whether or not null values exist

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
Id                 2051 non-null int64
PID                2051 non-null int64
MS SubClass        2051 non-null int64
MS Zoning          2051 non-null object
Lot Frontage       1721 non-null float64
Lot Area           2051 non-null int64
Street             2051 non-null object
Alley              140 non-null object
Lot Shape          2051 non-null object
Land Contour       2051 non-null object
Utilities          2051 non-null object
Lot Config         2051 non-null object
Land Slope         2051 non-null object
Neighborhood       2051 non-null object
Condition 1        2051 non-null object
Condition 2        2051 non-null object
Bldg Type          2051 non-null object
House Style        2051 non-null object
Overall Qual       2051 non-null int64
Overall Cond       2051 non-null int64
Year Built         2051 non-null int64
Year Remod/Add     2051 non-null int64
Roof Style         20

In [89]:
def clean_columns(col):
    return col.lower().replace(' ', '_').replace('/', '_')

# built a function to clean my columns by:
# 1) making them lowercase and 2) replacing spaces and forward slashes with underscores

In [90]:
df.columns = [clean_columns(col) for col in df.columns]
df.head(1)

# passed my function in a list comprehension to do it all with one line of code rather than many
# displayed my head to confirm my desired results were achieved

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500


In [91]:
df.columns[df.isnull().any()]

# displaying a list of columns that have null values

Index(['lot_frontage', 'alley', 'mas_vnr_type', 'mas_vnr_area', 'bsmt_qual',
       'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_sf_1',
       'bsmtfin_type_2', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf',
       'bsmt_full_bath', 'bsmt_half_bath', 'fireplace_qu', 'garage_type',
       'garage_yr_blt', 'garage_finish', 'garage_cars', 'garage_area',
       'garage_qual', 'garage_cond', 'pool_qc', 'fence', 'misc_feature'],
      dtype='object')

In [92]:
df[df.columns[df.isnull().any()]].isnull().sum()

# displaying only the columns with nulls and their sum

lot_frontage       330
alley             1911
mas_vnr_type        22
mas_vnr_area        22
bsmt_qual           55
bsmt_cond           55
bsmt_exposure       58
bsmtfin_type_1      55
bsmtfin_sf_1         1
bsmtfin_type_2      56
bsmtfin_sf_2         1
bsmt_unf_sf          1
total_bsmt_sf        1
bsmt_full_bath       2
bsmt_half_bath       2
fireplace_qu      1000
garage_type        113
garage_yr_blt      114
garage_finish      114
garage_cars          1
garage_area          1
garage_qual        114
garage_cond        114
pool_qc           2042
fence             1651
misc_feature      1986
dtype: int64

In [93]:
def fill_numeric_and_nonnumeric_na(col):
    if col.dtype == int or col.dtype == float:
        return col.fillna(round(col.mean()), inplace=True)
    else:
        return col.fillna(col.mode()[0], inplace=True)
    
# created a function to fill numeric and nonnumeric nulls with either their mean or mode.

In [94]:
[fill_numeric_and_nonnumeric_na(df[col]) for col in ['lot_frontage', 'mas_vnr_area', 'garage_yr_blt','garage_cars', 'garage_area']];

# passed my function through a list comprehension so i could fill all null columns with one line of code

In [95]:
df[['lot_frontage', 'mas_vnr_area', 'garage_yr_blt', 'garage_cars', 'garage_area']].isnull().sum().sum()
#confirming there are no longer any nulls

0

In [96]:
def make_string_na(col):
        return col.fillna('NA', inplace=True)
    
# created a function to fill np.NaNs with string 'NA' for cases where 'NA' means No (something)

In [97]:
[make_string_na(df[col]) for col in ['alley', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_type_2', 'fireplace_qu', 'garage_type', 'garage_finish', 'garage_qual', 'garage_cond', 'pool_qc', 'fence', 'misc_feature']];

# passed my function through a list comprehension so i could fill all columns with np.NaNs with one line of code

In [98]:
df[['alley', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_type_2', 'fireplace_qu', 'garage_type', 'garage_finish', 'garage_qual', 'garage_cond', 'pool_qc', 'fence', 'misc_feature']].isnull().sum().sum()
#confirming there are no longer any nulls

0

In [99]:
df.mas_vnr_type.fillna('None', inplace=True)
# filled all NaNs with 'None'

In [100]:
df.mas_vnr_type.isnull().sum()
#confirming there are no longer any nulls

0

In [101]:
[df[col].fillna(0, inplace=True) for col in ['bsmtfin_sf_1', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf', 'bsmt_full_bath', 'bsmt_half_bath']];

#created a list comprehension to fill NaNs with 0

In [102]:
df[['bsmtfin_sf_1', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf', 'bsmt_full_bath', 'bsmt_half_bath']].isna().sum().sum()
#confirming there are no longer any nulls

0

<center> 
    <h1> FEATURE ENGINEERING </h1> 
</center>

In [103]:
p = PolynomialFeatures(degree=3)

In [104]:
df['year_built_sq'] = df.year_built ** 2

In [105]:
df['1st_flr_sf_log'] = np.log(df['1st_flr_sf'])

In [106]:
df['sale_price_bool'] = df.saleprice.map(lambda x: 0 if x <= 162500 else 1)

In [107]:
#df['year_remod_add_sq'] = df.year_built ** 2

In [108]:
df_mapper_1 = {'Reg': 4,
               'IR1':3,
               'IR2':2,
               'IR3':1}

In [109]:
df['lot_shape_scored'] = df.lot_shape.replace(df_mapper_1)
df.lot_shape_scored.value_counts()

4    1295
3     692
2      55
1       9
Name: lot_shape_scored, dtype: int64

In [110]:
df_mapper_3 = {'Ex':5,
               'Gd':4,
               'TA':3,
               'Fa':2,
               'Po':1}

In [111]:
df['exter_cond_scored'] = df.exter_cond.replace(df_mapper_3)
df.exter_cond_scored.value_counts()

3    1778
4     215
2      49
5       7
1       2
Name: exter_cond_scored, dtype: int64

In [112]:
df = pd.get_dummies(df, columns=['ms_zoning'], drop_first=True)
df.head(1)

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,...,1st_flr_sf_log,sale_price_bool,lot_shape_scored,exter_cond_scored,ms_zoning_C (all),ms_zoning_FV,ms_zoning_I (all),ms_zoning_RH,ms_zoning_RL,ms_zoning_RM
0,109,533352170,60,69.0,13517,Pave,,IR1,Lvl,AllPub,...,6.586172,0,3,3,0,0,0,0,1,0


In [113]:
df_mapper_4 = {'Ex':6,
               'Gd':5,
               'TA':4,
               'Fa':3,
               'Po':2,
               'NA':1}

In [114]:
df['bsmt_cond_scored'] = df.bsmt_cond.replace(df_mapper_4)
df.bsmt_cond_scored.value_counts()

4    1834
5      89
3      65
1      55
2       5
6       3
Name: bsmt_cond_scored, dtype: int64

In [115]:
df['kitchen_qual_scored'] = df.kitchen_qual.replace(df_mapper_3)
df.kitchen_qual_scored.value_counts()

3    1047
4     806
5     151
2      47
Name: kitchen_qual_scored, dtype: int64

In [116]:
df = pd.get_dummies(df, columns=['neighborhood'], drop_first=True)
df.head(1)

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,...,neighborhood_NoRidge,neighborhood_NridgHt,neighborhood_OldTown,neighborhood_SWISU,neighborhood_Sawyer,neighborhood_SawyerW,neighborhood_Somerst,neighborhood_StoneBr,neighborhood_Timber,neighborhood_Veenker
0,109,533352170,60,69.0,13517,Pave,,IR1,Lvl,AllPub,...,0,0,0,0,1,0,0,0,0,0


In [117]:
df = pd.get_dummies(df, columns=['sale_type'], drop_first=True)
df.head(1)

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,...,neighborhood_Timber,neighborhood_Veenker,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
0,109,533352170,60,69.0,13517,Pave,,IR1,Lvl,AllPub,...,0,0,0,0,0,0,0,0,0,1


In [118]:
#df_mapper_2 = {'Lvl':4,
#               'Bnk':3,
#               'HLS':2,
#               'Low':1}

In [119]:
#df['land_contour_scored'] = df.land_contour.replace(df_mapper_2)
#df.land_contour_scored.value_counts()

In [120]:
#df['heating_qc_scored'] = df.heating_qc.replace(df_mapper_3)
#df.heating_qc_scored.value_counts()

In [121]:
#df[df.columns[54:]]._get_numeric_data().head(1)

In [144]:
features = ['overall_qual', 'garage_cars', 'bsmt_unf_sf', 'mas_vnr_area', 
                 'kitchen_abvgr', 'fireplaces', 'screen_porch', 'misc_val', 'year_built_sq', 
                 '1st_flr_sf_log', 'sale_price_bool', 'lot_shape_scored', 'exter_cond_scored',
                 '2nd_flr_sf', 'low_qual_fin_sf', 'half_bath', 'bsmt_cond_scored', 'kitchen_qual_scored',
                 'ms_zoning_FV', 'neighborhood_StoneBr', 'neighborhood_NridgHt', 'neighborhood_Timber', 
                 'neighborhood_NoRidge', 'neighborhood_Crawfor', 'neighborhood_BrkSide',
                 'neighborhood_Edwards', 'neighborhood_Somerst', 'sale_type_ConLD']

<center> 
    <h1> MODELING </h1> 
</center>

In [145]:
features_poly = p.fit_transform(df[features])

In [146]:
poly_df = pd.DataFrame(features_poly, columns=p.get_feature_names())

In [194]:
poly_df

Unnamed: 0,1,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x25^3,x25^2 x26,x25^2 x27,x25 x26^2,x25 x26 x27,x25 x27^2,x26^3,x26^2 x27,x26 x27^2,x27^3
0,1.0,6.0,2.0,192.0,289.0,1.0,0.0,0.0,0.0,3904576.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,7.0,2.0,276.0,132.0,1.0,1.0,0.0,0.0,3984016.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,5.0,1.0,326.0,0.0,1.0,0.0,0.0,0.0,3814209.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,5.0,2.0,384.0,0.0,1.0,0.0,0.0,0.0,4024036.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,6.0,2.0,676.0,0.0,1.0,0.0,0.0,0.0,3610000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,6.0,2.0,557.0,0.0,1.0,1.0,0.0,0.0,3865156.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,6.0,2.0,0.0,82.0,1.0,0.0,0.0,0.0,4020025.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,5.0,2.0,188.0,180.0,1.0,0.0,0.0,0.0,3837681.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,5.0,1.0,632.0,0.0,1.0,0.0,0.0,400.0,3810304.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,5.0,2.0,390.0,192.0,1.0,2.0,0.0,0.0,3876961.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [147]:
#drop column 1, replace the column names

In [148]:
X_train, X_test, y_train, y_test = train_test_split(df_poly, df.saleprice, random_state=28)

In [149]:
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [150]:
print(X_train.shape)
print(X_test.shape)

(1538, 4960)
(513, 4960)


In [151]:
lr = LinearRegression()
lasso = LassoCV()
ridge = RidgeCV()

In [152]:
#Ridge CV -- NEW -- good (1st_flr_sf, garage_area, wood_deck_sf, pool_area)
np.sqrt(np.abs(cross_val_score(ridge, X_train, y_train, scoring='neg_mean_squared_error').mean()))

43581.428901656895

In [153]:
#Lasso CV -- NEW
np.sqrt(np.abs(cross_val_score(lasso, X_train, y_train, scoring='neg_mean_squared_error').mean()))

26369.32497786479

In [154]:
#LinearRegression CV -- NEW
np.sqrt(np.abs(cross_val_score(lr, X_train, y_train, scoring='neg_mean_squared_error').mean()))

1186172692844963.5

In [155]:
#lr.fit(X_train, y_train)

In [156]:
#lr.score(X_test, y_test)

In [157]:
ridge.fit(X_train, y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=None, fit_intercept=True,
    gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [158]:
ridge.score(X_test, y_test)

0.8383041730379385

In [159]:
lasso.fit(X_train, y_train)

LassoCV(alphas=None, copy_X=True, cv='warn', eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
    positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False)

In [160]:
lasso.score(X_test, y_test)

0.8639608871764585

<center> 
    <h1> TEST DATA </h1> 
</center>

In [161]:
df_2 = pd.read_csv('./datasets/test.csv')
df_2.head(1)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD


In [162]:
df_2.columns = [clean_columns(col) for col in df_2.columns]
df_2.head(1)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD


In [163]:
df_2[df_2.columns[df_2.isnull().any()]].isnull().sum()

lot_frontage      160
alley             821
mas_vnr_type        1
mas_vnr_area        1
bsmt_qual          25
bsmt_cond          25
bsmt_exposure      25
bsmtfin_type_1     25
bsmtfin_type_2     25
electrical          1
fireplace_qu      422
garage_type        44
garage_yr_blt      45
garage_finish      45
garage_qual        45
garage_cond        45
pool_qc           875
fence             707
misc_feature      838
dtype: int64

In [164]:
[fill_numeric_and_nonnumeric_na(df_2[col]) for col in ['lot_frontage', 'mas_vnr_area', 'garage_yr_blt']];

# passed my function through a list comprehension so i could fill all null columns with one line of code

In [165]:
[make_string_na(df_2[col]) for col in ['alley', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_type_2', 'fireplace_qu', 'garage_type', 'garage_finish', 'garage_qual', 'garage_cond', 'pool_qc', 'fence', 'misc_feature']];

# passed my function through a list comprehension so i could fill all columns with np.NaNs with one line of code

In [166]:
df_2.mas_vnr_type.fillna('None', inplace=True)
# filled all NaNs with 'None'

In [167]:
df_2['year_built_sq'] = df_2.year_built ** 2

In [168]:
df_2['1st_flr_sf_log'] = np.log(df_2['1st_flr_sf'])

In [169]:
df_2['sale_price_bool'] = df.saleprice.map(lambda x: 0 if x <= 162500 else 1)

In [170]:
df_2_mapper_1 = {'Reg': 4,
               'IR1':3,
               'IR2':2,
               'IR3':1}

In [171]:
df_2['lot_shape_scored'] = df_2.lot_shape.replace(df_2_mapper_1)
df_2.lot_shape_scored.value_counts()

4    564
3    287
2     21
1      7
Name: lot_shape_scored, dtype: int64

In [172]:
df_2_mapper_3 = {'Ex':5,
               'Gd':4,
               'TA':3,
               'Fa':2,
               'Po':1}

In [173]:
df_2['exter_cond_scored'] = df_2.exter_cond.replace(df_2_mapper_3)
df_2.exter_cond_scored.value_counts()

3    771
4     84
2     18
5      5
1      1
Name: exter_cond_scored, dtype: int64

In [174]:
df_2 = pd.get_dummies(df_2, columns=['ms_zoning'], drop_first=True)
df_2.head(1)

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,...,year_built_sq,1st_flr_sf_log,sale_price_bool,lot_shape_scored,exter_cond_scored,ms_zoning_FV,ms_zoning_I (all),ms_zoning_RH,ms_zoning_RL,ms_zoning_RM
0,2658,902301120,190,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,...,3648100,6.811244,0,4,2,0,0,0,0,1


In [175]:
df_2_mapper_4 = {'Ex':6,
               'Gd':5,
               'TA':4,
               'Fa':3,
               'Po':2,
               'NA':1}

In [176]:
df_2['bsmt_cond_scored'] = df_2.bsmt_cond.replace(df_2_mapper_4)
df_2.bsmt_cond_scored.value_counts()

4    782
3     39
5     33
1     25
Name: bsmt_cond_scored, dtype: int64

In [177]:
df_2['kitchen_qual_scored'] = df_2.kitchen_qual.replace(df_2_mapper_3)
df_2.kitchen_qual_scored.value_counts()

3    447
4    354
5     54
2     23
1      1
Name: kitchen_qual_scored, dtype: int64

In [178]:
df_2 = pd.get_dummies(df_2, columns=['neighborhood'])
df_2.head(1)

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,...,neighborhood_NoRidge,neighborhood_NridgHt,neighborhood_OldTown,neighborhood_SWISU,neighborhood_Sawyer,neighborhood_SawyerW,neighborhood_Somerst,neighborhood_StoneBr,neighborhood_Timber,neighborhood_Veenker
0,2658,902301120,190,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,1,0,0,0,0,0,0,0


In [179]:
df_2 = pd.get_dummies(df_2, columns=['sale_type'])
df_2.head(1)

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,...,sale_type_COD,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_VWD,sale_type_WD
0,2658,902301120,190,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,0,0,0,0,0,0,0,1


In [180]:
#df_2 = pd.get_dummies(df_2, columns=['exterior_1st'])
#df_2.head()

In [181]:
#df_2_mapper_2 = {'Lvl':4,
#               'Bnk':3,
#               'HLS':2,
#               'Low':1}

In [182]:
#df_2['land_contour_scored'] = df_2.land_contour.replace(df_2_mapper_2)
#df_2.land_contour_scored.value_counts()

In [183]:
features_poly_df_2 = p.fit_transform(df_2[features])

In [195]:
features_poly_df_2

array([[1., 6., 1., ..., 0., 0., 0.],
       [1., 5., 2., ..., 0., 0., 0.],
       [1., 7., 2., ..., 0., 0., 0.],
       ...,
       [1., 5., 1., ..., 0., 0., 0.],
       [1., 4., 2., ..., 0., 0., 0.],
       [1., 5., 1., ..., 0., 0., 0.]])

In [184]:
poly_df_2 = pd.DataFrame(features_poly_df_2, columns=p.get_feature_names())

In [196]:
poly_df_2

Unnamed: 0,1,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x25^3,x25^2 x26,x25^2 x27,x25 x26^2,x25 x26 x27,x25 x27^2,x26^3,x26^2 x27,x26 x27^2,x27^3
0,1.0,6.0,1.0,1020.0,0.0,2.0,0.0,0.0,0.0,3648100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,5.0,2.0,1967.0,0.0,2.0,0.0,0.0,0.0,3908529.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,7.0,2.0,100.0,0.0,1.0,1.0,0.0,0.0,4024036.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,5.0,2.0,968.0,0.0,1.0,0.0,0.0,0.0,3697929.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,6.0,2.0,785.0,247.0,1.0,2.0,185.0,0.0,3853369.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,4.0,1.0,252.0,0.0,1.0,0.0,0.0,0.0,3888784.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,4.0,1.0,869.0,0.0,1.0,0.0,0.0,0.0,3833764.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,5.0,1.0,1072.0,0.0,1.0,0.0,0.0,0.0,3908529.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,7.0,2.0,840.0,0.0,1.0,0.0,0.0,0.0,4016016.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,1.0,6.0,2.0,276.0,0.0,1.0,2.0,0.0,0.0,3908529.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [198]:
df_2.head(1)

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,...,sale_type_COD,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_VWD,sale_type_WD
0,2658,902301120,190,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,0,0,0,0,0,0,0,1


In [190]:
poly_df_2.head(1)

Unnamed: 0,1,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x25^3,x25^2 x26,x25^2 x27,x25 x26^2,x25 x26 x27,x25 x27^2,x26^3,x26^2 x27,x26 x27^2,x27^3
0,1.0,6.0,1.0,1020.0,0.0,2.0,0.0,0.0,0.0,3648100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [207]:
newest_df_2 = df_2.join(poly_df_2)

In [208]:
newest_df_2.head(1)

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,...,x25^3,x25^2 x26,x25^2 x27,x25 x26^2,x25 x26 x27,x25 x27^2,x26^3,x26^2 x27,x26 x27^2,x27^3
0,2658,902301120,190,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [200]:
X_holdout = newest_df_2[features]

In [206]:
newest_df_2[features].head(1)

Unnamed: 0,overall_qual,garage_cars,bsmt_unf_sf,mas_vnr_area,kitchen_abvgr,fireplaces,screen_porch,misc_val,year_built_sq,1st_flr_sf_log,...,ms_zoning_FV,neighborhood_StoneBr,neighborhood_NridgHt,neighborhood_Timber,neighborhood_NoRidge,neighborhood_Crawfor,neighborhood_BrkSide,neighborhood_Edwards,neighborhood_Somerst,sale_type_ConLD
0,6,1,1020,0.0,2,0,0,0,3648100,6.811244,...,0,0,0,0,0,0,0,0,0,0


In [209]:
y_pred = lasso.predict(X_holdout)

ValueError: shapes (879,28) and (4960,) not aligned: 28 (dim 1) != 4960 (dim 0)

In [None]:
my_ids = df_2['id']

In [None]:
#df = pd.DataFrame()

In [None]:
df['id'] = my_ids

In [None]:
df['saleprice'] = y_pred

In [None]:
df.set_index('id', inplace=True)

In [None]:
df.to_csv('./datasets/britt_7th_submission.csv')