In [1]:
import numpy as np
import pandas as pd
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression
from time import time

In [2]:
data = pd.read_csv('../input/housing-prices-regression-dataset/train.csv')

In [3]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
X = data.iloc[:,0:]
X.iloc[:,-1].dtype

dtype('int64')

In [5]:
bad_cols = []
good_cols = []
for col in X:
    if X[col].dtype == 'object':
        bad_cols.append(col)
    else:
        good_cols.append(col)

In [6]:
good_X = X[good_cols]
good_X

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,0.0,0,...,0,40,0,0,0,0,0,8,2007,175000
1456,1457,20,85.0,13175,6,6,1978,1988,119.0,790,...,349,0,0,0,0,0,0,2,2010,210000
1457,1458,70,66.0,9042,7,9,1941,2006,0.0,275,...,0,60,0,0,0,0,2500,5,2010,266500
1458,1459,20,68.0,9717,5,6,1950,1996,0.0,49,...,366,0,112,0,0,0,0,4,2010,142125


In [7]:
bad_X = X[bad_cols]
for i in bad_X:
    print()
    print(f"{bad_X[i].name} --> {bad_X[i].unique()}")
    
    


MSZoning --> ['RL' 'RM' 'C (all)' 'FV' 'RH']

Street --> ['Pave' 'Grvl']

Alley --> [nan 'Grvl' 'Pave']

LotShape --> ['Reg' 'IR1' 'IR2' 'IR3']

LandContour --> ['Lvl' 'Bnk' 'Low' 'HLS']

Utilities --> ['AllPub' 'NoSeWa']

LotConfig --> ['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']

LandSlope --> ['Gtl' 'Mod' 'Sev']

Neighborhood --> ['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']

Condition1 --> ['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']

Condition2 --> ['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']

BldgType --> ['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs']

HouseStyle --> ['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin']

RoofStyle --> ['Gable' 'Hip' 'Gambrel' 'Mansard' 'Flat' 'Shed']

RoofMatl --> ['CompShg' 'WdShngl' 'Metal' 'WdSha

In [8]:
def yes_no(data):
    for i in data:
        ls = data[i].unique()
        if len(ls) == 2:
            data[i].replace({ls[0]:1, ls[1]:0}, inplace=True)
            good_X[i] = data[i].values
            data.drop(i, inplace=True, axis=1)

In [9]:
yes_no(bad_X)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [10]:
for i in bad_X:
    print()
    print(f"{bad_X[i].name} --> {bad_X[i].unique()}")


MSZoning --> ['RL' 'RM' 'C (all)' 'FV' 'RH']

Alley --> [nan 'Grvl' 'Pave']

LotShape --> ['Reg' 'IR1' 'IR2' 'IR3']

LandContour --> ['Lvl' 'Bnk' 'Low' 'HLS']

LotConfig --> ['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']

LandSlope --> ['Gtl' 'Mod' 'Sev']

Neighborhood --> ['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']

Condition1 --> ['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']

Condition2 --> ['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']

BldgType --> ['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs']

HouseStyle --> ['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin']

RoofStyle --> ['Gable' 'Hip' 'Gambrel' 'Mansard' 'Flat' 'Shed']

RoofMatl --> ['CompShg' 'WdShngl' 'Metal' 'WdShake' 'Membran' 'Tar&Grv' 'Roll'
 'ClyTile']

Exterior1st --> ['V

In [11]:
bad_X.shape, good_X.shape

((1460, 40), (1460, 41))

In [12]:
bad_cols = []
for i in bad_X:
    bad_cols.append(i)
len(bad_cols)

40

In [13]:
# Converting to Hot Encoding Vector form
bad_X_conv = pd.get_dummies(data=bad_X, columns = bad_cols)
for col in bad_X_conv:
    print(f"{col} --> {bad_X_conv[col].unique()}")

MSZoning_C (all) --> [0 1]
MSZoning_FV --> [0 1]
MSZoning_RH --> [0 1]
MSZoning_RL --> [1 0]
MSZoning_RM --> [0 1]
Alley_Grvl --> [0 1]
Alley_Pave --> [0 1]
LotShape_IR1 --> [0 1]
LotShape_IR2 --> [0 1]
LotShape_IR3 --> [0 1]
LotShape_Reg --> [1 0]
LandContour_Bnk --> [0 1]
LandContour_HLS --> [0 1]
LandContour_Low --> [0 1]
LandContour_Lvl --> [1 0]
LotConfig_Corner --> [0 1]
LotConfig_CulDSac --> [0 1]
LotConfig_FR2 --> [0 1]
LotConfig_FR3 --> [0 1]
LotConfig_Inside --> [1 0]
LandSlope_Gtl --> [1 0]
LandSlope_Mod --> [0 1]
LandSlope_Sev --> [0 1]
Neighborhood_Blmngtn --> [0 1]
Neighborhood_Blueste --> [0 1]
Neighborhood_BrDale --> [0 1]
Neighborhood_BrkSide --> [0 1]
Neighborhood_ClearCr --> [0 1]
Neighborhood_CollgCr --> [1 0]
Neighborhood_Crawfor --> [0 1]
Neighborhood_Edwards --> [0 1]
Neighborhood_Gilbert --> [0 1]
Neighborhood_IDOTRR --> [0 1]
Neighborhood_MeadowV --> [0 1]
Neighborhood_Mitchel --> [0 1]
Neighborhood_NAmes --> [0 1]
Neighborhood_NPkVill --> [0 1]
Neighborhood_NW

In [14]:
bad_X_conv.shape

(1460, 246)

In [15]:
for col in bad_X_conv:
    good_X[col] = bad_X_conv[col].values
    
# good_X[y] = y.values    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
  


In [16]:
print(good_X.shape)
good_X = good_X.fillna(good_X.mean())
print(good_X.shape)

(1460, 287)
(1460, 287)


In [17]:
dataset = good_X.iloc[:,good_X.columns != 'SalePrice']
target = good_X['SalePrice']

In [18]:
dataset.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0


In [19]:
select = SelectKBest(score_func=f_regression, k=100)
z = select.fit_transform(dataset,target)
scores = select.scores_ 
# print("After selecting best 3 features:", z.shape) 

In [20]:
z.shape

(1460, 100)

In [21]:
df = pd.DataFrame(z)

In [22]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,65.0,8450.0,7.0,2003.0,2003.0,196.0,706.0,150.0,856.0,856.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1,80.0,9600.0,6.0,1976.0,1976.0,0.0,978.0,284.0,1262.0,1262.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2,68.0,11250.0,7.0,2001.0,2002.0,162.0,486.0,434.0,920.0,920.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
3,60.0,9550.0,7.0,1915.0,1970.0,0.0,216.0,540.0,756.0,961.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,84.0,14260.0,8.0,2000.0,2000.0,350.0,655.0,490.0,1145.0,1145.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0


In [23]:
score_dict = {}

In [24]:
for i in range(len(scores)):
    score_dict[dataset.columns[i]] = scores[i]
    
sorted_score_dict = {k: v for k, v in sorted(score_dict.items(), key=lambda item: item[1], reverse=True) if v != 'nan'}   

In [25]:
sorted_score_dict

{'OverallQual': 2436.770590906353,
 'GrLivArea': 1470.5850099552902,
 'GarageCars': 1013.7056661608025,
 'GarageArea': 926.9512872732649,
 'TotalBsmtSF': 880.3412823895799,
 '1stFlrSF': 845.5244877365998,
 'ExterQual_TA': 774.6770189635032,
 'FullBath': 668.4302964873186,
 'BsmtQual_Ex': 642.6374537766901,
 'TotRmsAbvGrd': 580.7628008190745,
 'YearBuilt': 548.6658210637398,
 'KitchenQual_TA': 538.3585237788421,
 'YearRemodAdd': 504.71485472492975,
 'KitchenQual_Ex': 496.7129583676957,
 'Foundation_PConc': 480.15681502334473,
 'MasVnrArea': 425.36642219694966,
 'GarageYrBlt': 413.79089444321215,
 'Fireplaces': 406.50386609775416,
 'ExterQual_Gd': 375.32946995308623,
 'BsmtQual_TA': 375.17805159215715,
 'ExterQual_Ex': 372.6216740273885,
 'BsmtFinType1_GLQ': 339.50314367863825,
 'HeatingQC_Ex': 339.39893330871195,
 'GarageFinish_Fin': 311.6958090862427,
 'GarageFinish_Unf': 295.6667925076009,
 'Neighborhood_NridgHt': 281.282911221435,
 'BsmtFinSF1': 255.9235165142854,
 'MasVnrType_None':

In [26]:
features_total = [k for k,v in sorted_score_dict.items()]

In [27]:
features_final = features_total[:100]

In [28]:
def data_clean(data):
    X = data.copy()
    for col in X:
        X[col] = (X[col]-X[col].min())/(X[col].max()-X[col].min())
        
    return X    

In [29]:
test_data = pd.read_csv('../input/housing-prices-regression-dataset/test.csv')
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [30]:
X_pr = test_data.iloc[:,0:]
X_pr.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [31]:
bad_cols_pred = []
good_cols_pred = []
for col in X_pr:
    if X_pr[col].dtype == 'object':
        bad_cols_pred.append(col)
    else:
        good_cols_pred.append(col)
        
len(bad_cols_pred), len(good_cols_pred)        

(43, 37)

In [32]:
good_X_pred = X_pr[good_cols_pred]
good_X_pred


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,730.0,140,0,0,0,120,0,0,6,2010
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,312.0,393,36,0,0,0,0,12500,6,2010
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,482.0,212,34,0,0,0,0,0,3,2010
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,470.0,360,36,0,0,0,0,0,6,2010
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,506.0,0,82,0,0,144,0,0,1,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,21.0,1936,4,7,1970,1970,0.0,0.0,...,0.0,0,0,0,0,0,0,0,6,2006
1455,2916,160,21.0,1894,4,5,1970,1970,0.0,252.0,...,286.0,0,24,0,0,0,0,0,4,2006
1456,2917,20,160.0,20000,5,7,1960,1996,0.0,1224.0,...,576.0,474,0,0,0,0,0,0,9,2006
1457,2918,85,62.0,10441,5,5,1992,1992,0.0,337.0,...,0.0,80,32,0,0,0,0,700,7,2006


In [33]:
bad_X_pred = X_pr[bad_cols_pred]
for i in bad_X_pred:
    print(f"{bad_X_pred[i].name} --> {bad_X_pred[i].unique()}")
    
    

MSZoning --> ['RH' 'RL' 'RM' 'FV' 'C (all)' nan]
Street --> ['Pave' 'Grvl']
Alley --> [nan 'Pave' 'Grvl']
LotShape --> ['Reg' 'IR1' 'IR2' 'IR3']
LandContour --> ['Lvl' 'HLS' 'Bnk' 'Low']
Utilities --> ['AllPub' nan]
LotConfig --> ['Inside' 'Corner' 'FR2' 'CulDSac' 'FR3']
LandSlope --> ['Gtl' 'Mod' 'Sev']
Neighborhood --> ['NAmes' 'Gilbert' 'StoneBr' 'BrDale' 'NPkVill' 'NridgHt' 'Blmngtn'
 'NoRidge' 'Somerst' 'SawyerW' 'Sawyer' 'NWAmes' 'OldTown' 'BrkSide'
 'ClearCr' 'SWISU' 'Edwards' 'CollgCr' 'Crawfor' 'Blueste' 'IDOTRR'
 'Mitchel' 'Timber' 'MeadowV' 'Veenker']
Condition1 --> ['Feedr' 'Norm' 'PosN' 'RRNe' 'Artery' 'RRNn' 'PosA' 'RRAn' 'RRAe']
Condition2 --> ['Norm' 'Feedr' 'PosA' 'PosN' 'Artery']
BldgType --> ['1Fam' 'TwnhsE' 'Twnhs' 'Duplex' '2fmCon']
HouseStyle --> ['1Story' '2Story' 'SLvl' '1.5Fin' 'SFoyer' '2.5Unf' '1.5Unf']
RoofStyle --> ['Gable' 'Hip' 'Gambrel' 'Flat' 'Mansard' 'Shed']
RoofMatl --> ['CompShg' 'Tar&Grv' 'WdShake' 'WdShngl']
Exterior1st --> ['VinylSd' 'Wd Sdng' 'H

In [34]:
def yes_no(data):
    for i in data:
        ls = data[i].unique()
        if len(ls) == 2:
            data[i].replace({ls[0]:1, ls[1]:0}, inplace=True)
            good_X_pred[i] = data[i].values
            data.drop(i, inplace=True, axis=1)

In [35]:
yes_no(bad_X_pred)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [36]:
for i in bad_X_pred:
    print()
    print(f"{bad_X_pred[i].name} --> {bad_X_pred[i].unique()}")    


MSZoning --> ['RH' 'RL' 'RM' 'FV' 'C (all)' nan]

Alley --> [nan 'Pave' 'Grvl']

LotShape --> ['Reg' 'IR1' 'IR2' 'IR3']

LandContour --> ['Lvl' 'HLS' 'Bnk' 'Low']

LotConfig --> ['Inside' 'Corner' 'FR2' 'CulDSac' 'FR3']

LandSlope --> ['Gtl' 'Mod' 'Sev']

Neighborhood --> ['NAmes' 'Gilbert' 'StoneBr' 'BrDale' 'NPkVill' 'NridgHt' 'Blmngtn'
 'NoRidge' 'Somerst' 'SawyerW' 'Sawyer' 'NWAmes' 'OldTown' 'BrkSide'
 'ClearCr' 'SWISU' 'Edwards' 'CollgCr' 'Crawfor' 'Blueste' 'IDOTRR'
 'Mitchel' 'Timber' 'MeadowV' 'Veenker']

Condition1 --> ['Feedr' 'Norm' 'PosN' 'RRNe' 'Artery' 'RRNn' 'PosA' 'RRAn' 'RRAe']

Condition2 --> ['Norm' 'Feedr' 'PosA' 'PosN' 'Artery']

BldgType --> ['1Fam' 'TwnhsE' 'Twnhs' 'Duplex' '2fmCon']

HouseStyle --> ['1Story' '2Story' 'SLvl' '1.5Fin' 'SFoyer' '2.5Unf' '1.5Unf']

RoofStyle --> ['Gable' 'Hip' 'Gambrel' 'Flat' 'Mansard' 'Shed']

RoofMatl --> ['CompShg' 'Tar&Grv' 'WdShake' 'WdShngl']

Exterior1st --> ['VinylSd' 'Wd Sdng' 'HdBoard' 'Plywood' 'MetalSd' 'CemntBd' 'WdS

In [37]:
bad_X_pred

Unnamed: 0,MSZoning,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RH,,Reg,Lvl,Inside,Gtl,NAmes,Feedr,Norm,1Fam,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1,RL,,IR1,Lvl,Corner,Gtl,NAmes,Norm,Norm,1Fam,...,Attchd,Unf,TA,TA,Y,,,Gar2,WD,Normal
2,RL,,IR1,Lvl,Inside,Gtl,Gilbert,Norm,Norm,1Fam,...,Attchd,Fin,TA,TA,Y,,MnPrv,,WD,Normal
3,RL,,IR1,Lvl,Inside,Gtl,Gilbert,Norm,Norm,1Fam,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal
4,RL,,IR1,HLS,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,RM,,Reg,Lvl,Inside,Gtl,MeadowV,Norm,Norm,Twnhs,...,,,,,Y,,,,WD,Normal
1455,RM,,Reg,Lvl,Inside,Gtl,MeadowV,Norm,Norm,TwnhsE,...,CarPort,Unf,TA,TA,Y,,,,WD,Abnorml
1456,RL,,Reg,Lvl,Inside,Gtl,Mitchel,Norm,Norm,1Fam,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
1457,RL,,Reg,Lvl,Inside,Gtl,Mitchel,Norm,Norm,1Fam,...,,,,,Y,,MnPrv,Shed,WD,Normal


In [38]:
bad_cols_pred = []
for i in bad_X_pred:
    bad_cols_pred.append(i)
len(bad_cols_pred)  

40

In [39]:
# Converting to Hot Encoding Vector form
bad_X_conv_pred = pd.get_dummies(data=bad_X_pred, columns = bad_cols_pred)
for col in bad_X_conv_pred:
    print(f"{col} --> {bad_X_conv_pred[col].unique()}")

MSZoning_C (all) --> [0 1]
MSZoning_FV --> [0 1]
MSZoning_RH --> [1 0]
MSZoning_RL --> [0 1]
MSZoning_RM --> [0 1]
Alley_Grvl --> [0 1]
Alley_Pave --> [0 1]
LotShape_IR1 --> [0 1]
LotShape_IR2 --> [0 1]
LotShape_IR3 --> [0 1]
LotShape_Reg --> [1 0]
LandContour_Bnk --> [0 1]
LandContour_HLS --> [0 1]
LandContour_Low --> [0 1]
LandContour_Lvl --> [1 0]
LotConfig_Corner --> [0 1]
LotConfig_CulDSac --> [0 1]
LotConfig_FR2 --> [0 1]
LotConfig_FR3 --> [0 1]
LotConfig_Inside --> [1 0]
LandSlope_Gtl --> [1 0]
LandSlope_Mod --> [0 1]
LandSlope_Sev --> [0 1]
Neighborhood_Blmngtn --> [0 1]
Neighborhood_Blueste --> [0 1]
Neighborhood_BrDale --> [0 1]
Neighborhood_BrkSide --> [0 1]
Neighborhood_ClearCr --> [0 1]
Neighborhood_CollgCr --> [0 1]
Neighborhood_Crawfor --> [0 1]
Neighborhood_Edwards --> [0 1]
Neighborhood_Gilbert --> [0 1]
Neighborhood_IDOTRR --> [0 1]
Neighborhood_MeadowV --> [0 1]
Neighborhood_Mitchel --> [0 1]
Neighborhood_NAmes --> [1 0]
Neighborhood_NPkVill --> [0 1]
Neighborhood_NW

In [40]:
for col in bad_X_conv_pred:
    good_X_pred[col] = bad_X_conv_pred[col].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
  


In [41]:
print(good_X_pred.shape)
good_X_pred = good_X_pred.fillna(good_X_pred.mean())
print(good_X_pred.shape)

(1459, 269)
(1459, 269)


In [42]:
def feature_scaler(features, X):
    for feature in features:
        X[feature] = X[feature]*sorted_score_dict[feature]

In [43]:
features_final_pred = []
for feature in features_total:
    if feature in good_X_pred:
        features_final_pred.append(feature)
    if len(features_final_pred) == 100:
        break

In [44]:
(features_final_pred)

['OverallQual',
 'GrLivArea',
 'GarageCars',
 'GarageArea',
 'TotalBsmtSF',
 '1stFlrSF',
 'ExterQual_TA',
 'FullBath',
 'BsmtQual_Ex',
 'TotRmsAbvGrd',
 'YearBuilt',
 'KitchenQual_TA',
 'YearRemodAdd',
 'KitchenQual_Ex',
 'Foundation_PConc',
 'MasVnrArea',
 'GarageYrBlt',
 'Fireplaces',
 'ExterQual_Gd',
 'BsmtQual_TA',
 'ExterQual_Ex',
 'BsmtFinType1_GLQ',
 'HeatingQC_Ex',
 'GarageFinish_Fin',
 'GarageFinish_Unf',
 'Neighborhood_NridgHt',
 'BsmtFinSF1',
 'MasVnrType_None',
 'SaleType_New',
 'GarageType_Detchd',
 'SaleCondition_Partial',
 'Foundation_CBlock',
 'FireplaceQu_Gd',
 'GarageType_Attchd',
 'LotFrontage',
 'MasVnrType_Stone',
 'Neighborhood_NoRidge',
 'WoodDeckSF',
 'KitchenQual_Gd',
 '2ndFlrSF',
 'OpenPorchSF',
 'HeatingQC_TA',
 'BsmtExposure_Gd',
 'Exterior2nd_VinylSd',
 'Exterior1st_VinylSd',
 'MSZoning_RM',
 'HalfBath',
 'GarageCond_TA',
 'LotShape_Reg',
 'LotArea',
 'BsmtExposure_No',
 'FireplaceQu_Ex',
 'CentralAir',
 'GarageQual_TA',
 'MSZoning_RL',
 'HouseStyle_2Story'

In [45]:
# features_final_pred = [feature for feature in features_total if feature in good_X_pred]
data_final = good_X[features_final]
data_final_pred = good_X_pred[features_final_pred]
y = target
X = data_clean(data_final)
X_pred = data_clean(data_final_pred)
feature_scaler(features_final, X)
feature_scaler(features_final_pred, X_pred)

In [46]:
X_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=1)

In [47]:
# X_train

In [48]:
# model = keras.Sequential([
#     keras.layers.Dense(10,input_shape=(40,), activation='relu'),
# #     keras.layers.Dropout(0.5),
# #     keras.layers.Dense(20, activation='sigmoid'),
#     keras.layers.Dropout(0.5),
#     keras.layers.Dense(20, activation='relu'),    
#     keras.layers.Dropout(0.5),
#     keras.layers.Dense(20, activation='relu'),
#     keras.layers.Dropout(0.5),
#     keras.layers.Dense(20, activation='relu'),
#     keras.layers.Dropout(0.5),
#     keras.layers.Dense(20, activation='relu'),
#     keras.layers.Dropout(0.5),
#     keras.layers.Dense(1, activation='relu'),
# ])

# model.compile(optimizer='Adam',
#              loss = 'mean_absolute_error',
#              metrics= ['accuracy'])

# model.fit(X_train, y_train, epochs=100)

In [49]:
# model.evaluate(x_test,y_test)

In [50]:
from sklearn.ensemble import RandomForestRegressor

In [51]:
clf = RandomForestRegressor(max_depth=5, random_state=0)
clf.fit(X, y)

RandomForestRegressor(max_depth=5, random_state=0)

In [52]:
yp = clf.predict(X_pred)

In [53]:
prediction = yp
my_submission = pd.DataFrame({'Id': good_X_pred.Id, 'SalePrice': prediction})
# you could use any filename. We choose submission here
my_submission.to_csv('submission.csv', index=False)

In [54]:
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125
