In [20]:
import numpy
import pandas
import sklearn.feature_selection
import sklearn.neural_network
import sklearn.svm
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection

In [21]:
df_train = pandas.read_csv('housing_dataset/train.csv')
df_test = pandas.read_csv('housing_dataset/test.csv')

len(df_train), len(df_test.columns)

(1460, 80)

In [22]:
# columns with missing values
missing_vals = df_train.columns[df_train.isnull().any()]

# number of missing values per column
nof_missing = {}
for col in missing_vals:
    nof_missing[col] = df_train[col].isnull().sum()
    print(col, df_train[col].isnull().sum())

LotFrontage 259
Alley 1369
MasVnrType 8
MasVnrArea 8
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinType2 38
Electrical 1
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageQual 81
GarageCond 81
PoolQC 1453
Fence 1179
MiscFeature 1406


In [23]:
for col in missing_vals:
    if nof_missing[col] > len(df_train) / 10:
        df_train = df_train.drop(col, axis=1)
        df_test = df_test.drop(col, axis=1)
    else:
        most_common = df_train[col].value_counts().index[0]
        df_train[col] = df_train[col].fillna(most_common)
df_train.drop('Id', axis=1, inplace=True)
df_test.drop('Id', axis=1, inplace=True)
len(df_train.columns)

74

In [24]:
df_train.info()
df_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotArea        1460 non-null   int64  
 3   Street         1460 non-null   object 
 4   LotShape       1460 non-null   object 
 5   LandContour    1460 non-null   object 
 6   Utilities      1460 non-null   object 
 7   LotConfig      1460 non-null   object 
 8   LandSlope      1460 non-null   object 
 9   Neighborhood   1460 non-null   object 
 10  Condition1     1460 non-null   object 
 11  Condition2     1460 non-null   object 
 12  BldgType       1460 non-null   object 
 13  HouseStyle     1460 non-null   object 
 14  OverallQual    1460 non-null   int64  
 15  OverallCond    1460 non-null   int64  
 16  YearBuilt      1460 non-null   int64  
 17  YearRemodAdd   1460 non-null   int64  
 18  RoofStyl

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [25]:
numeric_cols = df_train.select_dtypes(include=numpy.number).columns.tolist()
numeric_df = df_train[numeric_cols]
numeric_df.info()
numeric_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 36 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   LotArea        1460 non-null   int64  
 2   OverallQual    1460 non-null   int64  
 3   OverallCond    1460 non-null   int64  
 4   YearBuilt      1460 non-null   int64  
 5   YearRemodAdd   1460 non-null   int64  
 6   MasVnrArea     1460 non-null   float64
 7   BsmtFinSF1     1460 non-null   int64  
 8   BsmtFinSF2     1460 non-null   int64  
 9   BsmtUnfSF      1460 non-null   int64  
 10  TotalBsmtSF    1460 non-null   int64  
 11  1stFlrSF       1460 non-null   int64  
 12  2ndFlrSF       1460 non-null   int64  
 13  LowQualFinSF   1460 non-null   int64  
 14  GrLivArea      1460 non-null   int64  
 15  BsmtFullBath   1460 non-null   int64  
 16  BsmtHalfBath   1460 non-null   int64  
 17  FullBath       1460 non-null   int64  
 18  HalfBath

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,60,8450,7,5,2003,2003,196.0,706,0,150,...,0,61,0,0,0,0,0,2,2008,208500
1,20,9600,6,8,1976,1976,0.0,978,0,284,...,298,0,0,0,0,0,0,5,2007,181500
2,60,11250,7,5,2001,2002,162.0,486,0,434,...,0,42,0,0,0,0,0,9,2008,223500
3,70,9550,7,5,1915,1970,0.0,216,0,540,...,0,35,272,0,0,0,0,2,2006,140000
4,60,14260,8,5,2000,2000,350.0,655,0,490,...,192,84,0,0,0,0,0,12,2008,250000


In [26]:
non_numeric_cols = df_train.select_dtypes(include=object).columns.tolist()

non_numeric_df = df_train[non_numeric_cols]
for col in non_numeric_cols:
    di = {k: int(v) for v, k in enumerate(non_numeric_df[col].unique())}
    non_numeric_df[col].replace(di, inplace=True)

non_numeric_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,1,1,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,2,0,2,0,0,...,0,0,0,1,1,0,0,0,0,1
4,0,0,1,0,0,1,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0,0,0,0,0,0,0,17,0,0,...,0,1,0,0,0,0,0,0,0,0
1456,0,0,0,0,0,0,0,6,0,0,...,0,1,1,0,1,0,0,0,0,0
1457,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1458,0,0,0,0,0,0,0,11,0,0,...,2,0,0,0,1,0,0,0,0,0


In [27]:
mutual_information = {k: None for k in numeric_cols}
total_mi = {k: None for k in numeric_cols}
for col in numeric_cols:
    mutual_information[col] = sklearn.feature_selection.mutual_info_regression(
        numeric_df.drop(col, axis=1), numeric_df[col]
    )
    total_mi[col] = mutual_information[col].sum()
total_mi

{'MSSubClass': 7.065769438303827,
 'LotArea': 3.6451981600179417,
 'OverallQual': 4.810864978108161,
 'OverallCond': 1.8659070668481248,
 'YearBuilt': 10.80725102944873,
 'YearRemodAdd': 7.488494020761755,
 'MasVnrArea': 2.2862790487122404,
 'BsmtFinSF1': 3.117217759324407,
 'BsmtFinSF2': 0.6084372484121658,
 'BsmtUnfSF': 4.983673554969647,
 'TotalBsmtSF': 10.174486608008433,
 '1stFlrSF': 10.177106830810759,
 '2ndFlrSF': 4.865561969147897,
 'LowQualFinSF': 0.3650814589479694,
 'GrLivArea': 10.203384478769673,
 'BsmtFullBath': 1.2053183957869722,
 'BsmtHalfBath': 0.22735527589496396,
 'FullBath': 3.2256069832600534,
 'HalfBath': 1.6491331267528904,
 'BedroomAbvGr': 2.180210006981585,
 'KitchenAbvGr': 0.6255839662116691,
 'TotRmsAbvGrd': 3.7898962685748097,
 'Fireplaces': 1.7999185497864878,
 'GarageYrBlt': 9.087095179752342,
 'GarageCars': 4.255865404127897,
 'GarageArea': 5.896126024092476,
 'WoodDeckSF': 1.727124525247202,
 'OpenPorchSF': 2.4188738318303216,
 'EnclosedPorch': 0.768180

In [28]:
# deletion of features with low mutual information
k = 10
while len(total_mi) > k:
    min_col = min(total_mi, key=total_mi.get)
    for col in range(len(mutual_information[min_col])):
        if numeric_cols[col] in total_mi:
            total_mi[numeric_cols[col]] -= mutual_information[min_col][col]
    del total_mi[min_col]

In [29]:
total_mi

{'MSSubClass': 4.016799257329102,
 'YearBuilt': 8.149637621145056,
 'YearRemodAdd': 5.924734453573173,
 'TotalBsmtSF': 7.328607990302251,
 '1stFlrSF': 8.51518013140015,
 '2ndFlrSF': 3.5618141832632206,
 'GrLivArea': 7.5348829947913645,
 'GarageYrBlt': 7.108502206371935,
 'GarageArea': 4.955305975993824,
 'SalePrice': 5.586748002840008}

In [30]:
numeric_df['SalePrice']

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [31]:
train_data, test_data, train_target, test_target = sklearn.model_selection.train_test_split(numeric_df.drop('SalePrice', axis=1), numeric_df['SalePrice'])

# model = sklearn.neural_network.MLPRegressor((250,250, ), max_iter=100000, verbose=True)
# model.fit(numeric_df.drop('SalePrice', axis=1), numeric_df['SalePrice'])
# mse = sklearn.metrics.mean_squared_error(numeric_df['SalePrice'], model.predict(numeric_df.drop('SalePrice', axis=1)))
# mse

model = sklearn.neural_network.MLPRegressor((250,250, ), max_iter=100000, verbose=True)
model.fit(train_data, train_target)
mse = sklearn.metrics.mean_squared_error(test_target, model.predict(test_data))

mse

Iteration 1, loss = 18463623002.24776840
Iteration 2, loss = 16953261764.27986145
Iteration 3, loss = 15198680955.20147705
Iteration 4, loss = 13015628048.43874550
Iteration 5, loss = 10373586137.65361595
Iteration 6, loss = 7672148561.58251572
Iteration 7, loss = 5353327132.97102642
Iteration 8, loss = 4525376472.16787815
Iteration 9, loss = 4384428455.42614555
Iteration 10, loss = 4050067370.33296680
Iteration 11, loss = 3527572624.15244341
Iteration 12, loss = 3155083451.14519024
Iteration 13, loss = 2812086782.12595797
Iteration 14, loss = 2632256187.09586906
Iteration 15, loss = 2359537455.64073277
Iteration 16, loss = 2194301171.16504526
Iteration 17, loss = 2065129322.11086249
Iteration 18, loss = 1989391305.45517302
Iteration 19, loss = 1884627113.28201771
Iteration 20, loss = 1815397035.32697964
Iteration 21, loss = 1744914864.53201222
Iteration 22, loss = 1705982641.84997344
Iteration 23, loss = 1641559098.71101332
Iteration 24, loss = 1600907521.65887213
Iteration 25, loss =

ValueError: y_true and y_pred contain different number of classes 248, 2. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [ 34900  39300  52000  60000  67000  72500  75500  79000  80000  80500
  81000  82000  82500  84500  85000  85400  85500  87000  88000  90350
  91000  93500  95000  96500  99900 100000 103600 106500 107000 107900
 108000 109000 109500 109900 110000 112000 112500 114504 115000 117000
 117500 118000 118500 118964 119000 119500 120000 120500 121600 122000
 122500 122900 124000 124500 125000 125500 127000 127500 129000 129500
 129900 130000 131400 131500 132000 133000 133500 133900 134000 135000
 136500 137000 137500 139000 139400 139600 140000 141000 142500 143000
 143500 144152 145000 145500 146000 146800 147000 148000 148800 149000
 149300 149900 150000 150500 151000 152000 153337 153500 154000 155000
 155835 156500 157000 157500 158000 159500 160000 162000 163000 163500
 165000 167900 168000 168500 169000 169900 170000 171000 171500 173000
 173733 174000 174900 176000 177000 177500 178900 179200 179900 180000
 181000 181900 182900 184000 184100 185000 185900 187100 187500 188700
 189000 190000 191000 192500 193000 193500 195000 196000 196500 197000
 197500 198900 200000 200500 201000 202500 204000 204900 205000 205950
 206000 207500 210000 212000 213000 213250 213490 215000 216500 217000
 220000 224000 224500 224900 225000 226000 227875 228000 228500 228950
 230000 231500 232000 232600 233230 234000 235000 236000 239000 239799
 241500 242000 246578 248000 248328 248900 250000 251000 254000 255900
 256000 258000 260000 262000 262500 263000 266000 266500 270000 271000
 271900 275000 280000 281000 287000 294000 295000 295493 297000 301000
 301500 302000 303477 305000 306000 310000 311500 315000 315500 316600
 318000 320000 325000 326000 333168 335000 336000 340000 345000 348000
 380000 385000 426000 475000 556581 582933 611657 745000]