In [89]:
import numpy
import pandas
import sklearn.feature_selection

In [90]:
df_train = pandas.read_csv('housing_dataset/train.csv')
df_test = pandas.read_csv('housing_dataset/test.csv')

len(df_train), len(df_test.columns)

(1460, 80)

In [91]:
# columns with missing values
missing_vals = df_train.columns[df_train.isnull().any()]

# number of missing values per column
nof_missing = {}
for col in missing_vals:
    nof_missing[col] = df_train[col].isnull().sum()
    print(col, df_train[col].isnull().sum())

LotFrontage 259
Alley 1369
MasVnrType 8
MasVnrArea 8
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinType2 38
Electrical 1
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageQual 81
GarageCond 81
PoolQC 1453
Fence 1179
MiscFeature 1406


In [92]:
for col in missing_vals:
    if nof_missing[col] > len(df_train) / 10:
        df_train = df_train.drop(col, axis=1)
        df_test = df_test.drop(col, axis=1)
    else:
        most_common = df_train[col].value_counts().index[0]
        df_train[col] = df_train[col].fillna(most_common)
df_train.drop('Id', axis=1, inplace=True)
df_test.drop('Id', axis=1, inplace=True)
len(df_train.columns)

74

In [93]:
df_train.info()
df_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotArea        1460 non-null   int64  
 3   Street         1460 non-null   object 
 4   LotShape       1460 non-null   object 
 5   LandContour    1460 non-null   object 
 6   Utilities      1460 non-null   object 
 7   LotConfig      1460 non-null   object 
 8   LandSlope      1460 non-null   object 
 9   Neighborhood   1460 non-null   object 
 10  Condition1     1460 non-null   object 
 11  Condition2     1460 non-null   object 
 12  BldgType       1460 non-null   object 
 13  HouseStyle     1460 non-null   object 
 14  OverallQual    1460 non-null   int64  
 15  OverallCond    1460 non-null   int64  
 16  YearBuilt      1460 non-null   int64  
 17  YearRemodAdd   1460 non-null   int64  
 18  RoofStyl

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [94]:
numeric_cols = df_train.select_dtypes(include=numpy.number).columns.tolist()
numeric_df = df_train[numeric_cols]
numeric_df.info()
numeric_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 36 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   LotArea        1460 non-null   int64  
 2   OverallQual    1460 non-null   int64  
 3   OverallCond    1460 non-null   int64  
 4   YearBuilt      1460 non-null   int64  
 5   YearRemodAdd   1460 non-null   int64  
 6   MasVnrArea     1460 non-null   float64
 7   BsmtFinSF1     1460 non-null   int64  
 8   BsmtFinSF2     1460 non-null   int64  
 9   BsmtUnfSF      1460 non-null   int64  
 10  TotalBsmtSF    1460 non-null   int64  
 11  1stFlrSF       1460 non-null   int64  
 12  2ndFlrSF       1460 non-null   int64  
 13  LowQualFinSF   1460 non-null   int64  
 14  GrLivArea      1460 non-null   int64  
 15  BsmtFullBath   1460 non-null   int64  
 16  BsmtHalfBath   1460 non-null   int64  
 17  FullBath       1460 non-null   int64  
 18  HalfBath

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,60,8450,7,5,2003,2003,196.0,706,0,150,...,0,61,0,0,0,0,0,2,2008,208500
1,20,9600,6,8,1976,1976,0.0,978,0,284,...,298,0,0,0,0,0,0,5,2007,181500
2,60,11250,7,5,2001,2002,162.0,486,0,434,...,0,42,0,0,0,0,0,9,2008,223500
3,70,9550,7,5,1915,1970,0.0,216,0,540,...,0,35,272,0,0,0,0,2,2006,140000
4,60,14260,8,5,2000,2000,350.0,655,0,490,...,192,84,0,0,0,0,0,12,2008,250000


In [8]:
non_numeric_cols = df_train.select_dtypes(include=object).columns.tolist()

non_numeric_df = df_train[non_numeric_cols]
for col in non_numeric_cols:
    di = {k: int(v) for v, k in enumerate(non_numeric_df[col].unique())}
    non_numeric_df[col].replace(di, inplace=True)

non_numeric_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_numeric_df[col].replace(di, inplace=True)


Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,1,1,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,2,0,2,0,0,...,0,0,0,1,1,0,0,0,0,1
4,0,0,1,0,0,1,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0,0,0,0,0,0,0,17,0,0,...,0,1,0,0,0,0,0,0,0,0
1456,0,0,0,0,0,0,0,6,0,0,...,0,1,1,0,1,0,0,0,0,0
1457,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1458,0,0,0,0,0,0,0,11,0,0,...,2,0,0,0,1,0,0,0,0,0


In [95]:
mutual_information = {k: None for k in numeric_cols}
total_mi = {k: None for k in numeric_cols}
for col in numeric_cols:
    mutual_information[col] = sklearn.feature_selection.mutual_info_regression(
        numeric_df.drop(col, axis=1), numeric_df[col]
    )
    total_mi[col] = mutual_information[col].sum()
total_mi

{'MSSubClass': 6.901135302371131,
 'LotArea': 3.676813721124195,
 'OverallQual': 4.754246482747272,
 'OverallCond': 1.8734801198971378,
 'YearBuilt': 10.874505650824965,
 'YearRemodAdd': 7.4129283108643085,
 'MasVnrArea': 2.2703777620582293,
 'BsmtFinSF1': 3.265464212948933,
 'BsmtFinSF2': 0.43227995156160715,
 'BsmtUnfSF': 5.024558822450808,
 'TotalBsmtSF': 10.207521926520165,
 '1stFlrSF': 10.136011682894608,
 '2ndFlrSF': 4.787831410201152,
 'LowQualFinSF': 0.2773677182955634,
 'GrLivArea': 10.224177154561414,
 'BsmtFullBath': 1.2875366677519455,
 'BsmtHalfBath': 0.24761093154199987,
 'FullBath': 3.330885877307073,
 'HalfBath': 1.6907885632602286,
 'BedroomAbvGr': 2.3363650779053162,
 'KitchenAbvGr': 0.6688728178998478,
 'TotRmsAbvGrd': 3.720808335683245,
 'Fireplaces': 1.6691916370534834,
 'GarageYrBlt': 9.039078566485287,
 'GarageCars': 4.217204103038759,
 'GarageArea': 5.807185625982359,
 'WoodDeckSF': 1.6678845878965793,
 'OpenPorchSF': 2.5279288089844405,
 'EnclosedPorch': 0.7839

In [99]:
# deletion of features with low mutual information
k = 10
while len(total_mi) > k:
    min_col = min(total_mi, key=total_mi.get)
    for col in range(len(mutual_information[min_col])):
        if numeric_cols[col] in total_mi:
            total_mi[numeric_cols[col]] -= mutual_information[min_col][col]
    del total_mi[min_col]

In [98]:
total_mi

{'MSSubClass': 3.784644450313451,
 'YearBuilt': 8.192423381032617,
 'YearRemodAdd': 5.892698830638087,
 'TotalBsmtSF': 7.367785948809152,
 '1stFlrSF': 8.515567317692506,
 '2ndFlrSF': 3.4863577908477037,
 'GrLivArea': 7.692332135624679,
 'GarageYrBlt': 7.128653021070704,
 'GarageArea': 4.769000640958042,
 'SalePrice': 5.567961504690279}