In [1]:
# Ames house prices predition
# competition link: https://www.kaggle.com/c/house-prices-advanced-regression-techniques

import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [2]:
# reading data

train = pd.read_csv('data/train.csv')
train.set_index('Id',  inplace=True)

test = pd.read_csv('data/test.csv')
test.set_index('Id',  inplace=True)

In [3]:
# combining train and test

ntrain = train.shape[0]
ntest = test.shape[0]
y_train = train.SalePrice.values
data = pd.concat((train, test), sort=False).reset_index(drop=True)
data.drop(['SalePrice'], axis=1, inplace=True)

In [4]:
# setting categoric NA to None or mode 

data["PoolQC"] = data["PoolQC"].fillna("None")
data["MiscFeature"] = data["MiscFeature"].fillna("None")
data["Alley"] = data["Alley"].fillna("None")
data["Fence"] = data["Fence"].fillna("None")
data["FireplaceQu"] = data["FireplaceQu"].fillna("None")
data["MasVnrType"] = data["MasVnrType"].fillna("None")
data['MSSubClass'] = data['MSSubClass'].fillna("None")
data["Functional"] = data["Functional"].fillna("Typ")
data['Electrical'] = data['Electrical'].fillna(data['Electrical'].mode()[0])
data['KitchenQual'] = data['KitchenQual'].fillna(data['KitchenQual'].mode()[0])
data['Exterior1st'] = data['Exterior1st'].fillna(data['Exterior1st'].mode()[0])
data['Exterior2nd'] = data['Exterior2nd'].fillna(data['Exterior2nd'].mode()[0])
data['SaleType'] = data['SaleType'].fillna(data['SaleType'].mode()[0])
data['MSSubClass'] = data['MSSubClass'].fillna("None")
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    data[col] = data[col].fillna('None')
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    data[col] = data[col].fillna('None')
data["MSZoning"] = data.groupby("Neighborhood")["MSZoning"].transform(
    lambda x: x.fillna(x.mode()[0]))

# setting numeric NA to 0, mode or median

for col in ('GarageArea', 'GarageCars'):
    data[col] = data[col].fillna(0)
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    data[col] = data[col].fillna(0)
data["LotFrontage"] = data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))
data["GarageYrBlt"] = data.groupby("Neighborhood")["GarageYrBlt"].transform(
    lambda x: x.fillna(x.mode()[0]))
data["MasVnrArea"] = data["MasVnrArea"].fillna(0)

# dropping feature with only 3 entries different from mode

data = data.drop(['Utilities'], axis=1)

# dropping outliers
train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<300000)].index)

# dealing with inconsistencies
s = (data['PoolArea']>0) & (data['PoolQC'] == 'None')
for i in (enumerate(s)):
    data.at[i, 'PoolQC'] = 'Fa'
    
# transforming numeric that are actually categoric to categoric

data['MSSubClass'] = data['MSSubClass'].apply(str)

data['OverallCond'] = data['OverallCond'].astype(str)

data['YrSold'] = data['YrSold'].astype(str)

data['MoSold'] = data['MoSold'].astype(str)

In [5]:
# assigning cleaned data back to train and test

pd.options.mode.chained_assignment = None  # default='warn'

train = data[:ntrain]
test = data[ntrain:]

In [9]:
# encoding categoric features to apply  

dummiestrain = pd.get_dummies(train)
#print(train.shape)
#print(dummiestrain.shape)

In [64]:
# RFE

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

X = dummiestrain
Y = y_train

model = LogisticRegression(solver="saga", multi_class="auto")
rfe = RFE(model, 50)
fit = rfe.fit(X, Y)

print("Num Features: ", fit.n_features_)
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

















Num Features:  50
Selected Features:  [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True False False  True False  True  True  True False  True  True
  True  True  True  True  True  True  True  True False False False False
 False False False False False False False False False False False False
 False False False False False  True False False False False False False
  True False False False  True False False False False False  True False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False  True False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False  True False False False False False
  True False False False False False False False False False False False
 False False 



In [67]:
fit.ranking_


array([  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,  23,  79,   1,  25,   1,   1,   1,  12,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1, 104, 108, 217, 168,  30, 101, 241,
       202,  64,  42, 116, 199, 115, 182, 126, 215, 111, 194,  14,  41,
       233,   1, 132,   5, 148,  29, 153, 219,   1, 112, 147, 161,   1,
        49,  94, 143, 245,   3,   1, 113, 224, 197, 263, 192, 128, 171,
        68, 135,  75,  92, 154, 196, 139,  40, 209,  95, 133, 110,  74,
       178,  93, 119, 103, 180, 162, 208, 123, 105,  11, 218, 189, 212,
       179, 260, 234, 259, 230,   1, 269, 257, 280, 282, 256,  13, 165,
       125, 146,  87,  63, 200,  20, 225, 211,  32, 150, 109, 267, 235,
       175, 118,  21,  46,  58, 114, 186, 207,   1, 210,  54, 228, 265,
       285,   1, 268, 279, 278, 213, 239, 231, 185, 272, 266, 142, 275,
       127,  51, 277,  55,  85, 261, 174,  33,  52, 169, 184, 252, 222,
       183, 271, 129,  59, 216,  53, 274,  71, 238, 172,  34,  5

In [72]:
# PCA

from sklearn.decomposition import PCA

X = dummiestrain
Y = y_train

pca = PCA(svd_solver="auto")
fit = pca.fit(X)

train_pca = pca.transform(dummiestrain)
print(pca.components_)

[[ 7.53144461e-04  9.99702571e-01  1.48025849e-05 ... -1.25385250e-07
   2.10474609e-07  6.45388168e-07]
 [ 1.17175439e-02 -2.27599725e-02  1.31977945e-03 ...  1.64127719e-06
  -8.81696093e-05  1.05738547e-04]
 [ 2.15522043e-03 -5.54788940e-03 -1.69434137e-04 ...  4.23901649e-06
  -2.37835021e-05  1.24768523e-05]
 ...
 [-0.00000000e+00  4.04987628e-21  5.37764280e-17 ...  9.32503857e-03
   9.32503857e-03  9.32503857e-03]
 [ 0.00000000e+00 -1.95876369e-21 -1.38777879e-17 ... -9.15557315e-04
  -9.15557315e-04 -9.15557315e-04]
 [-0.00000000e+00 -8.32964701e-33 -1.89326621e-29 ... -4.95865355e-15
  -4.95865355e-15 -4.95865355e-15]]


In [74]:
from sklearn.ensemble import ExtraTreesClassifier

X = dummiestrain
Y = y_train

model = ExtraTreesClassifier(n_estimators=100)
model.fit(X, Y)
print(model.feature_importances_)

[1.71708137e-02 1.74350398e-02 1.32860587e-02 1.62388598e-02
 1.67537795e-02 1.12692159e-02 1.60235111e-02 4.24451750e-03
 1.79060549e-02 1.73595755e-02 1.81392935e-02 1.12610527e-02
 7.02325756e-04 1.88596550e-02 1.00304176e-02 2.74334472e-03
 6.37694796e-03 7.33942960e-03 1.08055742e-02 1.19169117e-03
 1.48125844e-02 7.83873580e-03 1.70003659e-02 8.48516765e-03
 1.75849361e-02 1.39271036e-02 1.47301813e-02 4.84670589e-03
 1.02871000e-03 3.97054919e-03 1.92676424e-04 1.55711283e-03
 1.49417609e-03 1.02593792e-03 1.65771882e-04 5.95958156e-04
 6.73125989e-03 1.73655758e-03 1.34805572e-04 4.53020798e-04
 2.85470285e-03 3.58682418e-03 1.44272322e-03 4.03438404e-04
 1.63806657e-03 6.27440059e-04 9.35243751e-04 3.07978843e-04
 1.52279912e-03 5.41675051e-04 4.28561061e-03 3.41200091e-03
 1.28612766e-04 1.98682571e-04 1.21109768e-03 2.07923799e-03
 1.18255833e-03 8.61551460e-03 1.51143862e-03 4.24970366e-04
 8.56307215e-03 1.77852197e-03 1.63658864e-03 1.05933437e-03
 3.49999480e-03 6.817578