# Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from feature_selector import FeatureSelector

# Data

In [2]:
# Load the cleaned training set. The first CSV column becomes the index and
# the file is decoded as Big5.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the repository so the notebook runs on other machines.
file_path = r'C:\Users\user\Desktop\git_hub\finished\ml\kaggle房價預測\data\train_cleaning.csv'
data = pd.read_csv(
    file_path,
    index_col=0,
    encoding="Big5",
    low_memory=False,
)

# Feature Engineering

In [3]:
# --- Aggregated area columns -------------------------------------------------
# Derived columns are appended in the same order as before, and every sum is
# still evaluated left to right, so values and column layout are unchanged.
data["TotalHouse"] = data["TotalBsmtSF"] + data["1stFlrSF"] + data["2ndFlrSF"]
# TotalArea is TotalHouse plus the garage area.
data["TotalArea"] = data["TotalHouse"] + data["GarageArea"]

# --- Pairwise combinations of existing columns --------------------------------
# Most are products; "-_TotalHouse_LotArea" and "-_Functional_OverallQual" are
# sums.
# NOTE(review): the "+_" / "-_" name prefixes are not explained in this
# notebook; confirm their intended meaning.
data["+_TotalHouse_OverallQual"] = data["TotalHouse"] * data["OverallQual"]
data["-_Functional_TotalHouse"] = data["Functional"] * data["TotalHouse"]
data["-_TotalHouse_LotArea"] = data["TotalHouse"] + data["LotArea"]
data["+_GrLivArea_OverallQual"] = data["GrLivArea"] * data["OverallQual"]
data["+_BsmtFinSF1_OverallQual"] = data["BsmtFinSF1"] * data["OverallQual"]
data["-_Functional_OverallQual"] = data["Functional"] + data["OverallQual"]
data["-_LotArea_OverallQual"] = data["LotArea"] * data["OverallQual"]

# --- Simple totals ------------------------------------------------------------
data["Bsmt"] = data["BsmtFinSF1"] + data["BsmtFinSF2"]
data["Rooms"] = data["FullBath"] + data["TotRmsAbvGrd"]

# TotalPlace is TotalArea followed by the four porch columns. It is created
# before PorchArea so the column order stays as it was.
data["TotalPlace"] = (
    data["TotalArea"]
    + data["OpenPorchSF"]
    + data["EnclosedPorch"]
    + data["3SsnPorch"]
    + data["ScreenPorch"]
)
data["PorchArea"] = (
    data["OpenPorchSF"]
    + data["EnclosedPorch"]
    + data["3SsnPorch"]
    + data["ScreenPorch"]
)

# One-hot Encoding

In [4]:
# One-hot encode the frame with pandas' get_dummies. drop_first=True omits the
# first level of each encoded column.
data = pd.get_dummies(data=data, drop_first=True)

# Feature Selection

使用ExtraTreesRegressor將重要性等於0的特徵刪除。

In [5]:
# Separate the target from the predictors.
y = data["SalePrice"]
X = data.drop(columns="SalePrice")

# Hold out 33% of the rows; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

# Fit the extra-trees regressor on the training part only; its feature
# importances are read in the following cells.
reg = ExtraTreesRegressor(random_state=111).fit(X_train, y_train)



In [6]:
# Pair every training column with its fitted importance, highest first.
# sorted() is stable, so tied importances keep their original column order.
ls_FeaImpo = sorted(
    zip(X_train.columns, reg.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
df_FeaImpo = pd.DataFrame(ls_FeaImpo, columns=['feature', 'importance'])

# Show the ranking as the cell output.
df_FeaImpo

Unnamed: 0,feature,importance
0,+_TotalHouse_OverallQual,2.425728e-01
1,OverallQual,1.348722e-01
2,-_Functional_TotalHouse,1.200648e-01
3,TotalArea,1.052590e-01
4,-_Functional_OverallQual,4.957972e-02
5,TotalPlace,4.492343e-02
6,+_GrLivArea_OverallQual,3.492291e-02
7,FullBath,3.244049e-02
8,KitchenQual,2.493280e-02
9,BsmtQual,1.641674e-02


In [7]:
# Remove every column whose fitted importance is exactly zero from the data ...
data = data.drop(
    columns=df_FeaImpo[df_FeaImpo['importance'].eq(0)]['feature'].values
)
# ... and keep only the strictly positive rows in the importance table.
df_FeaImpo = df_FeaImpo[df_FeaImpo['importance'].gt(0)]

使用SelectFromModel時會將重要性小於threshold的特徵刪除。

In [8]:
# Wrap the regressor in SelectFromModel and fit it on the training split.
# No threshold is passed, so the selector's default is used.
# NOTE(review): per the scikit-learn docs the default here should be the mean
# importance; confirm for the installed version.
model = SelectFromModel(estimator=reg).fit(X_train, y_train)

# Display the threshold the selector settled on.
model.threshold_

0.004524886877828055

# Collinear Identification

將具高度共線性且特徵重要性遠小於SelectFromModel的threshold的特徵刪除。

In [9]:
# Rebuild predictors and target from the pruned frame, then repeat the split
# with the same test fraction and seed as before.
y = data["SalePrice"]
X = data.drop(columns=["SalePrice"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

In [10]:
# Build a FeatureSelector on the training split only (features + target).
fs = FeatureSelector(data = X_train, labels = y_train)
# Flag features whose pairwise correlation magnitude exceeds 0.98.
fs.identify_collinear(correlation_threshold = 0.98)
# Table of the flagged pairs; the next cell reads its 'drop_feature' and
# 'corr_feature' columns.
df_collinear = fs.record_collinear

17 features with a correlation magnitude greater than 0.98.



In [11]:
# Importance cutoff for discarding a collinear feature. It sits just under the
# SelectFromModel threshold displayed earlier in this notebook (~0.0045).
IMPORTANCE_CUTOFF = 0.004

# Every feature that appears on either side of a highly correlated pair.
ls_DrpFea = np.unique(np.hstack((df_collinear['drop_feature'].unique(),
                                 df_collinear['corr_feature'].unique())))

# Build one feature -> importance lookup instead of re-filtering df_FeaImpo for
# every candidate. drop_duplicates keeps the first row per feature, matching
# the previous `impo[0]` behaviour.
_impo_rows = df_FeaImpo.drop_duplicates(subset='feature')
importance_by_feature = dict(zip(_impo_rows['feature'], _impo_rows['importance']))

# Drop a collinear feature only when its importance is known and below the
# cutoff. A feature with no importance record is kept; previously this case
# raised an opaque IndexError.
ls_drop = [
    feature for feature in ls_DrpFea
    if importance_by_feature.get(feature, np.inf) < IMPORTANCE_CUTOFF
]

# errors="ignore" makes the cell safe to re-run: on a second execution the
# columns are already gone, which used to raise KeyError.
data = data.drop(columns=ls_drop, errors="ignore")

# Save Data

In [15]:
# Persist the processed training frame, index included, as CSV.
# NOTE(review): the input was read as Big5 but no encoding is passed here, so
# pandas' default is used; confirm downstream readers expect that.
# NOTE(review): absolute, machine-specific path; consider a path relative to
# the repository.
file_path = r'C:\Users\user\Desktop\git_hub\finished\ml\kaggle房價預測\data\train_processing.csv'
data.to_csv(path_or_buf=file_path)