<span style="color:darkblue"><font size="5"> DeCockHousePrice Dataset: Salerice Level Classification </font></span> 

    SVC vs Logistic Regression

In [698]:
import pandas as pd
import seaborn as sns; sns.set(color_codes=True)
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
import warnings; warnings.simplefilter('ignore')
import numpy as np
np.random.seed(10)
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.pipeline import make_pipeline

In [699]:
df=pd.read_csv('data/train.csv')

# Data cleaning

In [700]:
cg_cols = ['Alley', 'PoolQC', 'MiscFeature', 'Fence', 'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType']
dc_cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath',
    'BsmtHalfBath', 'GarageYrBlt', 'GarageArea', 'GarageCars', 'MasVnrArea']
ot_cols = ['Electrical', 'Exterior1st', 'Exterior2nd', 'Functional', 'KitchenQual',
    'SaleType', 'Utilities']
for col in dc_cols:
    df[col].replace(np.nan, 0, inplace=True)

for col in cg_cols:
    df[col].replace(np.nan, 'None', inplace=True)

for col in ot_cols:
    df[col].replace(np.nan, df[col].mode()[0], inplace=True)
    
# Filling MSZoning according to MSSubClass
df['MSZoning'] = df.groupby('MSSubClass')['MSZoning'].apply(
    lambda x: x.fillna(x.mode()[0]))

# Filling LotFrontage according to Neighborhood

df['LotFrontage']=df.groupby(['Neighborhood'])['LotFrontage'].apply(lambda x: x.fillna(x.median()))

In [701]:
df.drop('Id',axis=1,inplace=True)

#binning with rare values
col_rare = ['Condition1', 'Condition2', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
    'Heating', 'Electrical', 'Functional', 'SaleType']

for col in col_rare:
    mask = df[col].isin(
        df[col].value_counts()[df[col].value_counts() < 10].index)
    df[col][mask] = 'Other'

In [702]:
# Features which numerical on data but should be treated as category.
df['MSSubClass'] = df['MSSubClass'].astype(str)

# cyclical feature transformation
from math import pi
df['MoSold']=2*pi*df['MoSold']/df['MoSold'].max()
df["cos_MoSold"] = np.cos(df["MoSold"])
df["sin_MoSold"] = np.sin(df["MoSold"])

In [703]:
neigh_map={'MeadowV': 1,'IDOTRR': 1,
    'BrDale': 1, 'BrkSide': 2,
    'OldTown': 2, 'Edwards': 2,
    'Sawyer': 3,    'Blueste': 3,
    'SWISU': 3,    'NPkVill': 3,
    'NAmes': 3,    'Mitchel': 4,
    'SawyerW': 5,    'NWAmes': 5,
    'Gilbert': 5,    'Blmngtn': 5,
    'CollgCr': 5,    'ClearCr': 6,
    'Crawfor': 6,    'Veenker': 7,
    'Somerst': 7,    'Timber': 8,
    'StoneBr': 9,    'NridgHt': 10,
    'NoRidge': 10}
    
df['Neighborhood'] = df['Neighborhood'].map(neigh_map).astype('int')

ext_map = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
df['ExterCond'] = df['ExterCond'].map(ext_map).astype('int')
df['ExterQual'] = df['ExterQual'].map(ext_map).astype('int')

bsm_map = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
df['BsmtQual'] = df['BsmtQual'].map(bsm_map).astype('int')
df['BsmtCond'] = df['BsmtCond'].map(bsm_map).astype('int')
bsmf_map = {'None': 0, 'Unf': 1,'LwQ': 2,'Rec': 3, 'BLQ': 4, 'ALQ': 5,'GLQ': 6}

df['BsmtFinType1'] = df['BsmtFinType1'].map(bsmf_map).astype('int')
df['BsmtFinType2'] = df['BsmtFinType2'].map(bsmf_map).astype('int')
heat_map = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
df['HeatingQC'] = df['HeatingQC'].map(heat_map).astype('int')
df['KitchenQual'] = df['KitchenQual'].map(heat_map).astype('int')
df['FireplaceQu'] = df['FireplaceQu'].map(bsm_map).astype('int')
df['GarageCond'] = df['GarageCond'].map(bsm_map).astype('int')
df['GarageQual'] = df['GarageQual'].map(bsm_map).astype('int')

In [704]:
# Getting dummy variables for nominal categorical features
df = pd.get_dummies(data=df)

In [705]:
df.head()

Unnamed: 0,LotFrontage,LotArea,Neighborhood,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,...,SaleType_COD,SaleType_New,SaleType_Other,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,65.0,8450,5,7,5,2003,2003,196.0,4,3,...,0,0,0,1,0,0,0,0,1,0
1,80.0,9600,7,6,8,1976,1976,0.0,3,3,...,0,0,0,1,0,0,0,0,1,0
2,68.0,11250,5,7,5,2001,2002,162.0,4,3,...,0,0,0,1,0,0,0,0,1,0
3,60.0,9550,6,7,5,1915,1970,0.0,3,3,...,0,0,0,1,1,0,0,0,0,0
4,84.0,14260,10,8,5,2000,2000,350.0,4,3,...,0,0,0,1,0,0,0,0,1,0


# Feature Selection 
    - Based on correlation
    
    - Based on P-value

In [706]:
corr = df.corr()

In [707]:
#Remove one of two features that have a correlation higher than 0.9
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    for j in range(i+1, corr.shape[0]):
        if corr.iloc[i,j] >= 0.9:
            if columns[j]:
                columns[j] = False
selected_columns = df.columns[columns]

In [708]:
df = df[selected_columns]

selected_columns

selected_columns=selected_columns.tolist()

selected_columns.remove('SalePrice')

selected_columns=np.array(selected_columns)

In [612]:
import statsmodels.regression.linear_model as sm

In [709]:
X=df.drop('SalePrice',axis=1).values
y=df.loc[:,'SalePrice'].values

In [710]:
numVars = df.shape[1]-1
for i in range(0, numVars):
    regressor_OLS = sm.OLS(y, X).fit()
    maxVar = max(regressor_OLS.pvalues).astype(float)
    if maxVar > 0.05:
        for j in range(0, numVars - i):
            if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                X = np.delete(X, j, 1)
                selected_columns = np.delete(selected_columns, j)

In [711]:
selected_columns.size

119

In [712]:
#Categorize SalePrice into binary variable
df["SalePrice"].median()
df.loc[df['SalePrice'] <= 163000, 'SalePrice'] = 0
df.loc[df['SalePrice'] > 163000, 'SalePrice'] = 1

# Data Scaling

In [713]:
y=df.loc[:,'SalePrice'].values

In [714]:
y[0:10]

array([1, 1, 1, 0, 1, 0, 1, 1, 0, 0])

In [715]:
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
X = minmax.fit_transform(X)

# SVC model training and evaluation

In [718]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [719]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2,random_state=0)

In [720]:
pca = PCA()
scv = SVC()
pipe = Pipeline(steps=[('pca', pca), ('svc', svc)])
param_grid =[{'pca__n_components': [5, 15, 30, 45, 64],'svc__C': [0.01,0.1,1, 10, 100, 1000],\
              'svc__kernel': ['linear']}, {'pca__n_components': [5, 15, 30, 45, 64],'svc__C': [0.01,0.1,1, 10, 100, 1000], \
                                           'svc__gamma': [100,10,1,0.1,0.01,0.001, 0.001], \
                                           'svc__kernel': ['rbf']},]
search = GridSearchCV(pipe, param_grid, n_jobs=-1)
search.fit(X_train_val, y_train_val)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('svc',
                                        SVC(C=0.15, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='scale',
                                            kernel='linear', max_iter...
             iid='deprecated', n_jobs=-1,
             param_grid

In [721]:
search.best_params_

{'pca__n_components': 45,
 'svc__C': 100,
 'svc__gamma': 0.01,
 'svc__kernel': 'rbf'}

In [734]:
search.best_score_

0.9075345731998092

In [735]:
search.score(X_test,y_test)

0.9178082191780822

# Logistic Regression Model Training and Evaluation

In [725]:
pca = PCA()
lr=LogisticRegression()
pipe_lr = Pipeline(steps=[('pca', pca), ('lr', lr)])
param_grid_lr ={'pca__n_components': [5, 15, 30, 45, 64],'lr__C': [0.01,0.1,1, 10, 100, 100]}
search_lr = GridSearchCV(pipe_lr, param_grid_lr, n_jobs=-1,cv=5)
search_lr.fit(X_train_val, y_train_val)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('lr',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                                           fit_intercept=True,
                                                           intercept_scaling=1,
                                                           l1_ratio=None,
                                          

In [731]:
search_lr.best_params_

{'lr__C': 100, 'pca__n_components': 30}

In [732]:
search_lr.best_score_

0.9100950075199004

In [733]:
search_lr.score(X_test,y_test)

0.9178082191780822

# Results and conclusions:

    

In [743]:
def svc_bias_variance(X,y):
    from mlxtend.evaluate import bias_variance_decomp
    pca=PCA(n_components=45)
    X=pca.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,
                random_state=123,shuffle=True,stratify=y)

    model = SVC(C=100,gamma=0.01,kernel='rbf')

    avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        model, X_train, y_train, X_test, y_test, 
        loss='0-1_loss',
        random_seed=123)

    print('Average expected loss: %.3f' % avg_expected_loss)
    print('Average bias: %.3f' % avg_bias)
    print('Average variance: %.3f' % avg_var)

In [744]:
def logistic_model_bias_variance(X,y):
    from mlxtend.evaluate import bias_variance_decomp
    pca=PCA(n_components=30)
    X=pca.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=123,
                                                    shuffle=True,
                                                    stratify=y)

    model = LogisticRegression(C=100)
    avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        model, X_train, y_train, X_test, y_test, 
        loss='0-1_loss',
        random_seed=123)

    print('Average expected loss: %.3f' % avg_expected_loss)
    print('Average bias: %.3f' % avg_bias)
    print('Average variance: %.3f' % avg_var)

In [745]:
print('Bias and Variance of SVC model:')
svc_bias_variance(X,y)
print('\n Bias and Variance of LogisticRegression model:')
logistic_model_bias_variance(X,y)

Bias and Variance of SVC model:
Average expected loss: 0.104
Average bias: 0.096
Average variance: 0.049

 Bias and Variance of LogisticRegression model:
Average expected loss: 0.110
Average bias: 0.110
Average variance: 0.039


In [746]:
print(f'Performance of SVC on unseen data: {search.score(X_test,y_test)}' )
print(f'Performance of LogisticRegression on unseen data: {search_lr.score(X_test,y_test)}' )

Performance of SVC on unseen data: 0.9178082191780822
Performance of LogisticRegression on unseen data: 0.9178082191780822


- **SVC model's performance on unseen data is  similar as LogisticResearch**
- **Bias problem still exists**

# solution
- Collect more valuable features
- Manully creat new features from existing features
- Change feature selection strategy
- Try tree model