In [349]:
#!pip3 install statsmodels

Let's see how Logistic Regression acts with 5 techniques:
1. Standardization of Numerical Variables
2. Encoding of Categorical Variables
3. Data Imbalance
4. Collinearity
5. Missing Values

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn_pandas import DataFrameMapper
from snape.make_dataset import make_dataset
from statsmodels.stats.outliers_influence import variance_inflation_factor

Check out snape [here](https://github.com/mbernico/snape)

In [71]:
def get_data(categorical_features=True,
             balanced=True,
             correlated_features=False,
             missing_values=False,
             dataset_size=12000,
             random_seed=42):
    """Generate a synthetic binary-classification dataset with snape and load its train split.

    A snape configuration with 8 features is assembled from the flags below
    and passed to ``make_dataset``; ``my_dataset_train.csv`` is then read back
    from the current working directory.

    Parameters
    ----------
    categorical_features : bool
        If True, request 4 categorical features, each with between 2 and 9
        string levels named ``str_0``, ``str_1``, ...
    balanced : bool
        If True the class weights are ``[0.5, 0.5]``, otherwise ``[0.9, 0.1]``.
    correlated_features : bool
        If True, 1 of the 8 features is requested as redundant and 1 as
        repeated (leaving 6 informative); otherwise all 8 are informative.
    missing_values : bool
        If True ``pct_missing`` is set to 0.70, otherwise to 0.00.
    dataset_size : int
        ``n_samples`` requested from snape. Only the train CSV is loaded, so
        the returned frame is smaller than this (the notebook shows 96,000
        rows for ``dataset_size=120000``).
    random_seed : int
        Seed forwarded to snape and also used to draw the number of levels of
        each categorical feature, so repeated calls build the same config.

    Returns
    -------
    tuple
        ``(df, label, categorical_features, numerical_features)`` where ``df``
        is the loaded train frame, ``label`` is ``'y'`` and the two lists hold
        the non-numeric and numeric feature column names respectively.
    """
    n_features = 8
    output_name = "my_dataset"

    # A local seeded generator (instead of the global np.random state) keeps
    # the category cardinalities reproducible across kernel restarts.
    rng = np.random.RandomState(random_seed)

    n_categorical = 4 if categorical_features else 0
    label_list = []
    for _ in range(n_categorical):
        num_classes = rng.randint(2, 10)  # upper bound is exclusive: 2..9 levels
        label_list.append([f'str_{level}' for level in range(num_classes)])

    n_redundant = 1 if correlated_features else 0
    n_repeated = 1 if correlated_features else 0
    n_informative = n_features - n_redundant - n_repeated

    conf = {
        "type": "classification",
        "n_classes": 2,
        "n_samples": dataset_size,
        "n_features": n_features,
        "out_path": "./",
        "output": output_name,
        "n_informative": n_informative,
        "n_repeated": n_repeated,
        "n_redundant": n_redundant,
        "n_clusters": 2,
        "weights": [0.5, 0.5] if balanced else [0.9, 0.1],
        "pct_missing": 0.70 if missing_values else 0.00,
        "n_categorical": n_categorical,
        "random_seed": random_seed,
        "label_list": label_list
    }

    make_dataset(config=conf)
    df = pd.read_csv(f'{output_name}_train.csv')

    label = 'y'
    feature_columns = [col for col in df.columns if col != label]
    # Any non-numeric column is treated as categorical; this also covers
    # pandas' dedicated string dtype, which does not compare equal to object.
    categorical_columns = [col for col in feature_columns
                           if not pd.api.types.is_numeric_dtype(df[col])]
    numerical_columns = [col for col in feature_columns
                         if col not in categorical_columns]

    return df, label, categorical_columns, numerical_columns

def evaluation(pipeline, X, y):
    """Score a fitted classifier on ``X`` against the true labels ``y``.

    The second column of ``pipeline.predict_proba(X)`` is used as the score
    for both metrics.

    Returns a dict with ``'auc'`` (ROC AUC) and ``'pr-auc'`` (average precision).
    """
    positive_scores = pipeline.predict_proba(X)[:, 1]
    metrics = {}
    metrics['auc'] = roc_auc_score(y, positive_scores)
    metrics['pr-auc'] = average_precision_score(y, positive_scores)
    return metrics

# Logistic Regression

## 1.1 Standardization

### Without Standardization

In [108]:
# Numeric-only dataset, ten times the default size, so that differences in
# solver behaviour are easier to observe.
df, label, categorical_features, numerical_features = get_data(
    dataset_size=120000,
    categorical_features=False,
)
df.loc[:, numerical_features].describe()

--------------------------------------------------------------------------------
Creating Classification Dataset...
Writing Train/Test Datasets


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7
count,96000.0,96000.0,96000.0,96000.0,96000.0,96000.0,96000.0,96000.0
mean,-1.645228,2.244024,1.045137,2.714882,-0.02022,-0.004654,-1.536356,-4.197611
std,6.209578,8.602877,1.733465,9.096954,11.988777,4.71914,2.810519,15.886376
min,-35.268886,-34.012532,-7.229214,-35.163348,-49.768093,-20.945358,-13.822828,-78.72585
25%,-5.758226,-3.452518,-0.106077,-3.56123,-8.276808,-3.241806,-3.394866,-15.068164
50%,-1.709702,2.393561,1.051668,2.78948,0.106544,-0.308207,-1.546129,-3.386493
75%,2.515567,8.106976,2.192445,9.022244,8.357289,3.115309,0.302129,7.250865
max,25.206965,42.07213,9.851492,43.896856,43.189071,21.426677,13.305723,62.477698


In [111]:
# Ordered split (no shuffling): the last 10% of rows become the test set.
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.1)

feature_columns = categorical_features + numerical_features
X_train = train_df[feature_columns]
y_train = train_df[label]
X_test = test_df[feature_columns]
y_test = test_df[label]

In [114]:
# Baseline: logistic regression fitted directly on the raw numerical columns.
clf = LogisticRegression()
pipeline = Pipeline(steps=[('clf', clf)], verbose=True)

X_train_num = X_train[numerical_features]
X_test_num = X_test[numerical_features]

pipeline.fit(X_train_num, y_train)
evaluation(pipeline, X_test_num, y_test)

[Pipeline] ............... (step 1 of 1) Processing clf, total=   0.3s


{'auc': 0.8148784308322949, 'pr-auc': 0.818032430163559}

### With Standardization

In [115]:
# Same model, but every numerical column is standardised first.
num = [([name], [StandardScaler()]) for name in numerical_features]
mapper = DataFrameMapper(num, df_out=True)

clf = LogisticRegression()
pipeline = Pipeline(
    steps=[
        ('preprocess', mapper),
        ('clf', clf),
    ],
    verbose=True,
)

X_train_num = X_train[numerical_features]
X_test_num = X_test[numerical_features]

pipeline.fit(X_train_num, y_train)
evaluation(pipeline, X_test_num, y_test)

[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   0.1s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.1s


{'auc': 0.8148798631692816, 'pr-auc': 0.8180303186841142}

**Result**
- No need to scale for logistic regression accuracy. But convergence is faster. [More info here](https://stats.stackexchange.com/questions/48360/is-standardization-needed-before-fitting-logistic-regression#:~:text=3%20Answers&text=Standardization%20isn't%20required%20for,the%20technique%20used%20for%20optimization.&text=Otherwise%2C%20you%20can%20run%20your,standardization%20treatment%20on%20the%20features)

## 1.2 Encoding

We need numeric encoding for logistic regression.

In [3]:
# Default configuration of get_data: categorical features included, balanced
# classes, no correlated features, no missing values.
df, label, categorical_features, numerical_features = get_data()

--------------------------------------------------------------------------------
Creating Classification Dataset...
Creating Categorical Features...
Writing Train/Test Datasets


### One Hot Encoding

In [10]:
# Ordered split (no shuffling): the last 10% of rows become the test set.
train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)
X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]
X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]

num = [([n], [SimpleImputer()]) for n in numerical_features]
# The encoders are fitted on the training rows only. handle_unknown='ignore'
# encodes a level that appears only in the hold-out as all zeros instead of
# raising at transform time.
cat = [([c], [OneHotEncoder(handle_unknown='ignore')]) for c in categorical_features]
mapper = DataFrameMapper(cat + num, df_out=True)

# One-hot encoding widens the design matrix, so the solver gets more iterations.
clf = LogisticRegression(max_iter=1000)
pipeline = Pipeline([
    ('preprocess', mapper),
    ('clf', clf)
], verbose=True)

pipeline.fit(X_train, y_train)
evaluation(pipeline, X_test, y_test)

[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   0.1s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.9s


{'auc': 0.8304397645792462, 'pr-auc': 0.80297861579569}

In [11]:
# Apply the fitted mapper to the held-out features so the encoded columns can be inspected.
preprocessed_X_test = mapper.transform(X_test)
# Transposed so that each output column of the mapper is shown as a row.
preprocessed_X_test.head().T

Unnamed: 0,8640,8641,8642,8643,8644
x1_x0_str_0,0.0,0.0,0.0,0.0,0.0
x1_x0_str_1,0.0,0.0,0.0,0.0,0.0
x1_x0_str_2,1.0,0.0,0.0,1.0,1.0
x1_x0_str_3,0.0,1.0,1.0,0.0,0.0
x1_x0_str_4,0.0,0.0,0.0,0.0,0.0
x1_x0_str_5,0.0,0.0,0.0,0.0,0.0
x3_x0_str_0,0.0,0.0,0.0,0.0,0.0
x3_x0_str_1,0.0,0.0,1.0,0.0,0.0
x3_x0_str_2,0.0,1.0,0.0,1.0,1.0
x3_x0_str_3,1.0,0.0,0.0,0.0,0.0


### Ordinal Encoding

In [13]:
num = [([n], [SimpleImputer()]) for n in numerical_features]
cat = [([c], [OrdinalEncoder()]) for c in categorical_features]
mapper = DataFrameMapper(cat + num, df_out=True)

clf = LogisticRegression()
pipeline = Pipeline([
    ('preprocess', mapper),
    ('clf', clf)
], verbose=True)

pipeline.fit(X_train, y_train)
evaluation(pipeline, X_test, y_test)

[Pipeline] ........ (step 1 of 2) Processing preprocess, total=   0.1s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.1s


{'auc': 0.8194499904512231, 'pr-auc': 0.7996358755932719}

In [145]:
# Apply the fitted mapper to the held-out features so the encoded columns can be inspected.
preprocessed_X_test = mapper.transform(X_test)
# Transposed so that each output column of the mapper is shown as a row.
preprocessed_X_test.head().T

Unnamed: 0,9000,9001,9002,9003,9004
feat_5,1.0,3.0,6.0,0.0,5.0
feat_6,2.0,3.0,0.0,1.0,0.0
feat_7,0.0,2.0,6.0,4.0,1.0
feat_8,4.0,7.0,7.0,0.0,5.0
feat_1,-0.068768,0.425899,1.930354,1.15798,-1.304169
feat_2,-1.222878,0.29366,1.729959,-0.716538,1.169799
feat_3,-0.714906,1.509702,-0.429593,-0.708234,-0.304866
feat_4,-0.823643,1.997845,0.105752,-0.953579,0.690543


**Result**: 
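- Ordinal encoding (`OrdinalEncoder`) works when the categories of a variable have a natural order (e.g. size, or weather severity). Otherwise, prefer one-hot encoding (`OneHotEncoder`).
- One-hot encoding creates one column per category, so it takes up more space and hence more training time.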

## 1.3 Data Imbalance

What happens if the training data isn't balanced?

### Unbalanced

In [102]:
# Imbalanced dataset: get_data requests class weights [0.9, 0.1].
df, label, categorical_features, numerical_features = get_data(balanced=False)

# Ordered split (no shuffling): the last 10% of rows become the test set.
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.1)

feature_columns = categorical_features + numerical_features
X_train = train_df[feature_columns]
y_train = train_df[label]
X_test = test_df[feature_columns]
y_test = test_df[label]

--------------------------------------------------------------------------------
Creating Classification Dataset...
Creating Categorical Features...
Writing Train/Test Datasets


In [103]:
# Class counts over the whole loaded frame (before the train/test split).
df[label].value_counts()

0    8599
1    1001
Name: y, dtype: int64

In [104]:
# Unweighted logistic regression on the imbalanced data.
cat = [([name], [OrdinalEncoder()]) for name in categorical_features]
num = [([name], [SimpleImputer()]) for name in numerical_features]
mapper = DataFrameMapper(cat + num, df_out=True)

clf = LogisticRegression()
pipeline = Pipeline(steps=[
    ('preprocess', mapper),
    ('clf', clf),
])

pipeline.fit(X_train, y_train)
evaluation(pipeline, X_test, y_test)

{'auc': 0.7869518716577542, 'pr-auc': 0.39239809756882393}

In [105]:
# Predicted probability of class 1 (second predict_proba column) for every test row.
y_predict_proba = pipeline.predict_proba(X_test)[:, 1]

In [106]:
# Average predicted probability of class 1 over the test rows.
y_predict_proba.mean()

0.10815533327119523

### Balanced

In [35]:
# Balanced reference run: same pipeline as the unbalanced case, class weights [0.5, 0.5].
df, label, categorical_features, numerical_features = get_data(balanced=True)

# Ordered split (no shuffling): the last 10% of rows become the test set.
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.1)

feature_columns = categorical_features + numerical_features
X_train = train_df[feature_columns]
y_train = train_df[label]
X_test = test_df[feature_columns]
y_test = test_df[label]

cat = [([name], [OrdinalEncoder()]) for name in categorical_features]
num = [([name], [SimpleImputer()]) for name in numerical_features]
mapper = DataFrameMapper(cat + num, df_out=True)

clf = LogisticRegression()
pipeline = Pipeline(steps=[
    ('preprocess', mapper),
    ('clf', clf),
])

pipeline.fit(X_train, y_train)
evaluation(pipeline, X_test, y_test)

--------------------------------------------------------------------------------
Creating Classification Dataset...
Creating Categorical Features...
Writing Train/Test Datasets


{'auc': 0.7949023220244715, 'pr-auc': 0.7742073929744453}

In [36]:
# Predicted probability of class 1 (second predict_proba column) for every test row.
y_predict_proba = pipeline.predict_proba(X_test)[:, 1]
# Average predicted probability, for comparison with the unbalanced run.
y_predict_proba.mean()

0.4994547544271453

### Dealing with unbalanced data by overweighting

In [101]:
# Imbalanced data again, this time with class weights that compensate for class frequency.
df, label, categorical_features, numerical_features = get_data(balanced=False)

# Ordered split (no shuffling): the last 10% of rows become the test set.
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.1)

feature_columns = categorical_features + numerical_features
X_train = train_df[feature_columns]
y_train = train_df[label]
X_test = test_df[feature_columns]
y_test = test_df[label]

cat = [([name], [OrdinalEncoder()]) for name in categorical_features]
num = [([name], [SimpleImputer()]) for name in numerical_features]
mapper = DataFrameMapper(cat + num, df_out=True)

clf = LogisticRegression(class_weight='balanced')
pipeline = Pipeline(steps=[
    ('preprocess', mapper),
    ('clf', clf),
])

pipeline.fit(X_train, y_train)
evaluation(pipeline, X_test, y_test)

--------------------------------------------------------------------------------
Creating Classification Dataset...
Creating Categorical Features...
Writing Train/Test Datasets


{'auc': 0.8113720373994346, 'pr-auc': 0.30360454333181025}

**Result**:
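- Having an unbalanced dataset doesn't harm ROC AUC, but it harms the precision-recall metrics (PR-AUC) of the positive class.
- This is mostly because positives are rare: the PR-AUC of a random ranking equals the positive-class rate (about 0.1 here versus 0.5 in the balanced case), and the predicted probabilities are correspondingly lower (mean of about 0.11 versus 0.50).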

## 1.4 Correlated Features

In [72]:
# Numeric-only dataset in which get_data requests 1 redundant and 1 repeated feature.
df, label, categorical_features, numerical_features = get_data(categorical_features=False, correlated_features=True)

--------------------------------------------------------------------------------
Creating Classification Dataset...
Writing Train/Test Datasets


In [74]:
# Ordered split (no shuffling): the last 10% of rows become the test set.
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.1)

feature_columns = categorical_features + numerical_features
X_train = train_df[feature_columns]
y_train = train_df[label]
X_test = test_df[feature_columns]
y_test = test_df[label]

# All numerical features are kept, including the correlated ones.
num = [([name], [SimpleImputer()]) for name in numerical_features]
mapper = DataFrameMapper(num, df_out=True)

clf = LogisticRegression()
pipeline = Pipeline(steps=[
    ('preprocess', mapper),
    ('clf', clf),
])

pipeline.fit(X_train, y_train)
evaluation(pipeline, X_test, y_test)

{'auc': 0.9194931452103352, 'pr-auc': 0.8982012865508728}

In [75]:
# Diagnostic only: an ordinary least squares fit of the 0/1 label on the
# preprocessed features, used for its coefficient table and condition number.
# `sm` is imported in the notebook's top import cell.
# add_constant appends the intercept column that OLS does not add by itself.
preprocessed_X_train = sm.add_constant(mapper.transform(X_train))
results = sm.OLS(y_train, preprocessed_X_train).fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.483
Model:,OLS,Adj. R-squared:,0.483
Method:,Least Squares,F-statistic:,1345.0
Date:,"Sat, 10 Apr 2021",Prob (F-statistic):,0.0
Time:,14:31:47,Log-Likelihood:,-3420.3
No. Observations:,8640,AIC:,6855.0
Df Residuals:,8633,BIC:,6904.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5906,0.006,104.957,0.000,0.580,0.602
x0,-0.0043,0.000,-20.533,0.000,-0.005,-0.004
x1,0.0335,0.002,19.438,0.000,0.030,0.037
x2,0.0447,0.001,43.084,0.000,0.043,0.047
x3,-0.0076,0.000,-20.533,0.000,-0.008,-0.007
x4,-0.0142,0.001,-27.006,0.000,-0.015,-0.013
x5,0.0125,0.000,45.550,0.000,0.012,0.013
x6,-0.0017,0.002,-0.997,0.319,-0.005,0.002
x7,0.0270,0.001,28.009,0.000,0.025,0.029

0,1,2,3
Omnibus:,341.439,Durbin-Watson:,2.027
Prob(Omnibus):,0.0,Jarque-Bera (JB):,353.022
Skew:,-0.467,Prob(JB):,2.2e-77
Kurtosis:,2.67,Cond. No.,1.54e+16


In [76]:
# Variance inflation factor of every feature against the rest of the design
# matrix (constant included). VIF = 1 / (1 - R^2), so a feature that is
# perfectly explained by the others yields inf.
# `variance_inflation_factor` is imported in the notebook's top import cell.
design_columns = list(preprocessed_X_train.columns)  # loop-invariant, built once
for column in numerical_features:
    vif = variance_inflation_factor(preprocessed_X_train.values,
                                    design_columns.index(column))
    print(f"{column}, {vif}")

x0, inf
x1, inf
x2, inf
x3, inf
x4, inf
x5, inf
x6, inf
x7, inf


  vif = 1. / (1. - r_squared_i)


In [89]:
# Pairwise correlations between all columns of df, including the label y.
df.corr()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,y
x0,1.0,0.132384,-0.097071,1.0,-0.035234,-0.162566,0.346866,0.567626,-0.000326
x1,0.132384,1.0,0.029556,0.132384,0.143301,-0.434811,0.091475,0.211035,-0.020443
x2,-0.097071,0.029556,1.0,-0.097071,0.27232,0.001597,-0.077077,-0.546263,0.275935
x3,1.0,0.132384,-0.097071,1.0,-0.035234,-0.162566,0.346866,0.567626,-0.000326
x4,-0.035234,0.143301,0.27232,-0.035234,1.0,-0.144259,0.144366,0.314752,-0.008192
x5,-0.162566,-0.434811,0.001597,-0.162566,-0.144259,1.0,0.120178,0.08333,0.544321
x6,0.346866,0.091475,-0.077077,0.346866,0.144366,0.120178,1.0,0.649177,0.30894
x7,0.567626,0.211035,-0.546263,0.567626,0.314752,0.08333,0.649177,1.0,0.071201
y,-0.000326,-0.020443,0.275935,-0.000326,-0.008192,0.544321,0.30894,0.071201,1.0


### Start by removing perfect multicollinearity

In [90]:
# x3 is dropped: the correlation matrix shows it correlates 1.0 with x0.
numerical_features = ['x0', 'x1', 'x2', 'x4', 'x5', 'x6', 'x7'] # remove x3

# Ordered split (no shuffling): the last 10% of rows become the test set.
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.1)

feature_columns = categorical_features + numerical_features
X_train = train_df[feature_columns]
y_train = train_df[label]
X_test = test_df[feature_columns]
y_test = test_df[label]

num = [([name], [SimpleImputer()]) for name in numerical_features]
mapper = DataFrameMapper(num, df_out=True)

clf = LogisticRegression()
pipeline = Pipeline(steps=[
    ('preprocess', mapper),
    ('clf', clf),
])

pipeline.fit(X_train, y_train)
evaluation(pipeline, X_test, y_test)

{'auc': 0.9194974891835068, 'pr-auc': 0.8982064967028441}

In [91]:
# Diagnostic OLS fit on the reduced feature set; add_constant supplies the intercept column.
preprocessed_X_train = sm.add_constant(mapper.transform(X_train))
ols_model = sm.OLS(endog=y_train, exog=preprocessed_X_train)
results = ols_model.fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.483
Model:,OLS,Adj. R-squared:,0.483
Method:,Least Squares,F-statistic:,1345.0
Date:,"Sat, 10 Apr 2021",Prob (F-statistic):,0.0
Time:,14:37:17,Log-Likelihood:,-3420.3
No. Observations:,8640,AIC:,6855.0
Df Residuals:,8633,BIC:,6904.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5906,0.006,104.957,0.000,0.580,0.602
x0,-0.0172,0.001,-20.533,0.000,-0.019,-0.016
x1,0.0357,0.002,19.852,0.000,0.032,0.039
x2,0.0438,0.001,43.527,0.000,0.042,0.046
x4,-0.0138,0.001,-26.951,0.000,-0.015,-0.013
x5,0.0127,0.000,47.049,0.000,0.012,0.013
x6,-0.0005,0.002,-0.294,0.769,-0.004,0.003
x7,0.0260,0.001,28.111,0.000,0.024,0.028

0,1,2,3
Omnibus:,341.439,Durbin-Watson:,2.027
Prob(Omnibus):,0.0,Jarque-Bera (JB):,353.022
Skew:,-0.467,Prob(JB):,2.2e-77
Kurtosis:,2.67,Cond. No.,1.67e+16


In [88]:
# VIF of each remaining feature against the rest of the design matrix (constant included).
design_columns = list(preprocessed_X_train.columns)
for column in numerical_features:
    vif = variance_inflation_factor(preprocessed_X_train.values,
                                    design_columns.index(column))
    print(f"{column}, {vif}")

x0, inf
x1, inf
x2, inf
x4, inf
x5, inf
x6, inf
x7, inf


  vif = 1. / (1. - r_squared_i)


Removing a feature with perfect multicollinearity:
- Improves interpretability of the coefficients (like `x0` here)
- Logistic Regression doesn't lose performance. 

### Remove multicollinearity

In [92]:
# x6 is dropped as well, on top of x3.
numerical_features = ['x0', 'x1', 'x2', 'x4', 'x5', 'x7'] # remove x3, x6

# Ordered split (no shuffling): the last 10% of rows become the test set.
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.1)

feature_columns = categorical_features + numerical_features
X_train = train_df[feature_columns]
y_train = train_df[label]
X_test = test_df[feature_columns]
y_test = test_df[label]

num = [([name], [SimpleImputer()]) for name in numerical_features]
mapper = DataFrameMapper(num, df_out=True)

clf = LogisticRegression()
pipeline = Pipeline(steps=[
    ('preprocess', mapper),
    ('clf', clf),
])

pipeline.fit(X_train, y_train)
evaluation(pipeline, X_test, y_test)

{'auc': 0.9194974891835068, 'pr-auc': 0.8982064967028441}

In [93]:
# Diagnostic OLS fit on the reduced feature set; add_constant supplies the intercept column.
preprocessed_X_train = sm.add_constant(mapper.transform(X_train))
ols_model = sm.OLS(endog=y_train, exog=preprocessed_X_train)
results = ols_model.fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.483
Model:,OLS,Adj. R-squared:,0.483
Method:,Least Squares,F-statistic:,1345.0
Date:,"Sat, 10 Apr 2021",Prob (F-statistic):,0.0
Time:,14:38:52,Log-Likelihood:,-3420.3
No. Observations:,8640,AIC:,6855.0
Df Residuals:,8633,BIC:,6904.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5906,0.006,104.957,0.000,0.580,0.602
x0,-0.0169,0.001,-17.744,0.000,-0.019,-0.015
x1,0.0366,0.005,7.684,0.000,0.027,0.046
x2,0.0434,0.001,40.460,0.000,0.041,0.045
x4,-0.0137,0.001,-25.047,0.000,-0.015,-0.013
x5,0.0128,0.000,33.306,0.000,0.012,0.014
x7,0.0256,0.001,27.862,0.000,0.024,0.027

0,1,2,3
Omnibus:,341.439,Durbin-Watson:,2.027
Prob(Omnibus):,0.0,Jarque-Bera (JB):,353.022
Skew:,-0.467,Prob(JB):,2.2e-77
Kurtosis:,2.67,Cond. No.,30.1


In [94]:
# VIF of each remaining feature against the rest of the design matrix (constant included).
design_columns = list(preprocessed_X_train.columns)
for column in numerical_features:
    vif = variance_inflation_factor(preprocessed_X_train.values,
                                    design_columns.index(column))
    print(f"{column}, {vif}")

x0, 4.358204798860465
x1, 1.8526871839909662
x2, 5.622338237184614
x4, 4.123960180952725
x5, 2.6095687697415917
x7, 10.922197872534808


By removing `x6`, we lost neither explainability nor performance.

### Remove x7 with high VIF

In [95]:
# x7 had the largest VIF in the previous step, so it is dropped too.
numerical_features = ['x0', 'x1', 'x2', 'x4', 'x5'] # remove x3, x6, x7

# Ordered split (no shuffling): the last 10% of rows become the test set.
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.1)

feature_columns = categorical_features + numerical_features
X_train = train_df[feature_columns]
y_train = train_df[label]
X_test = test_df[feature_columns]
y_test = test_df[label]

num = [([name], [SimpleImputer()]) for name in numerical_features]
mapper = DataFrameMapper(num, df_out=True)

clf = LogisticRegression()
pipeline = Pipeline(steps=[
    ('preprocess', mapper),
    ('clf', clf),
])

pipeline.fit(X_train, y_train)
evaluation(pipeline, X_test, y_test)

{'auc': 0.8916873729387849, 'pr-auc': 0.858019953399781}

In [96]:
# Diagnostic OLS fit on the reduced feature set; add_constant supplies the intercept column.
preprocessed_X_train = sm.add_constant(mapper.transform(X_train))
ols_model = sm.OLS(endog=y_train, exog=preprocessed_X_train)
results = ols_model.fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.437
Model:,OLS,Adj. R-squared:,0.436
Method:,Least Squares,F-statistic:,1338.0
Date:,"Sat, 10 Apr 2021",Prob (F-statistic):,0.0
Time:,14:40:42,Log-Likelihood:,-3792.2
No. Observations:,8640,AIC:,7596.0
Df Residuals:,8634,BIC:,7639.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.6395,0.006,114.609,0.000,0.629,0.650
x0,0.0063,0.000,12.833,0.000,0.005,0.007
x1,0.1125,0.004,27.526,0.000,0.104,0.121
x2,0.0166,0.000,33.605,0.000,0.016,0.018
x4,-0.0007,0.000,-2.314,0.021,-0.001,-0.000
x5,0.0204,0.000,73.414,0.000,0.020,0.021

0,1,2,3
Omnibus:,381.662,Durbin-Watson:,2.031
Prob(Omnibus):,0.0,Jarque-Bera (JB):,330.564
Skew:,-0.412,Prob(JB):,1.66e-72
Kurtosis:,2.511,Cond. No.,26.7


In [97]:
# VIF of each remaining feature against the rest of the design matrix (constant included).
design_columns = list(preprocessed_X_train.columns)
for column in numerical_features:
    vif = variance_inflation_factor(preprocessed_X_train.values,
                                    design_columns.index(column))
    print(f"{column}, {vif}")

x0, 1.0434492528061576
x1, 1.2487373171729157
x2, 1.089610333638892
x4, 1.1174255753042328
x5, 1.2630916367080673


Removing `x7`:
- Helped explainability 
- Negatively impacted performance

Remedy: add polynomial terms, or try other models that capture more complex interactions.

## 1.5 Missing Values

In [98]:
# Dataset generated with pct_missing=0.70 (see get_data), otherwise the default configuration.
df, label, categorical_features, numerical_features = get_data(missing_values=True)

--------------------------------------------------------------------------------
Creating Classification Dataset...
Creating Categorical Features...
Writing Train/Test Datasets


In [99]:
# Demonstration: fitting logistic regression on features that still contain
# NaN fails. The expected ValueError is caught and shown so that the notebook
# still runs top to bottom.
train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)
X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]
X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]

clf = LogisticRegression()
pipeline = Pipeline([
    ('clf', clf)  # deliberately no imputation step in front of the classifier
])

try:
    pipeline.fit(X_train[numerical_features], y_train)
    print(evaluation(pipeline, X_test[numerical_features], y_test))
except ValueError as error:
    print(f"ValueError: {error}")

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [100]:
# Ordered split (no shuffling): the last 10% of rows become the test set.
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.1)

feature_columns = categorical_features + numerical_features
X_train = train_df[feature_columns]
y_train = train_df[label]
X_test = test_df[feature_columns]
y_test = test_df[label]

# Impute values: every numerical column gets its own SimpleImputer.
num = [([name], [SimpleImputer()]) for name in numerical_features]
mapper = DataFrameMapper(num, df_out=True)

clf = LogisticRegression()
pipeline = Pipeline(steps=[
    ('preprocess', mapper),
    ('clf', clf),
])

X_train_num = X_train[numerical_features]
X_test_num = X_test[numerical_features]

pipeline.fit(X_train_num, y_train)
evaluation(pipeline, X_test_num, y_test)

{'auc': 0.7473034970984109, 'pr-auc': 0.676792150205654}

**Result**
- Logistic Regression can't handle missing values. It is best to impute them, e.g. with the mean.

## Summary 

Let's see how Logistic Regression acts with 5 techniques:
1. **Standardization of Numerical Variables**
    - Performance doesn't necessarily improve. But convergence is faster during training
2. **Encoding of Categorical Variables**
    - We can use ordinal encoding if the categories are related (size). Otherwise, use one hot encoding
3. **Data Imbalance**
    - Overweight the minority class (e.g. `class_weight='balanced'`) and undersample the majority class
4. **Collinearity**
    - Remove features which exhibit perfect multicollinearity
    - try different modeling strategies to ensure the model is capturing non-linear interactions
5. **Missing Values**
    - Impute with mean (or a constant value). This is problem specific