In [13]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import statsmodels.api as sm
import joblib

pd.set_option('display.max_columns', None)

In [10]:
raw_train=pd.read_csv('../data/exercise_26_train.csv')
raw_test=pd.read_csv('../data/exercise_26_test.csv')

In [11]:
train_data = raw_train.copy(deep=True)
# DATA PREP
# Fixing the money and percents#
train_data['x12'] = train_data['x12'].str.replace('$','')
train_data['x12'] = train_data['x12'].str.replace(',','')
train_data['x12'] = train_data['x12'].str.replace(')','')
train_data['x12'] = train_data['x12'].str.replace('(','-')
train_data['x12'] = train_data['x12'].astype(float)
train_data['x63'] = train_data['x63'].str.replace('%','')
train_data['x63'] = train_data['x63'].astype(float)

# With mean imputation
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
train_all_imputed = pd.DataFrame(imputer.fit_transform(train_data.drop(columns=['y', 'x5', 'x31',  'x81' ,'x82'])), 
                                 columns=train_data.drop(columns=['y', 'x5', 'x31', 'x81', 'x82']).columns)
std_scaler = StandardScaler()
train_all_std = pd.DataFrame(std_scaler.fit_transform(train_all_imputed), columns=train_all_imputed.columns)

# Ceate dummies
dumb5 = pd.get_dummies(train_data['x5'], drop_first=True, prefix='x5', prefix_sep='_', dummy_na=True)
train_all_std = pd.concat([train_all_std, dumb5], axis=1, sort=False)

dumb31 = pd.get_dummies(train_data['x31'], drop_first=True, prefix='x31', prefix_sep='_', dummy_na=True)
train_all_std = pd.concat([train_all_std, dumb31], axis=1, sort=False)

dumb81 = pd.get_dummies(train_data['x81'], drop_first=True, prefix='x81', prefix_sep='_', dummy_na=True)
train_all_std = pd.concat([train_all_std, dumb81], axis=1, sort=False)

dumb82 = pd.get_dummies(train_data['x82'], drop_first=True, prefix='x82', prefix_sep='_', dummy_na=True)
train_all_std = pd.concat([train_all_std, dumb82], axis=1, sort=False)
train_all = pd.concat([train_all_std, train_data['y']], axis=1, sort=False)

# INITIAL FEATURE SELECTION
exploratory_LR = LogisticRegression(penalty='l1', fit_intercept=False, solver='liblinear')
exploratory_LR.fit(train_all.drop(columns=['y']), train_all['y'])
exploratory_results = pd.DataFrame(train_all.drop(columns=['y']).columns).rename(columns={0:'name'})
exploratory_results['coefs'] = exploratory_LR.coef_[0]
exploratory_results['coefs_squared'] = exploratory_results['coefs']**2
var_reduced = exploratory_results.nlargest(25,'coefs_squared')
variables = var_reduced['name'].to_list()

# Convert boolean columns to numeric
for col in variables:
    if train_all[col].dtype == 'bool':
        train_all[col] = train_all[col].astype(int)

# Final model
final_logit = sm.Logit(train_all['y'], train_all[variables])
final_result = final_logit.fit()

Optimization terminated successfully.
         Current function value: 0.536451
         Iterations 6


In [12]:
test_data = raw_test.copy(deep=True)
# DATA PREP
# Fixing the money and percents#
test_data['x12'] = test_data['x12'].str.replace('$','')
test_data['x12'] = test_data['x12'].str.replace(',','')
test_data['x12'] = test_data['x12'].str.replace(')','')
test_data['x12'] = test_data['x12'].str.replace('(','-')
test_data['x12'] = test_data['x12'].astype(float)
test_data['x63'] = test_data['x63'].str.replace('%','')
test_data['x63'] = test_data['x63'].astype(float)

# With mean imputation
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
train_all_imputed = pd.DataFrame(imputer.fit_transform(test_data.drop(columns=['x5', 'x31',  'x81' ,'x82'])), 
                                 columns=test_data.drop(columns=['x5', 'x31', 'x81', 'x82']).columns)
std_scaler = StandardScaler()
train_all_std = pd.DataFrame(std_scaler.fit_transform(train_all_imputed), columns=train_all_imputed.columns)

# Ceate dummies
dumb5 = pd.get_dummies(test_data['x5'], drop_first=True, prefix='x5', prefix_sep='_', dummy_na=True)
train_all_std = pd.concat([train_all_std, dumb5], axis=1, sort=False)

dumb31 = pd.get_dummies(test_data['x31'], drop_first=True, prefix='x31', prefix_sep='_', dummy_na=True)
train_all_std = pd.concat([train_all_std, dumb31], axis=1, sort=False)

dumb81 = pd.get_dummies(test_data['x81'], drop_first=True, prefix='x81', prefix_sep='_', dummy_na=True)
train_all_std = pd.concat([train_all_std, dumb81], axis=1, sort=False)

dumb82 = pd.get_dummies(test_data['x82'], drop_first=True, prefix='x82', prefix_sep='_', dummy_na=True)
train_all = pd.concat([train_all_std, dumb82], axis=1, sort=False)

train_all = train_all[variables]
# Convert boolean columns to numeric
for col in variables:
    if train_all[col].dtype == 'bool':
        train_all[col] = train_all[col].astype(int)

probs = final_result.predict(train_all)
predicted_class = (probs > 0.5).astype(int) # whatever threshold you want
print(probs)
print(predicted_class)

0       0.366958
1       0.824429
2       0.134487
3       0.471328
4       0.323100
          ...   
9995    0.630504
9996    0.438068
9997    0.658542
9998    0.161555
9999    0.744278
Length: 10000, dtype: float64
0       0
1       1
2       0
3       0
4       0
       ..
9995    1
9996    0
9997    1
9998    0
9999    1
Length: 10000, dtype: int64
