In [6]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import statsmodels.api as sm
import joblib

In [7]:
# Load the raw train/test CSVs for exercise 26.
# The paths are relative, so they resolve against the kernel's working directory.
train_csv_path = '../data/exercise_26_train.csv'
test_csv_path = '../data/exercise_26_test.csv'

raw_train = pd.read_csv(train_csv_path)
raw_test = pd.read_csv(test_csv_path)

In [8]:
# model = joblib.load('model.pkl')
# variables = joblib.load('variables.pkl')

In [9]:
# Single hand-written record with one entry per feature key "x0".."x99" (no "y").
# Every value is a string, including currency text in "x12", a percent string in
# "x63", an empty string for "x76", and categorical labels in "x5", "x31", "x81"
# and "x82".
# NOTE(review): presumably an example scoring payload; it is not referenced by
# any other visible cell -- confirm intended use.
sample_data = {
    "x0": "-1.018506",
    "x1": "-4.180869",
    "x2": "5.703058724",
    "x3": "-0.522021597",
    "x4": "-1.678553956",
    "x5": "tuesday",
    "x6": "0.18617",
    "x7": "30.162959",
    "x8": "1.200073",
    "x9": "0.373124",
    "x10": "14.973894",
    "x11": "-0.81238",
    "x12": "$6,882.34 ",
    "x13": "0.078341",
    "x14": "32.823072",
    "x15": "0.02048",
    "x16": "0.171077",
    "x17": "14.236199",
    "x18": "-18.646051",
    "x19": "0.575313",
    "x20": "0.068703",
    "x21": "-0.276702",
    "x22": "0.754378",
    "x23": "3.103192",
    "x24": "-101.889723",
    "x25": "1.49565",
    "x26": "3.412199",
    "x27": "0.601394",
    "x28": "14.210012",
    "x29": "0.558285",
    "x30": "4.21066",
    "x31": "germany",
    "x32": "0.07303966",
    "x33": "2.99793546",
    "x34": "-1.91981754",
    "x35": "1.11327381",
    "x36": "-0.75988365",
    "x37": "3.00740356",
    "x38": "-1.76639977",
    "x39": "-1.93067723",
    "x40": "288.2",
    "x41": "129.79",
    "x42": "366.71",
    "x43": "-1134.56",
    "x44": "0.98441208",
    "x45": "1.10833973",
    "x46": "0.495749506",
    "x47": "0.422930348",
    "x48": "1.628712455",
    "x49": "0.402797858",
    "x50": "-0.272326826",
    "x51": "1.48269105",
    "x52": "-2.095101799",
    "x53": "0.33612654",
    "x54": "0.39604464",
    "x55": "0.43767884",
    "x56": "0.137700027",
    "x57": "0.53142961",
    "x58": "0.228881625",
    "x59": "-0.222421763",
    "x60": "0.561192069",
    "x61": "1.129407195",
    "x62": "0.373941237",
    "x63": "62.59%",
    "x64": "33.79248734",
    "x65": "-0.1522697",
    "x66": "0.34106988",
    "x67": "14.39211979",
    "x68": "-20.60214825",
    "x69": "0.02168046",
    "x70": "0.12436805",
    "x71": "2.80831588",
    "x72": "0.48941937",
    "x73": "3.07847637",
    "x74": "-86.44286813",
    "x75": "0.4088527",
    "x76": "",
    "x77": "0.80646678",
    "x78": "14.02814387",
    "x79": "0.12779922",
    "x80": "3.25437849",
    "x81": "April",
    "x82": "Female",
    "x83": "0.460470644",
    "x84": "-1.129221693",
    "x85": "-0.124149454",
    "x86": "-1.650432198",
    "x87": "-1.295166064",
    "x88": "0.076903248",
    "x89": "-1.123881898",
    "x90": "0.323156018",
    "x91": "0.04191",
    "x92": "0.33889244",
    "x93": "3.52499912",
    "x94": "-97.7151381",
    "x95": "1.44463704",
    "x96": "2.72855326",
    "x97": "0.71872513",
    "x98": "-32.94590765",
    "x99": "2.55535888"
}


In [10]:
# Training pipeline for this cell:
#   clean text-formatted numbers -> mean-impute -> standardize -> one-hot encode
#   -> L1 logistic regression for feature ranking -> statsmodels Logit on the
#   top-ranked features.
CATEGORICAL_COLS = ['x5', 'x31', 'x81', 'x82']   # columns one-hot encoded below
TARGET_COL = 'y'
N_SELECTED_FEATURES = 25                          # features kept for the final model


def _one_hot(frame, column):
    """Return dummy columns for ``frame[column]``.

    The first level is dropped, a NaN indicator column is included, and the
    generated columns are prefixed with ``<column>_``.
    """
    return pd.get_dummies(frame[column], drop_first=True, prefix=column, prefix_sep='_', dummy_na=True)


# Work on a deep copy so `raw_train` is never mutated and this cell can be re-run.
train_data = raw_train.copy(deep=True)

# DATA PREP
# Fixing the money and percents.
# '$', '(' and ')' are regex metacharacters and the default of `regex` in
# Series.str.replace has changed between pandas releases; regex=False makes the
# literal-substring intent explicit and version-independent.
train_data['x12'] = (
    train_data['x12']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('(', '-', regex=False)   # so "($1.23)" ends up as "-1.23"
    .astype(float)
)
train_data['x63'] = train_data['x63'].str.replace('%', '', regex=False).astype(float)

# With mean imputation: applied to every column except the target and the
# categorical columns (computed once instead of dropping twice).
numeric_features = train_data.drop(columns=[TARGET_COL] + CATEGORICAL_COLS)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
train_all_imputed = pd.DataFrame(imputer.fit_transform(numeric_features), columns=numeric_features.columns)
std_scaler = StandardScaler()
train_all_std = pd.DataFrame(std_scaler.fit_transform(train_all_imputed), columns=train_all_imputed.columns)

# Create dummies
dummies = {col: _one_hot(train_data, col) for col in CATEGORICAL_COLS}
# Keep the original per-column names bound in case later cells refer to them.
dumb5, dumb31, dumb81, dumb82 = (dummies[col] for col in CATEGORICAL_COLS)
train_all_std = pd.concat([train_all_std, *dummies.values()], axis=1, sort=False)
train_all = pd.concat([train_all_std, train_data[TARGET_COL]], axis=1, sort=False)


# INITIAL FEATURE SELECTION
# Rank features by the squared coefficient of an L1-penalized, intercept-free
# logistic regression and keep the N_SELECTED_FEATURES largest.
# NOTE(review): no random_state is set; scikit-learn documents that liblinear
# uses it to shuffle the data, so the ranking may not be bit-for-bit
# reproducible -- consider pinning a seed after confirming it does not change
# the selected features.
candidate_features = train_all.drop(columns=[TARGET_COL])
exploratory_LR = LogisticRegression(penalty='l1', fit_intercept=False, solver='liblinear')
exploratory_LR.fit(candidate_features, train_all[TARGET_COL])
exploratory_results = pd.DataFrame({'name': candidate_features.columns.to_list()})
exploratory_results['coefs'] = exploratory_LR.coef_[0]
exploratory_results['coefs_squared'] = exploratory_results['coefs']**2
var_reduced = exploratory_results.nlargest(N_SELECTED_FEATURES, 'coefs_squared')
variables = var_reduced['name'].to_list()


# Convert boolean columns to numeric (only the selected features are converted;
# depending on the pandas version, get_dummies may return bool columns).
for col in variables:
    if train_all[col].dtype == 'bool':
        train_all[col] = train_all[col].astype(int)


# Final model
# No constant column is added, matching fit_intercept=False in the exploratory fit.
final_logit = sm.Logit(train_all[TARGET_COL], train_all[variables])
final_result = final_logit.fit()

Optimization terminated successfully.
         Current function value: 0.536451
         Iterations 6


In [None]:
import joblib

In [13]:
# Reload the persisted feature list and display it.
# This rebinds `variables`, replacing the list produced by the feature-selection
# step in the training cell if that cell was run earlier in the session.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
variables_path = '../models/variables.pkl'
variables = joblib.load(variables_path)
variables

['x5_saturday',
 'x81_July',
 'x81_December',
 'x31_japan',
 'x5_sunday',
 'x81_October',
 'x31_asia',
 'x91',
 'x81_February',
 'x81_May',
 'x5_monday',
 'x53',
 'x81_March',
 'x81_September',
 'x44',
 'x81_November',
 'x12',
 'x81_June',
 'x81_August',
 'x5_tuesday',
 'x62',
 'x81_January',
 'x58',
 'x31_germany',
 'x56']

In [14]:
# Peek at the first five rows of the raw test frame.
raw_test.head(n=5)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x90,x91,x92,x93,x94,x95,x96,x97,x98,x99
0,0.042317,-3.344721,4.635124,-0.598396,-0.647772,monday,0.184902,46.690015,3.034132,0.364704,...,-0.493304,0.373853,0.941435,3.546798,-99.857488,0.403926,1.653787,0.007715,-32.021646,-60.312783
1,-1.03316,-0.34014,5.871823,,0.122133,tuesday,0.997773,51.581411,1.709219,0.844079,...,0.521119,0.148424,0.925301,3.830426,-101.105748,0.055775,0.56489,0.051716,-32.540612,-266.725795
2,2.029367,-3.239301,4.724436,2.211831,0.551611,tuesday,0.492405,87.179042,4.333755,0.513789,...,0.154492,-0.034504,0.904042,3.642968,-107.476487,1.046718,1.494123,0.231084,-32.740954,-4.327887
3,-0.065676,1.892277,4.818741,0.640313,1.944562,friday,0.208718,73.573314,4.929132,0.116004,...,0.305243,-0.099213,0.712234,3.853489,-91.650053,0.499861,2.804358,0.627921,-32.190043,103.192597
4,-0.357126,-1.852161,5.367849,-0.069869,-0.641455,saturday,0.940286,72.773335,,0.191044,...,0.617258,0.307445,0.376738,3.306958,-99.55714,1.275527,1.476482,0.122798,-32.957087,-111.509168


In [19]:
# Display the raw (not yet one-hot encoded) x81 column of the training frame.
raw_train.loc[:, 'x81']

0           April
1        December
2             May
3        December
4             May
           ...   
39995    February
39996     October
39997    December
39998        June
39999    December
Name: x81, Length: 40000, dtype: object

In [17]:
# List every column name of the training frame, one per line.
for column_name in list(raw_train.columns):
    print(column_name)

x0
x1
x2
x3
x4
x5
x6
x7
x8
x9
x10
x11
x12
x13
x14
x15
x16
x17
x18
x19
x20
x21
x22
x23
x24
x25
x26
x27
x28
x29
x30
x31
x32
x33
x34
x35
x36
x37
x38
x39
x40
x41
x42
x43
x44
x45
x46
x47
x48
x49
x50
x51
x52
x53
x54
x55
x56
x57
x58
x59
x60
x61
x62
x63
x64
x65
x66
x67
x68
x69
x70
x71
x72
x73
x74
x75
x76
x77
x78
x79
x80
x81
x82
x83
x84
x85
x86
x87
x88
x89
x90
x91
x92
x93
x94
x95
x96
x97
x98
x99
y
