In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

In [2]:
# Read the raw CSV and lower-case every column label in one chained expression
df = (
    pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AER_credit_card_data.csv')
    .rename(columns=str.lower)
)
df.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,yes,0,37.66667,4.52,0.03327,124.9833,yes,no,3,54,1,12
1,yes,0,33.25,2.42,0.005217,9.854167,no,no,3,34,1,13
2,yes,0,33.66667,4.5,0.004156,15.0,yes,no,4,58,1,5
3,yes,0,30.5,2.54,0.065214,137.8692,no,no,0,25,1,7
4,yes,0,32.16667,9.7867,0.067051,546.5033,yes,no,2,64,1,5


In [3]:
# CREATING THE TARGET VARIABLE:
# Encode the "yes"/"no" labels in `card` as 1/0.
card_values = {
    "yes": 1,
    "no": 0
}
# Values that are not keys of `card_values` (in particular the already-encoded
# 0/1 produced by a previous run of this cell) pass through unchanged, so
# re-running the cell no longer turns the whole target column into NaN.
df["card"] = df["card"].map(lambda label: card_values.get(label, label))

# Fail fast on anything that is neither "yes"/"no" nor 0/1 (including NaN)
# instead of letting it reach model training.
unexpected_labels = set(df["card"].unique()) - set(card_values.values())
assert not unexpected_labels, f"unexpected values in 'card': {unexpected_labels}"

df.head(10)

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,1,0,37.66667,4.52,0.03327,124.9833,yes,no,3,54,1,12
1,1,0,33.25,2.42,0.005217,9.854167,no,no,3,34,1,13
2,1,0,33.66667,4.5,0.004156,15.0,yes,no,4,58,1,5
3,1,0,30.5,2.54,0.065214,137.8692,no,no,0,25,1,7
4,1,0,32.16667,9.7867,0.067051,546.5033,yes,no,2,64,1,5
5,1,0,23.25,2.5,0.044438,91.99667,no,no,0,54,1,1
6,1,0,27.91667,3.96,0.012576,40.83333,no,no,2,7,1,5
7,1,0,29.16667,2.37,0.076434,150.79,yes,no,0,77,1,3
8,1,0,37.0,3.8,0.245628,777.8217,yes,no,0,97,1,6
9,1,0,28.41667,3.2,0.01978,52.58,no,no,0,65,1,18


In [4]:
# FEATURE GROUPS:
# Columns treated as numeric features.
numerical = [
    "reports",
    "age",
    "income",
    "share",
    "expenditure",
    "dependents",
    "months",
    "majorcards",
    "active",
]
# Columns holding string categories ("yes"/"no" in the preview above).
categorical = [
    "owner",
    "selfemp",
]

In [5]:
# SPLITTING THE DATASET
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.

df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

# TRAINING LOGISTIC REGRESSION MODEL

In [6]:
# Full feature list used by train() and predict(): categorical first, then numerical
columns = [*categorical, *numerical]
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic regression on the training data.

    Parameters
    ----------
    df_train : DataFrame
        Must contain every column listed in the module-level ``columns``.
    y_train : array-like
        Target values, one per row of ``df_train``.
    C : float, default 1.0
        Passed through to ``LogisticRegression(C=...)``.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted DictVectorizer and the fitted
        LogisticRegression.
    """
    # One dict per row, restricted to the selected feature columns
    records = df_train[columns].to_dict(orient='records')

    vectorizer = DictVectorizer(sparse=False)
    feature_matrix = vectorizer.fit_transform(records)

    classifier = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
    classifier.fit(feature_matrix, y_train)

    return vectorizer, classifier


def predict(df, dv, model):
    """Score the rows of ``df`` with an already fitted vectorizer and model.

    Parameters
    ----------
    df : DataFrame
        Must contain every column listed in the module-level ``columns``.
    dv : fitted vectorizer
        Only ``dv.transform`` is called (no refitting).
    model : fitted classifier
        Must expose ``predict_proba``.

    Returns
    -------
    The second column of ``model.predict_proba`` for each row, i.e. the
    probability of the class labelled 1 when the target is 0/1-encoded.
    """
    records = df[columns].to_dict(orient='records')
    features = dv.transform(records)
    return model.predict_proba(features)[:, 1]

In [7]:
# Settings for the cross-validation run below
C = 1.0       # forwarded to train(), which hands it to LogisticRegression
n_splits = 5  # number of folds requested from KFold

In [8]:
# K-FOLD CROSS VALIDATION:
# Train on each fold's training part, score its validation part with ROC AUC,
# then report the mean and standard deviation across folds.
# NOTE(review): the AUC reported below is close to 1.0, which is unusually high;
# some features (possibly `expenditure` / `share`) may leak the target. TODO confirm.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
scores = []

for train_idx, val_idx in kfold.split(df_full_train):
    # kfold.split yields positional indices, hence .iloc
    df_train, df_val = df_full_train.iloc[train_idx], df_full_train.iloc[val_idx]
    y_train, y_val = df_train["card"], df_val["card"]

    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)

print('%.3f +- %.3f' % (np.mean(scores), np.std(scores)))

0.996 +- 0.003


In [9]:
# FINAL MODEL:
# Refit on the whole training split and evaluate once on the held-out test split.
# Pass the shared `C` rather than a hard-coded 1.0, so the fitted model always
# matches the value used in cross-validation and in the saved file name.
dv, model = train(df_full_train, df_full_train.card.values, C=C)
y_pred = predict(df_test, dv, model)

y_test = df_test.card.values
auc = roc_auc_score(y_test, y_pred)
auc

0.996097337006428

In [10]:
import pickle

In [11]:
# Name of the file the fitted objects are written to; the value of C is part of the name
output_file = f'model_C={C}.bin'
output_file

'model_C=1.0.bin'

In [12]:
# Persist the vectorizer and the model together as one pickled tuple,
# so they can be loaded back as a pair.
with open(output_file, 'wb') as model_file:
    pickle.dump((dv, model), model_file)