In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AER_credit_card_data.csv

--2022-10-10 12:29:03--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AER_credit_card_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8000::154, 2606:50c0:8001::154, 2606:50c0:8003::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8000::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 73250 (72K) [text/plain]
Saving to: ‘AER_credit_card_data.csv.1’


2022-10-10 12:29:03 (3.23 MB/s) - ‘AER_credit_card_data.csv.1’ saved [73250/73250]



In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the downloaded CSV from the working directory.
df = pd.read_csv('AER_credit_card_data.csv')

# Preview the first rows transposed, so each column reads as one line.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
card,yes,yes,yes,yes,yes
reports,0,0,0,0,0
age,37.66667,33.25,33.66667,30.5,32.16667
income,4.52,2.42,4.5,2.54,9.7867
share,0.03327,0.005217,0.004156,0.065214,0.067051
expenditure,124.9833,9.854167,15.0,137.8692,546.5033
owner,yes,no,yes,no,yes
selfemp,no,no,no,no,no
dependents,3,3,4,0,2
months,54,34,58,25,64


In [4]:
# Column groups by how they are treated. Neither list is referenced in the
# visible cells below; `features` is spelled out separately.
numerical = [
    'reports',
    'age',
    'income',
    'share',
    'expenditure',
    'dependents',
    'months',
    'active',
]
categorical = [
    'owner',
    'selfemp',
    'majorcards',
]

In [5]:
# Binary target: 1 where `card` is 'yes', 0 otherwise.
df['card_target'] = df['card'].eq('yes').astype(int)

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% as the test set, then take 25% of the remainder as validation
# (0.25 * 0.8 = 0.2), giving roughly a 60/20/20 train/val/test split.
# random_state is fixed so the split is repeatable.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

In [8]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

In [9]:
from sklearn.feature_extraction import DictVectorizer

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
# Columns fed to the model by train() and predict().
features = [
    'reports',
    'age',
    'income',
    'share',
    'expenditure',
    'dependents',
    'months',
    'majorcards',
    'active',
    'owner',
    'selfemp',
]

In [12]:
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic regression on the given frame.

    Parameters
    ----------
    df_train : DataFrame containing at least the columns listed in the
        module-level `features`.
    y_train : target values aligned with the rows of `df_train`.
    C : value passed through as LogisticRegression's `C` argument.

    Returns
    -------
    (dv, model) : the fitted DictVectorizer and the fitted LogisticRegression.
    """
    records = df_train[features].to_dict(orient='records')

    # sparse=False: the vectorizer produces a dense array.
    dv = DictVectorizer(sparse=False)
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)

    model.fit(dv.fit_transform(records), y_train)
    return dv, model

def predict(df, dv, model):
    """Score the rows of `df` with an already-fitted vectorizer and model.

    Parameters
    ----------
    df : DataFrame containing at least the columns listed in the module-level
        `features`.
    dv : fitted vectorizer; only `transform` is called, so it is not re-fitted.
    model : fitted classifier exposing `predict_proba`.

    Returns
    -------
    Column 1 of `model.predict_proba(...)`, one value per row of `df`.
    """
    records = df[features].to_dict(orient='records')
    probabilities = model.predict_proba(dv.transform(records))
    return probabilities[:, 1]

In [13]:
# Settings for the cross-validation run below.
C = 1.0       # passed to train() as the LogisticRegression `C` argument
n_splits = 5  # number of KFold splits

In [14]:
# K-fold cross-validation over df_full_train: fit on each training fold,
# score the held-out fold with ROC AUC, then report mean and std of the scores.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

scores = []

for train_idx, val_idx in kfold.split(df_full_train):
    # Fold-local names: the original loop rebound `df_train` / `df_val` and so
    # silently overwrote the hold-out split created by train_test_split.
    df_fold_train = df_full_train.iloc[train_idx]
    df_fold_val = df_full_train.iloc[val_idx]

    y_fold_train = df_fold_train.card_target.values
    y_fold_val = df_fold_val.card_target.values

    # `dv` and `model` remain bound after the loop (last fold's fit), because
    # a later cell still refers to them.
    dv, model = train(df_fold_train, y_fold_train, C=C)
    y_fold_pred = predict(df_fold_val, dv, model)

    scores.append(roc_auc_score(y_fold_val, y_fold_pred))

print(f'C={C} {np.mean(scores):.5f} +- {np.std(scores):.3f}')


C=1.0 0.99645 +- 0.003


In [15]:
# Display the individual per-fold AUC values collected by the CV loop.
scores

[0.9941860465116279,
 0.9941176470588236,
 0.9943346508563901,
 0.9996107939802803,
 1.0]

In [16]:
import pickle

In [17]:
# Output path for the pickled artifacts; the file name embeds the value of C.
out_file = f'model_C={C}.bin'
out_file

'model_C=1.0.bin'

In [18]:
# Refit on all of df_full_train before saving. After the cross-validation
# loop, `dv` and `model` hold only the last fold's fit, which saw 4 of the 5
# folds, so pickling them as-is would ship a model trained on less data.
dv, model = train(df_full_train, df_full_train.card_target.values, C=C)

# Persist the vectorizer and model together as one (dv, model) tuple so the
# loader gets a matching pair.
with open(out_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)