In [None]:
import pandas
import numpy
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [None]:
df = pandas.read_csv("../../data/raw/churn.csv")
df.head()

In [None]:
# Normalise column names: lower-case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(" ", "_")

# Names of all object-dtype columns. This list is built BEFORE `totalcharges`
# is converted to a number below, so if that column was read as object it is
# included here and gets the same string normalisation first.
categorical_columns = list(df.dtypes[df.dtypes == "object"].index)

# Apply the same lower-case / underscore normalisation to the column values.
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(" ", "_")

# Entries that cannot be parsed as numbers become NaN (errors="coerce") and
# are then replaced by 0.
df.totalcharges = pandas.to_numeric(df.totalcharges, errors="coerce")
df.totalcharges = df.totalcharges.fillna(0)

# Binary target: 1 where the churn value equals "yes", 0 otherwise.
df.churn = (df.churn == "yes").astype(int)
#
# 20% of the rows are held out for test; 25% of the remaining 80% (i.e. 20%
# of the total) become validation, leaving 60% for training.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# For each split: renumber the rows from 0, then move the target column out of
# the feature frame into its own array. `df_full_train` is left untouched and
# therefore still carries its `churn` column.
df_train = df_train.reset_index(drop=True)
y_train = df_train.pop("churn").values

df_val = df_val.reset_index(drop=True)
y_val = df_val.pop("churn").values

df_test = df_test.reset_index(drop=True)
y_test = df_test.pop("churn").values
#
# Columns fed to the vectorizer as numbers.
numerical = [
    "tenure",
    "monthlycharges",
    "totalcharges",
]

# Columns fed to the vectorizer as categories (order is kept as-is because
# `categorical + numerical` is used to select the feature columns).
categorical = [
    "gender", "seniorcitizen", "partner", "dependents",
    "phoneservice", "multiplelines", "internetservice",
    "onlinesecurity", "onlinebackup", "deviceprotection", "techsupport",
    "streamingtv", "streamingmovies",
    "contract", "paperlessbilling", "paymentmethod",
]

#
# Vectorize the training rows (dense output) and fit the baseline model.
train_dict = df_train[categorical + numerical].to_dict(orient="records")
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
model = LogisticRegression(max_iter=5000).fit(X_train, y_train)

# Encode the validation rows with the vectorizer fitted above (transform only)
# and take the probability of the positive class.
val_dict = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dict)
y_pred = model.predict_proba(X_val)[:, 1]

# Hard decision at 0.5; the cell displays the share of labels that match.
churn_decision = y_pred >= 0.5
(y_val == churn_decision).mean()

# Accuracy

Accuracy is the fraction of predictions that are correct.


In [None]:
len(y_val)

In [None]:
(y_val == churn_decision).sum()

In [None]:
# Accuracy computed three equivalent ways.
from sklearn.metrics import accuracy_score

# 1. Number of correct predictions divided by the number of validation rows.
#    Both numbers are taken from the arrays instead of being typed in by hand,
#    so the result cannot go stale when the data or the model changes.
print((y_val == churn_decision).sum() / len(y_val))

# 2. The mean of the boolean "prediction matches label" array.
print((y_val == churn_decision).mean())

# 3. scikit-learn's helper (displayed as the cell output).
accuracy_score(y_val, churn_decision)

In [None]:
# Scan decision thresholds and report the one with the best validation accuracy.
from sklearn.metrics import accuracy_score

thresholds = numpy.linspace(0, 1, 21)
scores = []

for t in thresholds:
    # Loop-local name on purpose: rebinding the global `churn_decision` here
    # would leave it at the t=1.0 decision after the loop and silently change
    # the accuracy cells above if they are re-run.
    decision_at_t = y_pred >= t
    score = accuracy_score(y_val, decision_at_t)
    print("%.2f %.3f" % (t, score))
    scores.append(score)

# Derive the best threshold from the scores instead of hard-coding it in a comment.
best = int(numpy.argmax(scores))
print("best threshold: %.2f (accuracy %.3f)" % (thresholds[best], scores[best]))

plt.plot(thresholds, scores)
plt.xlabel("threshold")
plt.ylabel("accuracy")

In [None]:
# dummy model
from collections import Counter

Counter(y_pred >= 1)

# Confusion Table

Evaluate the quality of the model using different strategies.

Each prediction of a logistic regression (LR) model falls into one of four categories:

- The prediction is that the customer WILL churn. This is known as the Positive class.
  - The customer actually churned: True Positive (TP)
  - The customer actually did not churn: False Positive (FP)
- The prediction is that the customer WILL NOT churn. This is known as the Negative class.
  - The customer did not churn: True Negative (TN)
  - The customer churned: False Negative (FN)


In [None]:
# people who are going to churn
actual_positive = y_val == 1
# people who are not going to churn
actual_negative = y_val == 0

In [None]:
t = 0.5
predict_positive = y_pred >= t
predict_negative = y_pred < t

In [None]:
# Confusion-table cell counts: each one is the number of rows where the
# predicted class mask and the actual class mask are both True.
tp = (predict_positive & actual_positive).sum()
tn = (predict_negative & actual_negative).sum()
fp = (predict_positive & actual_negative).sum()
fn = (predict_negative & actual_positive).sum()

# Printed one per line in the order tp, tn, fp, fn.
for count in (tp, tn, fp, fn):
    print(count)

In [None]:
confusion_matrix = numpy.array([[tn, fp], [fn, tp]])

confusion_matrix

In [None]:
(confusion_matrix / confusion_matrix.sum()).round(2)

# Precision and Recall

(evaluation metrics)

Precision: of all the predicted positives, the fraction that are actually positive, TP / (TP + FP).

Recall: of all the actual positives, the fraction that we predicted as positive, TP / (TP + FN).


In [None]:
(tp + tn) / (tp + tn + fp + fn)

In [None]:
prec = tp / (tp + fp)
prec

In [None]:
# Numerator and denominator of precision, one per line:
# true positives, then all predicted positives.
print(tp, tp + fp, sep="\n")

# 210/311
# Explanation: 67% of the customers we predicted to churn really churned;
# the other 33% are mistakes (we predicted them to churn but they did not).

In [None]:
recall = tp / (tp + fn)
recall

In [None]:
# Numerator and denominator of recall, one per line:
# true positives, then all actual positives.
print(tp, tp + fn, sep="\n")

# 210/386
# Explanation: we caught 54% of the customers who churned; the other 46% are
# mistakes (we predicted them not to churn but they did).
# (So the 80% accuracy was misleading.)

# ROC Curves

ROC curves consider Recall (TPR) and FPR at every possible threshold.


In [None]:
# tpr:true positive rate
tpr = tp / (tp + fn)
tpr

In [None]:
recall

In [None]:
# fpr: false positive rate
fpr = fp / (fp + tn)
fpr

In [None]:
def tpr_fpr_dataframe(y_val, y_pred):
    """Count confusion-table cells at 101 evenly spaced thresholds in [0, 1].

    Parameters
    ----------
    y_val : array-like of 0/1
        Actual classes; 1 is treated as positive and 0 as negative.
    y_pred : array-like of float
        Predicted scores. A row is predicted positive when its score is
        greater than or equal to the threshold.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with the columns ``threshold``, ``tp``, ``tn``,
        ``fp`` and ``fn``.
    """
    # Coerce once so plain Python lists work with the element-wise
    # comparisons below (a list compared with a scalar is not element-wise).
    y_val = numpy.asarray(y_val)
    y_pred = numpy.asarray(y_pred)

    # The actual classes do not depend on the threshold: build the masks once
    # instead of on every loop iteration.
    actual_positive = y_val == 1
    actual_negative = y_val == 0

    scores = []
    for t in numpy.linspace(0, 1, 101):
        predict_positive = y_pred >= t
        predict_negative = y_pred < t

        tp = (predict_positive & actual_positive).sum()
        tn = (predict_negative & actual_negative).sum()

        fp = (predict_positive & actual_negative).sum()
        fn = (predict_negative & actual_positive).sum()

        scores.append((t, tp, tn, fp, fn))

    columns = ["threshold", "tp", "tn", "fp", "fn"]
    return pandas.DataFrame(scores, columns=columns)


df_scores = tpr_fpr_dataframe(y_val, y_pred)
df_scores

In [None]:
df_scores[::10]

In [None]:
df_scores["tpr"] = df_scores.tp / (df_scores.tp + df_scores.fn)
df_scores["fpr"] = df_scores.fp / (df_scores.fp + df_scores.tn)
df_scores[::10]

In [None]:
plt.plot(df_scores.threshold, df_scores["tpr"], label="TPR")
plt.plot(df_scores.threshold, df_scores["fpr"], label="FPR")
plt.legend()

# Random model

baseline model


In [None]:
from sklearn.metrics import roc_curve

# Baseline model: one score per validation row drawn uniformly from [0, 1),
# independent of the features. Seeded so the cell is reproducible.
numpy.random.seed(1)
y_rand = numpy.random.uniform(0, 1, size=len(y_val))

# Accuracy of the random scores at threshold 0.5. Printed explicitly: a
# notebook only displays a cell's LAST expression, so a bare expression in the
# middle of the cell would be computed and silently thrown away.
print("random model accuracy: %.3f" % ((y_rand >= 0.5) == y_val).mean())

random_model_fpr, random_model_tpr, random_model_threshold = roc_curve(y_val, y_rand)

plt.plot(random_model_threshold, random_model_tpr, label="TPR")
plt.plot(random_model_threshold, random_model_fpr, label="FPR")
plt.xlabel("threshold")
plt.legend()

Ideal model
(makes correct predictions for every example)

1. Order the predictions from lowest to highest score.


In [None]:
num_neg = (y_val == 0).sum()
num_pos = (y_val == 1).sum()
num_neg, num_pos

In [None]:
y_ideal = numpy.repeat([0, 1], [num_neg, num_pos])
y_ideal

In [None]:
y_ideal_pred = numpy.linspace(0, 1, len(y_ideal))
y_ideal_pred

In [None]:
y_val.mean()

In [None]:
# In the ideal ordering all negatives come first, so the scores of the
# negatives occupy the lowest `1 - y_val.mean()` fraction of [0, 1]. Deriving
# the cut-off from the data (instead of a hand-typed 0.726) keeps this cell
# correct if the split or the dataset changes.
ideal_cutoff = 1 - y_val.mean()
accuracy_ideal = ((y_ideal_pred >= ideal_cutoff) == y_ideal).mean()
accuracy_ideal

In [None]:
ideal_fpr, ideal_tpr, ideal_threshold = roc_curve(y_ideal, y_ideal_pred)

plt.plot(ideal_threshold, ideal_tpr, label="TPR")
plt.plot(ideal_threshold, ideal_fpr, label="FPR")
plt.legend()

Putting everything together


In [None]:
plt.plot(df_scores.threshold, df_scores["tpr"], label="TPR")
plt.plot(df_scores.threshold, df_scores["fpr"], label="FPR")

# plt.plot(df_rand.threshold, df_rand['tpr'], label='TPR')
# plt.plot(df_rand.threshold, df_rand['fpr'], label='FPR')

plt.plot(ideal_threshold, ideal_tpr, label="ideal_TPR", color="black")
plt.plot(ideal_threshold, ideal_fpr, label="ideal_FPR", color="green")

plt.legend()

In [None]:
# the Roc curve
plt.figure(figsize=(5, 5))

plt.plot(df_scores.fpr, df_scores.tpr, label="model")
plt.plot([0, 1], [0, 1], label="random")
# plt.plot(df_rand.fpr, df_rand.tpr, label='random')
plt.plot(ideal_fpr, ideal_tpr, label="ideal")

plt.xlabel("FPR")
plt.ylabel("TPR")

plt.legend()

# ROC AUC (measure the performance of the model)

(calculate the area under the curve)


In [None]:
from sklearn.metrics import auc

auc(df_scores.fpr, df_scores.tpr)

In [None]:
auc(ideal_fpr, ideal_tpr)

In [None]:
# AUC interpretation
# AUC is the probability that a randomly selected positive example has a
# higher score than a randomly selected negative example. Below, that
# probability is estimated by sampling random (positive, negative) pairs.

# Validation scores split by actual class.
neg = y_pred[y_val == 0]
pos = y_pred[y_val == 1]

n = 50000  # number of random pairs to draw

# Same seed and same draw order (positives first, then negatives) so the
# sampled pairs are reproducible.
numpy.random.seed(1)
pos_ind = numpy.random.randint(0, len(pos), size=n)
neg_ind = numpy.random.randint(0, len(neg), size=n)

# Share of pairs in which the positive example received the higher score.
positive_ranked_higher = pos[pos_ind] > neg[neg_ind]
positive_ranked_higher.mean()

# K-Fold Cross-Validation
(evaluation metric)
(Evaluating the same model on different subsets of data)


In [None]:
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic regression on the given rows.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain every column named in the module-level ``categorical``
        and ``numerical`` lists; other columns are ignored.
    y_train : array-like
        Target values, one per row of ``df_train``.
    C : float, default 1.0
        Passed through to ``LogisticRegression(C=...)``. The default keeps the
        behaviour of calling ``train(df_train, y_train)`` unchanged and matches
        the signature used by the parameter-tuning cells further down.

    Returns
    -------
    (dv, model)
        The fitted vectorizer and the fitted model.
    """
    dicts = df_train[categorical + numerical].to_dict(orient="records")

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)

    model = LogisticRegression(C=C, max_iter=5000)
    model.fit(X_train, y_train)

    return dv, model


dv, model = train(df_train, y_train)


def predict(df, dv, model):
    """Return the positive-class probability for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in the module-level ``categorical``
        and ``numerical`` lists; other columns are ignored.
    dv : fitted vectorizer
        The vectorizer returned by ``train``; it is only applied here, never
        refitted.
    model : fitted classifier
        The model returned by ``train``.

    Returns
    -------
    Second column of ``model.predict_proba`` (index 1), one value per row.
    """
    dicts = df[categorical + numerical].to_dict(orient="records")

    # transform, NOT fit_transform: refitting on the rows being scored would
    # rebuild the feature mapping from those rows, so the columns could stop
    # lining up with the ones the model was trained on (and would mutate the
    # caller's vectorizer as a side effect).
    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:, 1]

    return y_pred


y_pred = predict(df_val, dv, model)
y_pred

In [None]:
from sklearn.model_selection import KFold

kfold = KFold(n_splits=5, shuffle=True, random_state=1)

# `split` yields (train indices, validation indices) pairs; take only the
# first pair to look at the sizes.
train_idx, val_idx = next(kfold.split(df_full_train))
print(len(train_idx), len(val_idx))

print(len(df_full_train))

# The indices are positional, so iloc selects the matching rows.
df_train, df_val = df_full_train.iloc[train_idx], df_full_train.iloc[val_idx]

In [None]:
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm

kfold = KFold(n_splits=5, shuffle=True, random_state=1)
scores = []

# `split` returns a generator with no length, so tell tqdm how many folds to
# expect; otherwise the progress bar cannot show a total.
for train_idx, val_idx in tqdm(kfold.split(df_full_train), total=kfold.get_n_splits()):
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    # `df_full_train` still carries the target column, so the labels for each
    # fold are read from the fold's own rows.
    y_train = df_train.churn.values
    y_val = df_val.churn.values

    dv, model = train(df_train, y_train)
    y_pred = predict(df_val, dv, model)

    # Not named `auc`: that would rebind the `auc` function imported from
    # sklearn.metrics in the ROC AUC section and break those cells on re-run.
    fold_auc = roc_auc_score(y_val, y_pred)
    scores.append(fold_auc)

scores

In [None]:
# Summarise the per-fold scores collected above (KFold was created with
# n_splits=5) as their mean and standard deviation.
cv_mean, cv_std = numpy.mean(scores), numpy.std(scores)
print("%.3f +- %.3f" % (cv_mean, cv_std))

# Parameter Tuning
(regularization parameter)

In [None]:
def train(df_train, y_train, C=1.0):
    """Fit a vectorizer and a logistic regression with regularization parameter ``C``.

    Only the columns listed in the module-level ``categorical`` and
    ``numerical`` lists are used. ``C`` is forwarded to
    ``LogisticRegression``. Returns the fitted ``(dv, model)`` pair.
    """
    feature_records = df_train[categorical + numerical].to_dict(orient="records")

    dv = DictVectorizer(sparse=False)
    model = LogisticRegression(C=C, max_iter=5000)
    model.fit(dv.fit_transform(feature_records), y_train)

    return dv, model

dv, model = train(df_train, y_train, C=0.001)

In [None]:
# Compare several values of the regularization parameter C with 5-fold
# cross-validation. C must be strictly positive: passing 0.0 raises an error.

from sklearn.model_selection import KFold

kfold = KFold(n_splits=5, shuffle=True, random_state=1)

for C in tqdm([0.001, 0.01, 0.1, 0.5, 1, 5, 10]):
    # Fresh list per C so each printed line summarises that value only.
    scores = []

    for train_idx, val_idx in kfold.split(df_full_train):
        df_train = df_full_train.iloc[train_idx]
        df_val = df_full_train.iloc[val_idx]

        y_train = df_train.churn.values
        y_val = df_val.churn.values

        dv, model = train(df_train, y_train, C=C)
        y_pred = predict(df_val, dv, model)

        # Not named `auc`: that would rebind the `auc` function imported from
        # sklearn.metrics in the ROC AUC section and break those cells on re-run.
        fold_auc = roc_auc_score(y_val, y_pred)
        scores.append(fold_auc)

    print('C=%s %.3f +- %.3f' % (C, numpy.mean(scores), numpy.std(scores)))
 

In [None]:
# Final model: train on the full training set (its target is read from its own
# `churn` column) with C=1.0, then score the held-out test rows.
dv, model = train(df_full_train, df_full_train.churn.values, C=1.0)
y_pred = predict(df_test, dv, model)

# Not named `auc`: that would rebind the `auc` function imported from
# sklearn.metrics in the ROC AUC section and break those cells on re-run.
test_auc = roc_auc_score(y_test, y_pred)
test_auc