In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression


In [None]:
# Load the raw customer-churn table.
df = pd.read_csv('../WEEK 3/data-week-3.csv')

# Normalise the header: lower-case names, underscores instead of spaces.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Give the values of every string (object-dtype) column the same treatment.
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
for name in categorical_columns:
    df[name] = df[name].str.lower().str.replace(' ', '_')

# With errors='coerce', unparseable total charges become NaN; those are then set to 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Encode the target as an integer: 1 where churn == 'yes', otherwise 0.
df['churn'] = (df['churn'] == 'yes').astype(int)


In [None]:
# Hold out 20% of the rows for testing, then 25% of the remaining rows for
# validation. df_full_train (train + validation) keeps its `churn` column and
# its original index.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Replace the inherited index so each split is numbered 0..n-1.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Pull the target out of each feature frame: pop() returns the column and
# removes it from the frame in one step.
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values
y_test = df_test.pop('churn').values


In [None]:
numerical = ['tenure', 'monthlycharges', 'totalcharges']

categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]


In [None]:
# Vectorizer that turns feature dicts into a dense matrix (sparse=False).
dv = DictVectorizer(sparse=False)

# One {column: value} dict per training row, restricted to the selected features.
train_dict = df_train[categorical + numerical].to_dict(orient='records')
# fit_transform learns the feature layout from the training data only.
X_train = dv.fit_transform(train_dict)

# Logistic regression with default hyper-parameters, fitted on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)


In [None]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only, no re-fitting).
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

# Column 1 of predict_proba is the probability of the positive class (churn = 1).
y_pred = model.predict_proba(X_val)[:, 1]
# Hard decision at a 0.5 threshold.
churn_decision = (y_pred >= 0.5)
# Validation accuracy: fraction of rows where the decision matches the label.
(y_val == churn_decision).mean()


In [None]:
from sklearn.metrics import accuracy_score

In [None]:
t=0.5

In [None]:
accuracy_score(y_val,y_pred >= t)

In [None]:
# Sweep 21 evenly spaced decision thresholds (0.00, 0.05, ..., 1.00) and record
# the validation accuracy obtained at each one.
thresholds = np.linspace(0, 1, 21)

scores = []

# The loop variable is deliberately not named `t`: at notebook top level it
# would overwrite the `t = 0.5` defined in an earlier cell, so re-running that
# cell's accuracy computation afterwards would silently use t = 1.0.
for thr in thresholds:
    score = accuracy_score(y_val, y_pred >= thr)
    print("%.2f %.3f" % (thr, score))
    scores.append(score)  # one accuracy per threshold, in threshold order




In [None]:
plt.plot(thresholds,scores)

In [None]:
from collections import Counter # Count churning (1) vs. non-churning (0) customers in the validation set
Counter(y_val)

# Class imbalance: the two classes are not equally frequent.
# NOTE(review): this comment used to say the positives dominate; for churn data the
# non-churning class (0) is presumably the majority. Confirm against the Counter output.

In [None]:
actual_positive = (y_val ==1 )
actual_negative = (y_val ==0 )


In [None]:
threshold = 0.5

predict_positive = (y_pred >= threshold)
predict_negative = (y_pred < threshold)

In [None]:
results = Counter(predict_positive & actual_positive)

In [None]:
# Pie chart of the `results` counter: the share of validation rows that are
# true positives (predicted churn AND actually churned) versus all other rows.
fig = plt.figure(figsize =(10, 7))
# Look the counts up by key so each slice always matches its label. A Counter
# iterates in first-seen order, so `results.values()` could come out as
# (True, False) and swap the hard-coded labels. A missing key counts as 0,
# which also keeps the number of slices equal to the number of labels.
plt.pie([float(results[False]), float(results[True])], labels = ['False','True'],pctdistance=1.1, labeldistance=1.2,autopct='%1.0f%%')


In [None]:
# Creating true positives and true negatives
tp = (predict_positive & actual_positive).sum()
tn = (predict_negative & actual_negative).sum()

# False positives and false negatives
fp = (predict_positive & actual_negative).sum()
fn = (predict_negative & actual_positive).sum()


In [None]:
fp,fn

In [None]:
#Creating the confusion matrix
confusion_matrix = np.array([
    [tn,fp],
    [fn,tp]
])

In [None]:
( confusion_matrix / confusion_matrix.sum() ).round(2) # each cell as a fraction of all validation rows; the diagonal (tn, tp) holds the correct predictions

In [None]:
(tp + tn) / ( tp + tn + fp + fn)

# 4.4 Precision and Recall

## Precision
- Fraction of positive predictions that are correct.

$$P=\frac{TP}{TP+FP}$$


## Recall
- Fraction of correctly identified positive examples.

- Only the actual positives ($y = 1$) are used: those predicted positive, $g(x_i) \geq t$ (TP), and those predicted negative, $g(x_i) < t$ (FN).

$$R= \frac{TP}{ \# Positive (OBS)} =\frac{TP}{TP+FN} $$
$$R= \frac{3}{4} = 75 \%$$

In [None]:
recall = tp/ (tp + fn)

In [None]:
recall

In [None]:
tp + fn

# 4.5 ROC CURVES

$$ FPR = \frac{FP}{TN+FP} $$


$$TPR= \frac{TP}{FN+TP}$$

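- Minimize FPR (as few false positives as possible among the actual negatives)
- Maximize TPR (as many true positives as possible among the actual positives)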

In [None]:
tpr = tp/(tp+fn)
tpr # same as recall

In [None]:
fpr = fp/(fp+tn)
fpr

In [None]:
# Confusion-matrix counts of the validation predictions at 101 evenly spaced
# thresholds between 0 and 1. Each entry of `scores` is (t, tp, fp, fn, tn).
thresholds = np.linspace(0, 1, 101)
scores = []

# The ground-truth masks do not depend on the threshold, so build them once.
actual_positive = (y_val == 1)
actual_negative = (y_val == 0)

for t in thresholds:
    # Predicted class at this threshold.
    predict_positive = (y_pred >= t)
    predict_negative = (y_pred < t)

    # Correct predictions: true positives and true negatives.
    tp = np.sum(predict_positive & actual_positive)
    tn = np.sum(predict_negative & actual_negative)

    # Mistakes: false positives and false negatives.
    fp = np.sum(predict_positive & actual_negative)
    fn = np.sum(predict_negative & actual_positive)

    scores.append((t, tp, fp, fn, tn))

In [None]:
columns = ['threshold', 'tp','fp','fn','tn']
df_scores = pd.DataFrame(scores,columns=columns) # Creating a data frame with pandas

In [None]:
df_scores['tpr'] = df_scores.tp / (df_scores.tp + df_scores.fn)
df_scores['fpr'] = df_scores.fp / (df_scores.fp + df_scores.tn)

df_scores[::10] # Each tenth record

In [None]:

# Ploting the results

plt.plot(df_scores.threshold, df_scores['tpr'], label='TPR',color="green")
plt.plot(df_scores.threshold, df_scores['fpr'], label='FPR',color="red")
plt.legend()

# Minimize FPR , Maximize TPR

# Random Model

In [None]:
np.random.seed(1)
y_rand = np.random.uniform(0,1,size=len(y_val))

y_rand.round(3)

In [None]:
((y_rand >= 0.5) == y_val).mean()

In [None]:
def tpr_fpr_dataframe(y_val, y_pred, thresholds=None):
    """Tabulate confusion-matrix counts, TPR and FPR over a grid of thresholds.

    Parameters
    ----------
    y_val : array-like of 0/1
        Ground-truth labels; 1 is the positive class.
    y_pred : array-like of float
        Predicted scores. A row is predicted positive when its score is
        greater than or equal to the threshold.
    thresholds : iterable of float, optional
        Thresholds to evaluate. Defaults to 101 evenly spaced values in
        [0, 1], which is the grid this function always used before the
        parameter existed.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with columns ``threshold``, ``tp``, ``fp``,
        ``fn``, ``tn``, ``tpr`` (tp / (tp + fn)) and ``fpr`` (fp / (fp + tn)).
        A rate is NaN when its denominator is zero, i.e. when ``y_val``
        contains no positives (tpr) or no negatives (fpr).
    """
    # Accept plain lists / Series as well as arrays: the element-wise
    # comparisons below only work on array-likes.
    y_val = np.asarray(y_val)
    y_pred = np.asarray(y_pred)

    if thresholds is None:
        thresholds = np.linspace(0, 1, 101)

    # The ground-truth masks do not depend on the threshold, so build them once.
    actual_positive = (y_val == 1)
    actual_negative = (y_val == 0)

    scores = []
    for t in thresholds:
        predict_positive = (y_pred >= t)
        predict_negative = (y_pred < t)

        # Correct predictions: true positives and true negatives.
        tp = (predict_positive & actual_positive).sum()
        tn = (predict_negative & actual_negative).sum()

        # Mistakes: false positives and false negatives.
        fp = (predict_positive & actual_negative).sum()
        fn = (predict_negative & actual_positive).sum()

        scores.append((t, tp, fp, fn, tn))

    columns = ['threshold', 'tp', 'fp', 'fn', 'tn']
    df_scores = pd.DataFrame(scores, columns=columns)
    df_scores['tpr'] = df_scores.tp / (df_scores.tp + df_scores.fn)
    df_scores['fpr'] = df_scores.fp / (df_scores.fp + df_scores.tn)

    return df_scores


In [None]:
df_rand = tpr_fpr_dataframe(y_val,y_rand)

In [None]:
df_rand[::10]

In [None]:
# TPR and FPR of the random baseline as a function of the threshold.
# The x-axis comes from df_rand itself, not df_scores, so the curve is always
# plotted against the thresholds it was actually computed for.
plt.plot(df_rand.threshold, df_rand['tpr'], label='TPR',color="green")
plt.plot(df_rand.threshold, df_rand['fpr'], label='FPR',color="red")
plt.legend()

In [None]:
num_neg = (y_val == 0 ).sum()
num_pos = (y_val == 1 ).sum()
num_pos,num_neg

In [None]:
y_ideal=np.repeat([0,1],[num_neg,num_pos])

y_ideal

In [None]:
y_ideal_pred = np.linspace(0,1,len(y_val))

In [None]:
# The ideal scores are sorted with all negatives first, so they separate the
# classes exactly when the cut falls between the last negative and the first
# positive, i.e. at the share of negatives in the validation set. Deriving the
# cut from y_val replaces the hard-coded 0.726, which is only correct for one
# particular split of the data.
ideal_threshold = 1 - y_val.mean()
( (y_ideal_pred >= ideal_threshold) == y_ideal ).mean() # perfect model: accuracy 1.0

In [None]:
df_ideal = tpr_fpr_dataframe(y_ideal,y_ideal_pred)

In [None]:
df_ideal

In [None]:
plt.plot(df_scores.threshold, df_scores['tpr'], label='TPR', color='black')
plt.plot(df_scores.threshold, df_scores['fpr'], label='FPR', color='blue')

plt.plot(df_ideal.threshold, df_ideal['tpr'], label='TPR ideal')
plt.plot(df_ideal.threshold, df_ideal['fpr'], label='FPR ideal')

# plt.plot(df_rand.threshold, df_rand['tpr'], label='TPR random', color='grey')
# plt.plot(df_rand.threshold, df_rand['fpr'], label='FPR random', color='grey')

plt.legend()

In [None]:
plt.figure(figsize=(5,5))

plt.plot(df_scores.fpr,df_scores.tpr, label='model')
plt.plot([0,1],[0,1], label='random')
plt.plot(df_ideal.fpr,df_ideal.tpr, label='ideal')

plt.xlabel('FPR')
plt.ylabel('TPR')


print("Curve should be as close as possible to the ideal model")

plt.legend()

In [None]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_val, y_pred)

plt.figure(figsize=(5, 5))

plt.plot(fpr, tpr, label='Model')
plt.plot([0, 1], [0, 1], label='Random', linestyle='--')

plt.xlabel('FPR')
plt.ylabel('TPR')

plt.legend()

In [None]:
from sklearn.metrics import auc # for any curve 

auc(fpr,tpr)

In [None]:
auc(df_ideal.fpr,df_ideal.tpr)

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_val,y_pred)

In [None]:
import random

# Probabilistic reading of AUC: the chance that a randomly drawn positive
# example gets a higher score than a randomly drawn negative one, estimated
# here by repeated sampling.

# Seed the generator so the estimate is the same on every run of the notebook.
random.seed(1)

# Predicted scores of the actual negatives and the actual positives.
neg = y_pred[y_val == 0]
pos = y_pred[y_val == 1]

n = 100000
success = 0

for i in range(n):
    # random.randint includes both end points, hence the "- 1".
    pos_ind = random.randint(0, len(pos) - 1)
    neg_ind = random.randint(0, len(neg) - 1)
    if pos[pos_ind] > neg[neg_ind]:
        success = success + 1

# Fraction of draws in which the positive outscored the negative.
success / n


In [None]:
n= 50000
np.random.seed(1)
pos_ind = np.random.randint(0,len(pos),size=n)
neg_ind = np.random.randint(0,len(neg),size=n)

In [None]:

(pos[pos_ind] > neg[neg_ind]).mean()

Counter(pos[pos_ind] > neg[neg_ind])

In [None]:

(pos[pos_ind] > neg[neg_ind]).mean()

# Cross-Validation

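- Split the data into 2 parts (full_train and test); full_train is then divided into k folds, and each fold is used once for validation while the model is trained on the remaining folds.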

In [80]:
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic-regression model on a training frame.

    Parameters
    ----------
    df_train : DataFrame
        Must contain the columns listed in the notebook-level ``categorical``
        and ``numerical`` lists; other columns are ignored.
    y_train : array-like
        Target values passed to ``model.fit``.
    C : float, default 1.0
        Forwarded to ``LogisticRegression(C=...)``.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted vectorizer and the fitted model.
    """
    feature_columns = categorical + numerical
    records = df_train[feature_columns].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(dv.fit_transform(records), y_train)

    return dv, model

In [None]:
dv,model = train(df_train,y_train,C=0.0001) # Smaller C means stronger Regularization

In [None]:
def predict(df, dv, model):
    """Return the positive-class probability for every row of ``df``.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns listed in the notebook-level ``categorical``
        and ``numerical`` lists; other columns are ignored.
    dv : fitted vectorizer
        Used with ``transform`` only, so it is not re-fitted here.
    model : fitted classifier exposing ``predict_proba``.

    Returns
    -------
    1-D array
        Column 1 of ``model.predict_proba``, one value per row of ``df``.
    """
    records = df[categorical + numerical].to_dict(orient='records')
    probabilities = model.predict_proba(dv.transform(records))
    return probabilities[:, 1]

In [None]:
y_pred = predict(df_val,dv,model)

In [None]:
from sklearn.model_selection import KFold

# take the data set,  and splitting it into 10 parts

In [None]:
kfold = KFold(n_splits=10,shuffle=True,random_state=1)

In [None]:
train_idx,val_idx= next(kfold.split(df_full_train))

In [None]:
len(train_idx),len(val_idx)

In [None]:
len(df_full_train)

In [None]:
from tqdm.auto import tqdm

In [None]:
# K-fold cross-validation over a grid of regularization strengths C.
# For every C the model is trained on n_splits - 1 folds of df_full_train and
# scored (ROC AUC) on the held-out fold; the mean and standard deviation of the
# fold scores are printed.
#
# Fold-local names (df_fold_train, df_fold_val, fold_pred) are used so that the
# loop does not overwrite the notebook's df_train / df_val / y_train / y_val /
# y_pred, and the fold score is not stored in a variable called `auc`, which
# would replace the `auc` function imported from sklearn.metrics.

n_splits = 5

# With shuffle=True and a fixed integer random_state, every call to split()
# yields the same folds, so one splitter serves all values of C.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

for C in tqdm([0.001, 0.01, 0.1, 0.5, 1, 5, 10]):
    scores = []  # fold AUCs for the current C only
    for train_idx, val_idx in kfold.split(df_full_train):
        df_fold_train = df_full_train.iloc[train_idx]
        df_fold_val = df_full_train.iloc[val_idx]

        # df_full_train still carries the target column; train() and predict()
        # only read the feature columns.
        dv, model = train(df_fold_train, df_fold_train.churn.values, C=C)
        fold_pred = predict(df_fold_val, dv, model)

        scores.append(roc_auc_score(df_fold_val.churn.values, fold_pred))
    print("C=%s %.3f + - %.3f " %(C,np.mean(scores),np.std(scores)))

In [81]:
# Final model: train on the full training set (train + validation) with C=1.0
# and evaluate once on the held-out test set.
dv, model = train(df_full_train, df_full_train.churn.values, C=1.0)
y_pred = predict(df_test, dv, model)

# Stored as `test_auc` rather than `auc` so the `auc` function imported from
# sklearn.metrics earlier in the notebook is not overwritten.
test_auc = roc_auc_score(y_test, y_pred)
test_auc

0.8572386167896259

In [77]:
df_full_train.shape

(5634, 21)

In [73]:
df_full_train.churn.values.shape

(5634,)

In [82]:
# Mean and standard deviation of the fold AUCs currently held in `scores`.
# NOTE(review): the cross-validation loop resets `scores` for every C, so this
# summarises the last C of its grid, presumably not the C=1.0 used for the
# final model. Confirm which C this is meant to describe.
round(np.mean(scores),3),round(np.std(scores),3)


(0.841, 0.007)