<a href="https://colab.research.google.com/github/carlos-alves-one/-ML-Zoomcamp-Week-4/blob/main/ML_Zoomcamp_Week_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Goldsmiths University of London
**Author....: Carlos Manuel de Oliveira Alves**<br>
**Student..: cdeol003**<br>
**Created..: 27/09/2022**

In [23]:
# Import libraries for the project
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Import the library warnings to ignore the warnings
import warnings
warnings.filterwarnings("ignore")

In [24]:
# Let's train the model again first, so that its results can be used later in this notebook

# Import packages from Sklearn for the project
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [25]:
# Data import and preparation

# Load the raw customer churn export
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Entries of TotalCharges that cannot be parsed as numbers become NaN and are then set to 0
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

# Normalise the column names: lower case, underscores instead of spaces
df.columns = [name.lower().replace(' ', '_') for name in df.columns]

# Collect the columns stored with the object dtype (the text columns)
string_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

# Apply the same normalisation to the values of every text column
for col in string_columns:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(' ', '_')

# Encode the target as an integer: 1 where churn is 'yes', 0 otherwise
df['churn'] = df['churn'].eq('yes').astype(int)

In [26]:
# Setting up the validation framework

# Hold out 20% of the rows as the test set
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

# Split the remaining rows again: 33% of them form the validation set
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

# pop() returns the target column and removes it from the feature frame in one step
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values

In [27]:
# Define the list of categorical and numerical variables

# Columns used as categorical features
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

# Columns used as numerical features
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

In [28]:
# Convert the selected training columns into a list of dictionaries, one per row
train_dict = df_train[categorical + numerical].to_dict('records')

# Build a DictVectorizer that returns dense arrays and fit it on the
# training dictionaries (fit returns the vectorizer itself)
dv = DictVectorizer(sparse=False).fit(train_dict)

# Encode the training dictionaries as the feature matrix
X_train = dv.transform(train_dict)

In [29]:
# Logistic regression classifier with the liblinear solver and a fixed seed
model = LogisticRegression(random_state=1, solver='liblinear')

# Train the classifier on the encoded training features and the churn labels
model.fit(X_train, y_train)

LogisticRegression(random_state=1, solver='liblinear')

In [30]:
# One dictionary per validation row, restricted to the same feature columns
val_dict = df_val[categorical + numerical].to_dict('records')

# Encode the validation rows with the vectorizer fitted on the training data
X_val = dv.transform(val_dict)

# Keep column index 1 (the second column) of the predicted probabilities
y_pred = model.predict_proba(X_val)[:, 1]

In [31]:
# Reduced set of features for a smaller model
small_subset = ['contract', 'tenure', 'totalcharges']

# One dictionary per training row, restricted to the reduced feature set
train_dict_small = df_train[small_subset].to_dict('records')

# Dense DictVectorizer fitted on the reduced training dictionaries
# (fit returns the vectorizer itself)
dv_small = DictVectorizer(sparse=False).fit(train_dict_small)

# Encode the reduced training dictionaries as the feature matrix
X_small_train = dv_small.transform(train_dict_small)

# Logistic regression with the same settings as the full model
model_small = LogisticRegression(random_state=1, solver='liblinear')

# Train the small model on the reduced feature matrix
model_small.fit(X_small_train, y_train)

LogisticRegression(random_state=1, solver='liblinear')

In [32]:
# One dictionary per validation row, restricted to the reduced feature set
val_dict_small = df_val[small_subset].to_dict('records')

# Encode the validation rows with the vectorizer fitted for the small model
X_small_val = dv_small.transform(val_dict_small)

# Keep column index 1 (the second column) of the small model's predicted probabilities
y_pred_small = model_small.predict_proba(X_small_val)[:, 1]

In [33]:
# Accuracy:

# Score the validation set and keep column index 1 of the predicted probabilities
y_pred = model.predict_proba(X_val)[:, 1]

# Hard decision: flag churn when the predicted probability is 0.5 or more
churn = (y_pred >= 0.5)

# Accuracy is the fraction of validation rows where the decision equals the label
np.mean(churn == y_val)

0.8016129032258065

In [34]:
# Accuracy and dummy model:

# Next steps: evaluate the model on different thresholds
# and check the accuracy of dummy baselines

# Number of customers in the validation labels
y_val.shape[0]

1860

In [36]:
# There are 1860 customers in the validation set and a prediction is made for each:
# every customer is assigned a score, and the score is turned into a decision.
# Some of these decisions are correct and some are incorrect.

# Count the decisions that agree with the actual label
np.sum(y_val == churn)

1491

In [None]:
# So we have 1,491 customers with the correct decision

# Some of our decisions are not correct; the accuracy is calculated as:
# total number of correct decisions (predictions) divided by the total number of customers
# In our case that is 80%

In [43]:
# Accuracy as a whole-number percentage, computed from the validation labels
# and the 0.5-threshold decisions rather than from numbers copied by hand out
# of earlier cell outputs (those go stale if the data, split or model changes)
print(f"{(y_val == churn).mean():.0%}")

80%


In [44]:
# 21 evenly spaced candidate thresholds covering 0 to 1 inclusive (step 0.05)
thresholds = np.linspace(start=0, stop=1, num=21)
thresholds

array([0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ,
       0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ])

In [45]:
# Treat each of the values above as a decision threshold
for t in thresholds:

  # Flag churn whenever the predicted probability reaches the current threshold t
  # (this replaces the fixed cut-off used before: churn = y_pred >= 0.5)
  churn_decision = (y_pred >= t)

  # Accuracy at this threshold: fraction of validation rows decided correctly
  accuracy = (y_val == churn_decision).mean()

  # A bare expression inside a loop body is not displayed by the notebook,
  # so print the threshold together with its accuracy explicitly
  print('%.2f %.3f' % (t, accuracy))