<a href="https://colab.research.google.com/github/carlos-alves-one/-ML-Zoomcamp-Week-5/blob/main/ML_Zoomcamp_Week_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Goldsmiths University of London
**Author....: Carlos Manuel de Oliveira Alves**<br>
**Student..: cdeol003**<br>
**Created..: 03/10/2022**

In [1]:
# Import all necessary libraries that we will use in this project
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Import the library warnings to ignore the warnings from the system
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Data preparation

# Load the raw churn dataset from the CSV file into a dataframe
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Normalise the column names: lower case, underscores instead of spaces
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Names of the columns stored with the object dtype (text columns)
categorical_columns = df.dtypes[df.dtypes == 'object'].index.tolist()

# Apply the same normalisation to the values of every text column
for col in categorical_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

# Parse totalcharges as a number; values that cannot be parsed become NaN
# (errors='coerce') and are then replaced with zero
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Encode the target as an integer: 1 where churn is 'yes', 0 otherwise
df['churn'] = (df['churn'] == 'yes').astype(int)

In [3]:
# Hold out 20% of the rows as the test set; the remaining 80% form the
# full train set. A fixed random_state makes the split reproducible.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

In [4]:
# Numerical feature columns used by the model
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

# Categorical feature columns used by the model
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

In [5]:
# Create a function for training
def train(data_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic regression model on training data.

    Parameters
    ----------
    data_train : dataframe containing at least the columns listed in the
        module-level ``categorical`` and ``numerical`` lists.
    y_train : target values, one per row of ``data_train``.
    C : value forwarded to ``LogisticRegression(C=...)``; defaults to 1.0.

    Returns
    -------
    tuple ``(dv, model)`` with the fitted vectorizer and the fitted model.
    """
    # One dictionary per row, restricted to the feature columns
    feature_columns = categorical + numerical
    records = data_train[feature_columns].to_dict(orient='records')

    # Dense (non-sparse) vectorizer, fitted and applied in a single step
    dv = DictVectorizer(sparse=False)
    features = dv.fit_transform(records)

    # max_iter=1000 is the iteration limit given to the solver
    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(features, y_train)

    return dv, model

In [7]:
# Create a function for predict
def predict(data, dv, model):
    """Return the model's predicted probability for the second class.

    Parameters
    ----------
    data : dataframe containing at least the columns listed in the
        module-level ``categorical`` and ``numerical`` lists.
    dv : vectorizer exposing ``transform`` (as returned by ``train``).
    model : model exposing ``predict_proba`` (as returned by ``train``).

    Returns
    -------
    The second column (index 1) of ``model.predict_proba`` evaluated on the
    vectorized rows of ``data``.
    """
    # One dictionary per row, restricted to the feature columns
    records = data[categorical + numerical].to_dict(orient='records')

    # Vectorize with the already-fitted vectorizer, then keep column 1 of
    # the probability matrix
    return model.predict_proba(dv.transform(records))[:, 1]

In [8]:
# Settings for the cross-validation run
n_splits = 5  # number of folds
C = 1.0       # logistic regression regularisation parameter

In [13]:
# K-fold cross-validation of the logistic regression model on the full train
# set. Shuffling with a fixed seed keeps the fold assignment reproducible.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

# ROC AUC obtained on the validation part of each fold
scores = []

# kfold.split yields positional row indices, hence the use of iloc below
for train_idx, val_idx in kfold.split(df_full_train):

    # Rows used for fitting and for validation in this fold
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    # Target vectors for the two parts
    y_train = df_train.churn.values
    y_val = df_val.churn.values

    # Train with the configured C (previously hard-coded to 1), so the model
    # that is fitted always matches the C value printed in the summary line
    dv, model = train(df_train, y_train, C=C)

    # Predicted probabilities for the validation part
    y_pred = predict(df_val, dv, model)

    # Score this fold and keep the result
    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)

# Print the mean score and standard deviation across the folds
print(f'C={C} {np.mean(scores):.3f} +- {np.std(scores):.3f}')

C=1.0 0.840 +- 0.008
