In [8]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from tqdm.auto import tqdm

In [2]:
DATA_PATH = "../data/telco.csv"
OUTPUT_PATH = "../models/model_reg=%s.bin"

numeric_variables = ["tenure", "monthlycharges", "totalcharges"]
categorical_variables = [
    "gender",
    "seniorcitizen",
    "partner",
    "dependents",
    "phoneservice",
    "multiplelines",
    "internetservice",
    "onlinesecurity",
    "onlinebackup",
    "deviceprotection",
    "techsupport",
    "streamingtv",
    "streamingmovies",
    "contract",
    "paperlessbilling",
    "paymentmethod",
]

In [3]:
# Read the data from DATA_PATH into a pandas dataframe
df_data = pd.read_csv(DATA_PATH)

In [4]:
# Make all the dataframe's column names lower case and replace spaces with underscores
df_data.columns = [col.lower().replace(" ", "_") for col in df_data.columns]

# Cast the totalcharges column to numeric, coercing all non-numeric values into NaN
df_data.totalcharges = pd.to_numeric(df_data.totalcharges, errors="coerce")
# Fill all missing values in the totalcharges column with 0
df_data.totalcharges = df_data.totalcharges.fillna(0)

# Cast the churn column to numbers
df_data.churn = (df_data.churn == "Yes").astype(int)

In [9]:
# Split the data into full train and test sets
df_full_train, df_test = train_test_split(df_data, test_size=0.2, random_state=42)

# Create a validation set too - 25%
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Remove the label from the test set
y_test = df_test.pop("churn").values

In [7]:
n_splits: int = 5

# import KFold from the right library
from sklearn.model_selection import KFold

for next_reg_factor in tqdm([0.001, 0.01, 0.1, 0.5, 1, 5, 10]):
    # Perform k-fold cross validation for each next_reg_factor
    # Use a constant number of splits, n_splits
    # please
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Create a list to store the AUC scores for each fold
    auc_scores = []

    # Iterate over each fold
    for train_idx, val_idx in tqdm(kfold.split(df_train), total=n_splits):
        # Split the training data into training and validation sets
        df_train_kfold = df_train.iloc[train_idx]
        df_val_kfold = df_train.iloc[val_idx]

        # Remove the label from the training and validation sets
        y_train_kfold = df_train_kfold.pop("churn").values
        y_val_kfold = df_val_kfold.pop("churn").values

        # Create a DictVectorizer object and fit it to the training data
        dv = DictVectorizer(sparse=False)
        dv.fit(df_train_kfold.to_dict(orient="records"))

        # Transform the training and validation sets
        X_train_kfold = dv.transform(df_train_kfold.to_dict(orient="records"))
        X_val_kfold = dv.transform(df_val_kfold.to_dict(orient="records"))

        # Create a LogisticRegression object and fit it to the training data
        model = LogisticRegression(C=next_reg_factor, max_iter=1000)
        model.fit(X_train_kfold, y_train_kfold)

        # Calculate the AUC score on the validation data and append it to auc_scores
        y_pred_kfold = model.predict_proba(X_val_kfold)[:, 1]
        auc_scores.append(roc_auc_score(y_val_kfold, y_pred_kfold))

array([1.00000000e-03, 3.16227766e-02, 1.00000000e+00, 3.16227766e+01,
       1.00000000e+03])