## Machine Learning Project: Customer Churn Predictions

In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Loading and displaying data from Telco Customer Churn (https://www.kaggle.com/datasets/blastchar/telco-customer-churn)

In [2]:
# Load the Telco Customer Churn export; the relative path means the CSV must
# sit in the notebook's working directory.
data_file = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
print(data_file.head())

# Check column names and data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
data_file.info()

# Summary statistics
print(data_file.describe())

# Check for missing values (count of nulls per column)
print(data_file.isnull().sum())

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

### Convert categorical columns into numbers
Using LabelEncoder

In [3]:
# Separate the features (x) from the target (y). drop() returns a new frame,
# so data_file itself is not modified by anything in this cell.
# customerID is dropped together with the target column.
x = data_file.drop(columns=["Churn", "customerID"])
y = data_file["Churn"]

# Encode the target labels as integers. `encoder` is fitted on Churn only, so
# encoder.classes_ / encoder.inverse_transform keep describing the target.
encoder = LabelEncoder()
y = encoder.fit_transform(y)

# Columns that receive one integer code per category.
# "Contract" and "PaymentMethod" are intentionally not listed: they are
# one-hot encoded below, and label-encoding them first would only replace the
# category names in the resulting column headers with opaque integers.
categorical_columns = ["gender", "Partner", "Dependents", "PhoneService", "OnlineSecurity", "DeviceProtection",
                       "TechSupport", "StreamingTV", "StreamingMovies", "PaperlessBilling", "MultipleLines",
                       "OnlineBackup"]  # Add all label-encoded column names here

for col in categorical_columns:
    # A fresh encoder per column, so the target encoder above is not refitted.
    x[col] = LabelEncoder().fit_transform(x[col])

# errors="coerce" turns entries that cannot be parsed as numbers into NaN;
# those are then replaced by 0. The result is assigned back to the column
# instead of calling fillna(inplace=True) on the x["TotalCharges"] selection,
# which does not update x when pandas Copy-on-Write is active.
x["TotalCharges"] = pd.to_numeric(x["TotalCharges"], errors="coerce").fillna(0)

# One-hot encode the remaining nominal columns. dtype=int applies to the new
# indicator columns only, so float columns already in x (e.g. TotalCharges)
# keep their fractional part instead of being truncated by a frame-wide
# astype(int).
x = pd.get_dummies(x, columns=["InternetService", "Contract", "PaymentMethod"], drop_first=True, dtype=int)

### Set train and test variables and implement the Logistic Regression model
Adjust the maximum number of iterations (`max_iter`) depending on the available computing capacity.

In [4]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
split_parts = train_test_split(x, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Build and fit the classifier in one step (fit() returns the estimator).
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predicted churn labels for the held-out rows
y_pred = model.predict(X_test)

# Fraction of held-out rows whose prediction matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

NameError: name 'accuracy_score' is not defined

### Report on the accuracy and precision of the model

In [None]:
# Predict once for the held-out rows and reuse the result for both reports,
# instead of running model.predict(X_test) separately for each of them.
test_predictions = model.predict(X_test)

# Confusion matrix of true labels (y_test) against the predictions
cm = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:\n", cm)

# Classification report (precision, recall, F1-score)
report = classification_report(y_test, test_predictions)
print("Classification Report:\n", report)


### Inject the Predicted Churn into the initial data set

In [None]:
# Translate the numeric predictions into text labels:
# 0 becomes "No", every other value becomes "Yes".
y_pred_mapped = np.where(y_pred != 0, "Yes", "No")

# Write the labels into data_file at the row labels of the test split only.
# NOTE(review): rows outside X_test.index receive no value here, so they are
# presumably left as NaN in the new column - confirm this is intended.
data_file.loc[X_test.index, "PredictedChurn"] = y_pred_mapped
print(data_file)