<a href="https://colab.research.google.com/github/alerodriguessf/predicting-churn-in-streaming-service/blob/main/Portfolio_Predicting_Churn_in_Streaming_Service_Logistic_Regression_20250117.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting Churn in Streaming Service using Logistic Regression


In [None]:
# Install the extra packages used below, then import the fundamental libraries for data analysis.
# NOTE(review): no versions are pinned on these installs, so the environment may differ between runs.
!pip install ydata-profiling
!pip install --upgrade numba
!pip install sidetable

import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
import sidetable
import missingno as msno
from ipywidgets import interact, widgets

from sklearn import datasets
from sklearn.preprocessing import scale, minmax_scale, power_transform

In [None]:
# Upload the challenge file

from google.colab import files

# Keep the value returned by the upload helper in `uploaded`.
# NOTE(review): presumably this opens a file picker and only works inside Google Colab — confirm.
uploaded = files.upload()

## Step 01: Exploratory Data Analysis (Data Understanding)
1. Load the dataset;
2. Perform a statistical description of the data;
3. Check the data types;
4. Check the amount of missing values.


In [None]:
# Load the main dataset
#
# Read the workbook from the bytes returned by the upload cell instead of a
# hard-coded path: 'streaming_data (1).xlsx' only exists when a duplicate-name
# counter was appended to the file at upload/download time.
import io

# `uploaded` is created by the upload cell; tolerate its absence (or an empty
# upload) so the cell still works when the file is already on disk.
uploaded_files = globals().get('uploaded') or {}
excel_uploads = [name for name in uploaded_files if name.lower().endswith(('.xlsx', '.xls'))]

if excel_uploads:
    # Use the first Excel workbook that was uploaded in this session.
    df = pd.read_excel(io.BytesIO(uploaded_files[excel_uploads[0]]))
else:
    # Nothing suitable was uploaded: fall back to the original path.
    df = pd.read_excel('streaming_data (1).xlsx')

In [None]:
# Build the profiling report for the freshly loaded data.
# NOTE(review): `profile_report` is presumably attached to DataFrame by the ydata_profiling import — confirm.
df.profile_report()

In [None]:
# Check the data types (Step 01, item 3)
df.info()

In [None]:
# Check the amount of missing values in each column (Step 01, item 4)
df.isna().sum()

## Step 02: Data Preparation
1. Replace "NaN" values with 0 in specific columns;
2. Remove rows with null values in essential columns;
3. Map "churned" values to "No" and "Yes";
4. Convert float columns to integers.

In [None]:
# Replace missing values with 0 in specific columns
# NOTE(review): 'Churned' is later used as the model target, so this fill labels
# every row with an unknown outcome as 0 — confirm that this is intended.

columns_replace = [
    'Time_on_platform',
    'Num_streaming_services',
    'Churned',
    'Avg_rating',
    'Devices_connected',
]
zero_filled = df[columns_replace].fillna(value=0)
df[columns_replace] = zero_filled

In [None]:
# Drop, in place, every row that has no value in at least one essential column

essential_columns = ['Gender', 'Subscription_type', 'Age']
df.dropna(axis=0, subset=essential_columns, inplace=True)

In [None]:
# Relabel the numeric churn flag (0/1) with readable "No"/"Yes" values

churn_label_map = {0: 'No', 1: 'Yes'}
df['Churned'] = df['Churned'].replace(churn_label_map)

# Preview the first ten rows after the relabelling
df.head(10)

In [None]:
# Convert float columns to integers, one column at a time
# NOTE(review): an integer cast discards any fractional part (e.g. in 'Avg_rating')
# — confirm that this loss of precision is acceptable.

int_columns = [
    'Age',
    'Time_on_platform',
    'Num_streaming_services',
    'Avg_rating',
    'Devices_connected',
]
for column in int_columns:
    df[column] = df[column].astype(int)

# Check dataset information after data preparation
df.info()

In [None]:
# Explore possible outliers: box plot of every numeric column after scaling

from sklearn.preprocessing import scale
import matplotlib.pyplot as plt

# Select the numeric columns and coerce them through pd.to_numeric
numeric_cols = df.select_dtypes(include=['number']).columns
coerced_numeric = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
df[numeric_cols] = coerced_numeric

# Scale each column independently so all boxes share one axis
scaled_numeric = df[numeric_cols].apply(scale)
scaled_numeric.plot.box()

# Rotate the column names so they stay readable
plt.xticks(rotation=90, ha='right');

In [None]:
# Analyse correlations between numerical variables
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Pairwise correlation of every numeric column
numerical_df = df.select_dtypes(include=['number'])
correlation_matrix = numerical_df.corr()

# Annotated heatmap of the correlation matrix
sns.heatmap(data=correlation_matrix, annot=True)
plt.show()

## Step 03: Data Modelling - Logistic Regression
1. Define X (features) and y (target) variables;
2. Scale the data using MinMaxScaler;
3. Split the dataset into training and testing sets;
4. Fit the Logistic Regression model;
5. Evaluate the model using a confusion matrix and classification metrics.

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

In [None]:
# Define independent (X) and dependent (y) variables

feature_columns = ['Age', 'Devices_connected', 'Time_on_platform', 'Num_streaming_services', 'Avg_rating']
x = df[feature_columns]

# Encode the target with an explicit mapping followed by an integer cast.
# Series.replace on this object column relied on pandas silently downcasting the
# result to int (deprecated behaviour); without that downcast y stays
# object-typed, which scikit-learn classifiers reject as an unknown label type.
# Any label other than 'No'/'Yes' becomes NaN and makes the cast fail loudly
# instead of slipping into the model.
y = df['Churned'].map({'No': 0, 'Yes': 1}).astype(int)

In [None]:
# Split data into training and testing sets BEFORE scaling.
# Fitting the scaler on the full dataset lets the test rows influence the
# min/max used to transform the training data (data leakage). The split
# arguments are unchanged, so the same rows land in each subset as before.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Learn the scaling from the training rows only, then apply it to both subsets
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Kept so that any code expecting the whole scaled feature matrix still works
x_scaled = scaler.transform(x)

In [None]:
# Fit the baseline Logistic Regression model on the training split (fixed seed)

log_model = LogisticRegression(random_state=42)
log_model.fit(X=x_train, y=y_train)

In [None]:
# Predict labels for the test split with the baseline model
y_pred = log_model.predict(x_test)

# Confusion matrix of true vs predicted labels, drawn with the model's own class labels
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(conf_matrix, display_labels=log_model.classes_)
disp.plot(cmap='viridis')

# Classification report for the same predictions
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

## Step 04: Model Optimisation with Grid Search
1. Use GridSearchCV to find the best parameters;
2. Train the optimised model;
3. Evaluate the performance of the optimised model.

In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Logistic Regression:
# inverse regularisation strength (C) and optimisation solver
param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs'],
}

# Seed the estimator exactly like the baseline model and the Random Forest
# search (random_state=42) so the selected parameters are reproducible.
grid_search = GridSearchCV(LogisticRegression(random_state=42), param_grid, cv=5)
grid_search.fit(x_train, y_train)

# Best model found by Grid Search
best_model = grid_search.best_estimator_
print(f'best parameters: {grid_search.best_params_}')



In [None]:
# Evaluate the grid-searched model on the test split

y_pred = best_model.predict(x_test)

# Confusion matrix drawn with readable class names
label_names = ['No', 'Yes']
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(conf_matrix, display_labels=label_names)
disp.plot(cmap='viridis')

# Classification report for the same predictions
optimised_report = classification_report(y_test, y_pred)
print(optimised_report)

## Step 05: Modelling with Random Forest
1. Train a Random Forest model;
2. Optimise hyperparameters using Grid Search;
3. Evaluate the model with a confusion matrix and metrics.


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

# Fit a Random Forest with default hyperparameters and a fixed seed
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(x_train, y_train)

# Predict labels for the test split
y_pred_rf = rf_model.predict(x_test)

# Confusion matrix computed directly from the fitted estimator
ConfusionMatrixDisplay.from_estimator(estimator=rf_model, X=x_test, y=y_test)
plt.show()

# Classification report for the Random Forest predictions
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)


In [None]:
# Optimise hyperparameters for Random Forest: candidate values tried by the grid search
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 10],
}

# 5-fold cross-validated search over the grid, with a seeded base estimator
rf_base_estimator = RandomForestClassifier(random_state=42)
rf_grid_search = GridSearchCV(estimator=rf_base_estimator, param_grid=param_grid_rf, cv=5)
rf_grid_search.fit(x_train, y_train)

# Keep the best estimator found by the search and predict the test split with it
best_rf_model = rf_grid_search.best_estimator_
y_pred_best_rf = best_rf_model.predict(x_test)

# Evaluate the optimised model with a confusion matrix
ConfusionMatrixDisplay.from_estimator(best_rf_model, x_test, y_test)
plt.show()

In [None]:
# Show the hyperparameters selected by the Random Forest grid search
print("Best parameters:", rf_grid_search.best_params_)
# Classification report for the tuned Random Forest's test-split predictions
print(classification_report(y_test, y_pred_best_rf))


In [None]:
# Feature values for a single example user, in the same column order used for training
user_features = {
    'Age': 12,
    'Devices_connected': 4,
    'Time_on_platform': 1,
    'Num_streaming_services': 2,
    'Avg_rating': 1,
}
user_data = pd.DataFrame([user_features])

# Apply the already-fitted scaler to the user's features
user_data_scaled = scaler.transform(user_data)

# Probability estimates from the grid-searched logistic regression model.
# Column index 1 is taken as the churn class — presumably label 1 ("Yes") given
# the 0/1 target encoding; verify against best_model.classes_.
class_probabilities = best_model.predict_proba(user_data_scaled)
churn_probability = class_probabilities[0][1]

# Display the result
print(f"The predicted probability of churn for this user is: {churn_probability:.3%}")

