# Churn prediction project

Logistic regression: $g(x_i) = \mathrm{sigmoid}(w_0 + w^T x_i)$ → outputs a real number between 0 and 1
$\mathrm{sigmoid}(z) = \dfrac{1}{1 + \exp(-z)}$


# Data preparation


In [None]:
# Source location of the Telco customer churn CSV used in this notebook.
url = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/refs/heads/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv"
# Download command, currently disabled.
# NOTE(review): it would save the file as ./churn.csv, while the loading cell
# reads ../../data/raw/churn.csv -- confirm the intended location.
#!curl -o churn.csv $url

In [None]:
import pandas
import numpy

# import sklearn

In [None]:
df = pandas.read_csv("../../data/raw/churn.csv")
df.head()

In [None]:
df.columns

In [None]:
# read all columns with their data (transpose col to row)
df.head().T

In [None]:
# Make column names uniform: lower-case everything.
df.columns = df.columns.str.lower()

# Columns whose dtype is "object" (the text columns of this dataset).
cols = df.dtypes[df.dtypes == "object"].index

for col in cols:
    # Lower-case the values and replace spaces with underscores.
    # regex=False makes the replacement a plain literal substitution, so the
    # result does not depend on the pandas version's default for `regex`.
    df[col] = df[col].str.lower().str.replace(" ", "_", regex=False)

# Preview the cleaned frame (last expression, so the notebook displays it).
df.head()

In [None]:
df.dtypes

In [None]:
# Convert the object-typed totalcharges column to numbers and repair its bad values.
# Show the rows that hold the "_" placeholder, which cannot be parsed as a number.
print(df.loc[df["totalcharges"] == "_", "totalcharges"])

# errors="coerce" turns every unparseable entry into NaN instead of raising.
df["totalcharges"] = pandas.to_numeric(df["totalcharges"], errors="coerce")

# Spot-check a single row by position.
# NOTE(review): presumably row 488 is one of the formerly bad rows -- confirm.
print(df["totalcharges"].iloc[488])
# Replace the NaN values produced above with 0.
df["totalcharges"] = df["totalcharges"].fillna(0)

# Display the rows whose total charge is now 0.
df.loc[df["totalcharges"] == 0, "totalcharges"]

In [None]:
# Target before encoding.
print(df["churn"].head())

# Encode the target as integers: 1 where the value is "yes", 0 otherwise.
df["churn"] = (df["churn"] == "yes").astype(int)

# Target after encoding.
df["churn"].head()

# Setting Up The Validation Framework

perform the train/validation/test split with scikit-learn


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test set; random_state=1 fixes the shuffle
# so the split is reproducible.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
# Sizes of the two parts.
len(df_full_train), len(df_test)

In [None]:
# The validation set should also be 20% of the whole data. Relative to the
# remaining 80% (df_full_train) that is 20/80 = 1/4 = 0.25.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
# Sizes of the train, test and validation parts (in that order).
len(df_train), len(df_test), len(df_val)

In [None]:
# Give each split a fresh 0..n-1 index; drop=True discards the old index
# instead of keeping it as a column.
df_train, df_val, df_test = (
    frame.reset_index(drop=True) for frame in (df_train, df_val, df_test)
)

In [None]:
# Extract the target of each split as a plain array (`.values`).
y_train, y_val, y_test = (
    frame["churn"].values for frame in (df_train, df_val, df_test)
)

In [None]:
# Remove the target column from every feature frame so it cannot be used
# as an input feature by accident.
for _frame in (df_train, df_val, df_test):
    del _frame["churn"]

In [None]:
df_train.head()

# EDA (Exploratory data analysis)

Checking missing values
Looking at the distribution of the target variable (churn)
Looking at numerical and categorical variables


In [None]:
df_full_train.isnull().sum()

In [None]:
# Absolute number of rows per churn class.
print(df_full_train.churn.value_counts())
# The same counts as fractions: the share of churned vs. non-churned rows.
df_full_train.churn.value_counts(normalize=True)

In [None]:
# churn was encoded as 0/1 earlier, so its mean is the fraction of churned
# customers in df_full_train.
global_churn_rate = df_full_train.churn.mean()
round(global_churn_rate, 2)

In [None]:
# Feature lists used by the analysis and the model below.
numerical = ["tenure", "monthlycharges", "totalcharges"]

# Categorical features. The line grouping is only for readability and is
# based on the column names; the order of the list is what matters.
categorical = [
    "gender", "seniorcitizen", "partner", "dependents",
    "phoneservice", "multiplelines",
    "internetservice", "onlinesecurity", "onlinebackup", "deviceprotection",
    "techsupport", "streamingtv", "streamingmovies",
    "contract", "paperlessbilling", "paymentmethod",
]

# Print the dtypes and the column names as a reference for the lists above.
print("*df_full_train.dtypes:\n", df_full_train.dtypes)
print("*df_full_train.columns:\n", df_full_train.columns)

In [None]:
df_full_train[categorical].nunique()

# Churn Rate

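(The difference and the risk ratio are two important tools for assessing feature importance.)
difference = global churn rate − group churn rate
negative result (< 0): the group is more likely to churn
positive result (> 0): the group is less likely to churn
(Note: the per-feature tables computed later use group − global, so the sign is reversed there.)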


In [None]:
# Compare each gender's churn rate with the global rate to see how much
# the feature affects churn.

churn_female = df_full_train.loc[df_full_train.gender == "female", "churn"].mean()
print(churn_female)
churn_male = df_full_train.loc[df_full_train.gender == "male", "churn"].mean()
print(churn_male)

print("")
# Difference global - group, one line per gender.
# Original note: this feature has no effect on churn.
print(global_churn_rate - churn_female, global_churn_rate - churn_male, sep="\n")

In [None]:
# Number of customers with and without a partner.
print(df_full_train.partner.value_counts())
print("")

# Churn rate inside each partner group.
churn_partner = df_full_train.loc[df_full_train.partner == "yes", "churn"].mean()
print(churn_partner)
churn_no_partner = df_full_train.loc[df_full_train.partner == "no", "churn"].mean()
print(churn_no_partner)

print("")

# Difference global - group: first with a partner, then without.
# Original note: the second result is negative, i.e. that group is more
# likely to churn.
print(global_churn_rate - churn_partner, global_churn_rate - churn_no_partner, sep="\n")

# Risk Ratio

(measures the importance of features)
risk = group churn rate / global churn rate
a group with risk > 1 is more likely to churn
a group with risk < 1 is less likely to churn


In [None]:
# Risk ratio (group rate / global rate): first for customers with a partner,
# then for customers without one.
print(
    churn_partner / global_churn_rate,
    churn_no_partner / global_churn_rate,
    sep="\n",
)

Let’s take the data and group it by gender, and for each variable within the gender group, let’s calculate the average churn rate within that group and calculate the difference and risk. We can perform this analysis for all the variables, not just the gender variable.

The SQL query would look like:

SELECT
gender,
AVG(churn),
AVG(churn) - global_churn AS diff,
AVG(churn) / global_churn AS risk
FROM
data
GROUP BY
gender;


df.groupby('x').y.agg(['mean']) - returns a dataframe with the mean of the y series grouped by the x series


In [None]:
from IPython.display import display

# For every categorical feature, build a table with the churn rate and the
# size of each group, plus two comparisons against the global churn rate:
#   diff = group rate - global rate
#   risk = group rate / global rate
for c in categorical:
    df_group = df_full_train.groupby(c)["churn"].agg(["mean", "count"])
    df_group = df_group.assign(
        diff=df_group["mean"] - global_churn_rate,
        risk=df_group["mean"] / global_churn_rate,
    )
    display(df_group)
    print()

# Feature importance: Mutual information

(How much do we learn about churn if we know the value of a particular feature? Mutual information answers this question, so it is a measure of the importance of a categorical variable.)


In [None]:
from sklearn.metrics import mutual_info_score

# Mutual information between churn and contract, with the arguments passed
# in both orders.
print(mutual_info_score(df_full_train["churn"], df_full_train["contract"]))
print("contract", mutual_info_score(df_full_train["contract"], df_full_train["churn"]))

# Original note: we learn nothing about churn if we know the gender.
print("gender", mutual_info_score(df_full_train["churn"], df_full_train["gender"]))

print("partner", mutual_info_score(df_full_train["churn"], df_full_train["partner"]))

In [None]:
def mutual_info_churn_score(series):
    """Return the mutual information between `series` and the churn target.

    The target is read from the notebook-level `df_full_train` frame.
    """
    target = df_full_train.churn
    return mutual_info_score(series, target)


# Score every categorical column against churn, then list the scores from
# the highest to the lowest.
mi = df_full_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

# Feature importance: Correlation

(Correlation coefficient: measures the degree of dependency between two variables. This value is negative if one variable grows while the other decreases, and it is positive if both variables increase. Depending on its size, the dependency between both variables could be low, moderate, or strong. It allows measuring the importance of numerical variables)


In [None]:
# Correlation of each numerical feature with the churn target
# (pandas DataFrame.corrwith).
df_full_train[numerical].corrwith(df_full_train.churn)
# Author's reading of the result:
# - churn decreases as tenure increases
# - churn increases as monthlycharges increases

In [None]:
# Churn rate by tenure bucket. The original note gives the first threshold
# as "2 months".
# tenure <= 2
print(df_full_train.loc[df_full_train.tenure <= 2, "churn"].mean())

# tenure > 2
print(df_full_train.loc[df_full_train.tenure > 2, "churn"].mean())

# 2 < tenure <= 12
print(
    df_full_train.loc[
        (df_full_train.tenure > 2) & (df_full_train.tenure <= 12), "churn"
    ].mean()
)

# tenure > 12
print(df_full_train.loc[df_full_train.tenure > 12, "churn"].mean())

In [None]:
# Mean churn for the rows whose monthlycharges falls into each range.
# monthlycharges <= 20
print(df_full_train.loc[df_full_train.monthlycharges <= 20, "churn"].mean())

# 20 < monthlycharges <= 50
print(
    df_full_train.loc[
        (df_full_train.monthlycharges > 20) & (df_full_train.monthlycharges <= 50),
        "churn",
    ].mean()
)

# monthlycharges > 50
print(df_full_train.loc[df_full_train.monthlycharges > 50, "churn"].mean())

# One-Hot Encoding

(encode categorical features to binary)


In [None]:
# create one new field per unique (non-numeric) value of a column, then fill it with 0/1
# if the value of a column is numeric, it is left as it is
from sklearn.feature_extraction import DictVectorizer

# ex.
# print(df_train[["gender", "contract"]].iloc[:50])
# train_dicts = df_train[["gender", "contract"]].iloc[:50].to_dict(orient="records")
# train_dicts

In [None]:
# Turn every training row into a {column: value} dict, the input format
# DictVectorizer expects.
train_dicts = df_train[categorical + numerical].to_dict(orient="records")

# sparse=False makes the vectorizer return a dense array.
dv = DictVectorizer(sparse=False)

# Fit the vectorizer on the training rows and encode them in one step.
X_train = dv.fit_transform(train_dicts)
print(dv.feature_names_)
print(X_train.shape)
print(X_train[0])

# Encode the validation rows with the already-fitted vectorizer
# (transform only, no re-fitting).
val_dicts = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dicts)

# Logistic Regression

(same formula as linear regression, but the result is between 0 and 1 instead of between -infinity and +infinity)


In [None]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)) of `z`.

    `z` is negated and passed to `numpy.exp`, so it may be a scalar or a
    numpy array; the result is a value (or values) between 0 and 1.
    """
    exp_neg_z = numpy.exp(-z)
    return 1 / (1 + exp_neg_z)

In [None]:
# Example: evaluate the sigmoid on 51 evenly spaced points from -7 to 7.
z = numpy.linspace(start=-7, stop=7, num=51)
print(z)
sigmoid(z)

In [None]:
import matplotlib.pyplot as plt

plt.plot(z, sigmoid(z))

In [None]:
# ex
# def logistic_regression(xi):
#     score = w0

#     for j in range(len(w)):
#         score = score + xi[j] * w[j]

#     result = sigmoid(score)
#     return result

# Training Logistic Regression with Scikit-Learn


In [None]:
from sklearn.linear_model import LogisticRegression

# Create the model with scikit-learn's default settings.
model = LogisticRegression()
# Train the logistic regression model on the encoded training matrix and
# its churn labels.
model.fit(X_train, y_train)

In [None]:
# return the bias or intercept of the LR model
model.intercept_[0]

In [None]:
# return the coefficients or weights of the LR model
model.coef_[0].round(3)

In [None]:
# Soft predictions on the validation set: predict_proba returns one
# probability column per class, and column 1 (the second class, churn = 1)
# is kept as the predicted churn probability of each row.
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

In [None]:
# Hard decision: flag a customer as churning when the predicted probability
# is at least 0.5.
churn_decision = y_pred >= 0.5
# Customer ids of the flagged rows (the customers to send the email to).
df_val.loc[churn_decision, "customerid"]

In [None]:
# Validate the model: the fraction of validation rows where the hard
# decision equals the true label (accuracy).
numpy.mean(y_val == churn_decision)

# Model interpretation


In [None]:
# return the bias or intercept of the LR model
model.intercept_[0]

In [None]:
# return the coefficients or weights of the LR model
model.coef_[0].round(3)