## Step 1: Install the necessary packages

In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
from sklearn.pipeline import make_pipeline
%matplotlib inline

## Step 2: Load the dataset

In [13]:
df=pd.read_csv('Telco-Customer-Churn.csv')

## Step 3: Display first few rows to understand the data set
The data has 21 columns

In [15]:
print(df.head())

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

## Step 4:Check for any missing values
 No missing values in this dataset

In [23]:
print("\nMissing values in each column:")
print(df.isnull().sum())


Missing values in each column:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


## Step 5:Check the data type of each column
Establish the respective columns data types

In [35]:
print(df.dtypes)

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object


## Step 6: Convert totacharges to numeric and handle nonnumeric values, Fill missing totalcharges with the median

In [45]:
df['TotalCharges']=pd.to_numeric(df['TotalCharges'],errors='coerce')
df['TotalCharges'].fillna(df['TotalCharges'].median())

0         29.85
1       1889.50
2        108.15
3       1840.75
4        151.65
         ...   
7038    1990.50
7039    7362.90
7040     346.45
7041     306.60
7042    6844.50
Name: TotalCharges, Length: 7043, dtype: float64

## Step 7:Drop unuseful columns, in this case drop Customer ID as it is not useful

In [55]:
df.drop('customerID',axis=1)

KeyError: "['customerID'] not found in axis"

## Step 8: Encode categorical variables

In [67]:
le=LabelEncoder()
categorical_cols=df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col]=le.fit_transform(df[col])

## Step 9: Exploratory Data Analysis(EDA) churn distribution

In [81]:
#Visualize churn distribution
plt.figure(figsize=(6,4))
sns.countplot(x='Churn',data=df)
plt.title('Churn Distribution')
plt.xlabel('Churn, (0=No, 1=Yes)')
plt.ylabel('Count')
plt.savefig('churn_distribution.png')
plt.close()

#Visualize tenure vs churn
plt.figure(figsize=(8,6))
sns.boxplot(x='Churn',y='tenure',data=df)
plt.title('Tenure vs Churn Distribution')
plt.xlabel('Churn, (0=No, 1=Yes)')
plt.ylabel('Tenure (Months)')
plt.savefig('tenure_vs_churn.png')
plt.close()

#Correlation heatmap, red implies positive correlation and blue negative
plt.figure(figsize=(12,8))
sns.heatmap(df.corr(),annot=False,cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.savefig('Correlation_heatmap.png')
plt.close()

## Step 10: Prepare data for modeling
1. Create a feature set which includes all columns except `Churn`
2. Set `Churn` as the target variable.
3. Split data into 80% training and 20% testing. `random_state=42` ensures reproducability.
4. Create a logistic regression model by scaling the data using `Standardscaler`allowing 1000 iterations to converge
5. Train the model
6. Predict `Churn` for the test set.
7. Calculate the percentage of correct predictions and print the accuracy
8. Print the detail metrics `precision`, `recall`, `F1-score`

In [115]:
#Prepare for modeling
X=df.drop('Churn',axis=1) #Features
y=df['Churn'] #Target

#split data into training and testing sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

#Train logistic regression model
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=500))
model.fit(X_train,y_train)

#Make predictions
y_pred=model.predict(X_test)

#Evaluate the model
Accuracy=accuracy_score(y_test,y_pred)
print("\nModel Accuracy:",Accuracy)
print("\nClassification Report:")
print(classification_report(y_test,y_pred))


Model Accuracy: 0.815471965933286

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      1036
           1       0.68      0.58      0.62       373

    accuracy                           0.82      1409
   macro avg       0.77      0.74      0.75      1409
weighted avg       0.81      0.82      0.81      1409



## Step 11:Save processed data in csv file to be used on Power BI

In [113]:
#Save processed data
df.to_csv('processed_Churn_Data.csv',index=False)