In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [14]:
# Load dataset
df = pd.read_csv('telco.csv')

In [15]:
# Check for missing values
df.isnull().sum()

Customer ID                             0
Gender                                  0
Age                                     0
Under 30                                0
Senior Citizen                          0
Married                                 0
Dependents                              0
Number of Dependents                    0
Country                                 0
State                                   0
City                                    0
Zip Code                                0
Latitude                                0
Longitude                               0
Population                              0
Quarter                                 0
Referred a Friend                       0
Number of Referrals                     0
Tenure in Months                        0
Offer                                3877
Phone Service                           0
Avg Monthly Long Distance Charges       0
Multiple Lines                          0
Internet Service                  

In [16]:
drop_cols = ['Customer ID', 'Country', 'State', 'Churn Category', 'Churn Reason', 'Customer Status']
df = df.drop(columns=drop_cols, errors='ignore')

In [17]:
# Convert categorical features into numerical using Label Encoding
label_enc = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns

for col in categorical_cols:
    df[col] = label_enc.fit_transform(df[col])

In [18]:
# Check for missing values and fill them with the median value
df.fillna(df.median(numeric_only=True), inplace=True)

In [19]:
# Define features (X) and target variable (y)
X = df.drop(columns=['Churn Label'], errors='ignore')  # Features
y = df['Churn Label']  # Target (Churned = 1, Not Churned = 0)

In [20]:
# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Initialize and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [22]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [23]:
print(f"Accuracy = {accuracy}")
print(conf_matrix)
print(classification_rep)

Accuracy = 0.9779985805535841
[[1006    3]
 [  28  372]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1009
           1       0.99      0.93      0.96       400

    accuracy                           0.98      1409
   macro avg       0.98      0.96      0.97      1409
weighted avg       0.98      0.98      0.98      1409

