<a href="https://colab.research.google.com/github/bright4mp/new_file/blob/master/Copy_of_Stage_C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# importing all the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Perform initial data preparation: convert the 'TotalCharges' column to numeric values (entries that cannot be parsed become NaN) and fill the resulting missing values with 0.

In [3]:
# Read the Telco customer churn CSV into the working DataFrame
telco_csv_path = "/content/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(telco_csv_path)

In [4]:
# Parse 'TotalCharges' as numbers; entries that cannot be parsed become NaN
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

In [5]:
# Count how many 'TotalCharges' entries ended up as NaN after conversion
missing_total_charges = df['TotalCharges'].isna().sum()
print(missing_total_charges)

11


In [6]:
# Fill missing values with 0.
# Assign the result back instead of calling fillna(..., inplace=True) on
# df['TotalCharges']: that form mutates an intermediate Series, emits a
# FutureWarning on pandas 2.x, and leaves df unchanged under Copy-on-Write.
df['TotalCharges'] = df['TotalCharges'].fillna(0)

In [7]:
# Confirm the column's dtype and that no NaN values remain
total_charges = df['TotalCharges']
print(total_charges.dtype)
print(total_charges.isna().sum())

float64
0


In [8]:
# Preview the first five rows as plain text to confirm the changes
preview_rows = df.head(n=5)
print(preview_rows)

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

Convert the 'Churn' column to binary values: 'No' is mapped to 0 and 'Yes' is mapped to 1.

In [9]:
# Convert the 'Churn' column to binary values ('No' -> 0, 'Yes' -> 1).
# Guarded so the cell can be re-run safely: mapping an already-encoded 0/1
# column through this dict would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})

In [10]:
# Inspect the encoded target: its first rows, then its distinct values
churn = df['Churn']
print(churn.head())
print(churn.unique())

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64
[0 1]


In [11]:
# Show the first five rows; as the cell's last expression it is displayed
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Columns that are one-hot encoded further down
categorical = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]
# Columns that are standardized further down
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Feature matrix (categorical columns first, then numerical) and target
feature_columns = [*categorical, *numerical]
X = df[feature_columns]
y = df['Churn']

In [14]:
# Split the data into training (80%) and testing (20%) sets with a fixed seed
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [15]:
from sklearn.preprocessing import StandardScaler

# Standardize the numerical columns: fit on the training split only, then
# apply the same fitted scaler to the test split.
scaler = StandardScaler()
train_num_values = scaler.fit_transform(X_train[numerical])
test_num_values = scaler.transform(X_test[numerical])

# Wrap the arrays back into DataFrames that keep the original row index
X_train_num_scaled = pd.DataFrame(
    train_num_values, columns=numerical, index=X_train.index
)
X_test_num_scaled = pd.DataFrame(
    test_num_values, columns=numerical, index=X_test.index
)

In [16]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns, dropping the first level of each.
# The encoder is fitted on the training split and reused for the test split.
encoder = OneHotEncoder(sparse_output=False, drop='first')
train_cat_values = encoder.fit_transform(X_train[categorical])
encoded_train_names = encoder.get_feature_names_out(categorical)
X_train_cat_encoded = pd.DataFrame(
    train_cat_values, columns=encoded_train_names, index=X_train.index
)

test_cat_values = encoder.transform(X_test[categorical])
encoded_test_names = encoder.get_feature_names_out(categorical)
X_test_cat_encoded = pd.DataFrame(
    test_cat_values, columns=encoded_test_names, index=X_test.index
)

In [17]:
# Join the scaled numerical and encoded categorical columns side by side
train_parts = [X_train_num_scaled, X_train_cat_encoded]
test_parts = [X_test_num_scaled, X_test_cat_encoded]
X_train_processed = pd.concat(train_parts, axis=1)
X_test_processed = pd.concat(test_parts, axis=1)

In [18]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

In [19]:
# Train a random forest on the processed training data and predict the test set
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_processed, y_train)
y_pred_rf = rf.predict(X_test_processed)

# Report accuracy and the per-class precision/recall/F1 table
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print("Random Forest Classifier")
print("Accuracy:", rf_accuracy)
print(rf_report)

Random Forest Classifier
Accuracy: 0.7963094393186657
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1036
           1       0.66      0.47      0.55       373

    accuracy                           0.80      1409
   macro avg       0.75      0.69      0.71      1409
weighted avg       0.78      0.80      0.78      1409



In [20]:
# Train an extra-trees ensemble on the processed training data and predict the test set
et = ExtraTreesClassifier(random_state=1)
et.fit(X_train_processed, y_train)
y_pred_et = et.predict(X_test_processed)

# Report accuracy and the per-class precision/recall/F1 table
et_accuracy = accuracy_score(y_test, y_pred_et)
et_report = classification_report(y_test, y_pred_et)
print("Extra Trees Classifier")
print("Accuracy:", et_accuracy)
print(et_report)

Extra Trees Classifier
Accuracy: 0.7771469127040455
              precision    recall  f1-score   support

           0       0.82      0.90      0.86      1036
           1       0.61      0.45      0.52       373

    accuracy                           0.78      1409
   macro avg       0.71      0.67      0.69      1409
weighted avg       0.76      0.78      0.77      1409



In [21]:
# Train an XGBoost classifier on the processed training data and predict the test set
xgb = XGBClassifier(random_state=1)
xgb.fit(X_train_processed, y_train)
y_pred_xgb = xgb.predict(X_test_processed)

# Report accuracy and the per-class precision/recall/F1 table
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_report = classification_report(y_test, y_pred_xgb)
print("XGBoost Classifier")
print("Accuracy:", xgb_accuracy)
print(xgb_report)

XGBoost Classifier
Accuracy: 0.794889992902768
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1036
           1       0.64      0.52      0.57       373

    accuracy                           0.79      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.79      0.79      1409



In [22]:
# Train a LightGBM classifier on the processed training data and predict the test set
lgbm = LGBMClassifier(random_state=1)
lgbm.fit(X_train_processed, y_train)
y_pred_lgbm = lgbm.predict(X_test_processed)

# Report accuracy and the per-class precision/recall/F1 table
lgbm_accuracy = accuracy_score(y_test, y_pred_lgbm)
lgbm_report = classification_report(y_test, y_pred_lgbm)
print("LightGBM Classifier")
print("Accuracy:", lgbm_accuracy)
print(lgbm_report)

[LightGBM] [Info] Number of positive: 1496, number of negative: 4138
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000804 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 638
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265531 -> initscore=-1.017418
[LightGBM] [Info] Start training from score -1.017418
LightGBM Classifier
Accuracy: 0.8076650106458482
              precision    recall  f1-score   support

           0       0.85      0.90      0.87      1036
           1       0.67      0.54      0.60       373

    accuracy                           0.81      1409
   macro avg       0.76      0.72      0.74      1409
weighted avg       0.80      0.81      0.80      1409

