In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler,LabelEncoder
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,classification_report

# Load the dataset
df=pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv',index_col=False)
df=df.drop(columns='customerID',axis=1)

In [2]:
label_encoder=LabelEncoder()

In [3]:
df['TotalCharges']=pd.to_numeric(df['TotalCharges'],errors='coerce')
df['MonthlyCharges']=pd.to_numeric(df['MonthlyCharges'],errors='coerce')
df['tenure']=pd.to_numeric(df['tenure'],errors='coerce')

In [4]:
labels=['gender','Partner','Dependents','PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges','Churn','Contract']

for column in labels:
    df[column] = label_encoder.fit_transform(df[column])

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   int64  
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   int64  
 3   Dependents        7043 non-null   int64  
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   int64  
 6   MultipleLines     7043 non-null   int64  
 7   InternetService   7043 non-null   int64  
 8   OnlineSecurity    7043 non-null   int64  
 9   OnlineBackup      7043 non-null   int64  
 10  DeviceProtection  7043 non-null   int64  
 11  TechSupport       7043 non-null   int64  
 12  StreamingTV       7043 non-null   int64  
 13  StreamingMovies   7043 non-null   int64  
 14  Contract          7043 non-null   int64  
 15  PaperlessBilling  7043 non-null   int64  
 16  PaymentMethod     7043 non-null   int64  


In [6]:
# Feature Scaling

scaler = StandardScaler()
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [7]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,-1.277445,0,1,0,0,2,0,0,0,0,0,1,2,-1.131766,-0.994194,0
1,1,0,0,0,0.066327,1,0,0,2,0,2,0,0,0,1,0,3,-0.387740,-0.173740,0
2,1,0,0,0,-1.236724,1,0,0,2,2,0,0,0,0,0,1,3,-0.517317,-0.959649,1
3,1,0,0,0,0.514251,0,1,0,2,0,2,2,0,0,1,0,0,-0.872611,-0.195248,0
4,0,0,0,0,-1.236724,1,0,1,0,0,0,0,0,0,0,1,2,0.095041,-0.940457,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,-0.340876,1,2,0,2,0,2,2,2,2,1,1,3,0.642612,-0.129180,0
7039,0,0,1,1,1.613701,1,2,1,0,2,2,0,2,2,1,1,1,1.372008,2.241056,0
7040,0,0,1,1,-0.870241,0,1,0,2,0,0,0,0,0,0,1,2,-1.142216,-0.854514,0
7041,1,1,1,0,-1.155283,1,2,1,0,0,0,0,0,0,0,1,3,0.232979,-0.872095,1


In [8]:
# Feature Interaction
df['Tenure_MonthlyCharges'] = df['tenure'] * df['MonthlyCharges']

In [9]:
# Convert 'TotalCharges' column to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Feature Transformation (Example: Logarithmic transformation)
df['TotalCharges_Log'] = np.log1p(df['TotalCharges'])  # Apply log transformation using numpy's log1p function


In [10]:
# Binning (Example: Binning tenure into different groups)
bins = [0, 12, 24, 36, 48, 60, 72, float('inf')]
labels = ['0-12', '13-24', '25-36', '37-48', '49-60', '61-72', '72+']
df['tenure_bins'] = pd.cut(df['tenure'], bins=bins, labels=labels, right=False)

In [11]:
# Domain-specific Feature Engineering (Example: Average usage of services)
service_columns = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
df['Average_Usage'] = df[service_columns].mean(axis=1)


In [12]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Tenure_MonthlyCharges,TotalCharges_Log,tenure_bins,Average_Usage
0,0,0,1,0,-1.277445,0,1,0,0,2,...,0,1,2,-1.131766,-0.994194,0,1.445769,-5.148879,,0.333333
1,1,0,0,0,0.066327,1,0,0,2,0,...,1,0,3,-0.387740,-0.173740,0,-0.025718,-0.190846,0-12,0.666667
2,1,0,0,0,-1.236724,1,0,0,2,2,...,0,1,3,-0.517317,-0.959649,1,0.639779,-3.210142,,0.666667
3,1,0,0,0,0.514251,0,1,0,2,0,...,1,0,0,-0.872611,-0.195248,0,-0.448741,-0.217221,0-12,1.000000
4,0,0,0,0,-1.236724,1,0,1,0,0,...,0,1,2,0.095041,-0.940457,1,-0.117540,-2.821064,,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,-0.340876,1,2,0,2,0,...,1,1,3,0.642612,-0.129180,0,-0.219051,-0.138320,,1.666667
7039,0,0,1,1,1.613701,1,2,1,0,2,...,1,1,1,1.372008,2.241056,0,2.214012,1.175899,0-12,1.333333
7040,0,0,1,1,-0.870241,0,1,0,2,0,...,0,1,2,-1.142216,-0.854514,0,0.994003,-1.927676,,0.333333
7041,1,1,1,0,-1.155283,1,2,1,0,0,...,0,1,3,0.232979,-0.872095,1,-0.269157,-2.056471,,0.000000


In [13]:
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn', 'Tenure_MonthlyCharges',
       'TotalCharges_Log', 'tenure_bins', 'Average_Usage'],
      dtype='object')

In [14]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Tenure_MonthlyCharges,TotalCharges_Log,tenure_bins,Average_Usage
0,0,0,1,0,-1.277445,0,1,0,0,2,...,0,1,2,-1.131766,-0.994194,0,1.445769,-5.148879,,0.333333
1,1,0,0,0,0.066327,1,0,0,2,0,...,1,0,3,-0.387740,-0.173740,0,-0.025718,-0.190846,0-12,0.666667
2,1,0,0,0,-1.236724,1,0,0,2,2,...,0,1,3,-0.517317,-0.959649,1,0.639779,-3.210142,,0.666667
3,1,0,0,0,0.514251,0,1,0,2,0,...,1,0,0,-0.872611,-0.195248,0,-0.448741,-0.217221,0-12,1.000000
4,0,0,0,0,-1.236724,1,0,1,0,0,...,0,1,2,0.095041,-0.940457,1,-0.117540,-2.821064,,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,-0.340876,1,2,0,2,0,...,1,1,3,0.642612,-0.129180,0,-0.219051,-0.138320,,1.666667
7039,0,0,1,1,1.613701,1,2,1,0,2,...,1,1,1,1.372008,2.241056,0,2.214012,1.175899,0-12,1.333333
7040,0,0,1,1,-0.870241,0,1,0,2,0,...,0,1,2,-1.142216,-0.854514,0,0.994003,-1.927676,,0.333333
7041,1,1,1,0,-1.155283,1,2,1,0,0,...,0,1,3,0.232979,-0.872095,1,-0.269157,-2.056471,,0.000000


In [15]:
labels=['gender','Partner','Dependents','PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges','Churn','Contract','Average_Usage']

for column in labels:
    df[column] = label_encoder.fit_transform(df[column])

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   gender                 7043 non-null   int64   
 1   SeniorCitizen          7043 non-null   int64   
 2   Partner                7043 non-null   int64   
 3   Dependents             7043 non-null   int64   
 4   tenure                 7043 non-null   float64 
 5   PhoneService           7043 non-null   int64   
 6   MultipleLines          7043 non-null   int64   
 7   InternetService        7043 non-null   int64   
 8   OnlineSecurity         7043 non-null   int64   
 9   OnlineBackup           7043 non-null   int64   
 10  DeviceProtection       7043 non-null   int64   
 11  TechSupport            7043 non-null   int64   
 12  StreamingTV            7043 non-null   int64   
 13  StreamingMovies        7043 non-null   int64   
 14  Contract               7043 non-null   i

In [17]:
print(f'check isf there is any  missing values in df\n{df.isna().sum()}')

check isf there is any  missing values in df
gender                      0
SeniorCitizen               0
Partner                     0
Dependents                  0
tenure                      0
PhoneService                0
MultipleLines               0
InternetService             0
OnlineSecurity              0
OnlineBackup                0
DeviceProtection            0
TechSupport                 0
StreamingTV                 0
StreamingMovies             0
Contract                    0
PaperlessBilling            0
PaymentMethod               0
MonthlyCharges              0
TotalCharges               11
Churn                       0
Tenure_MonthlyCharges       0
TotalCharges_Log           11
tenure_bins              3775
Average_Usage               0
dtype: int64


In [18]:
df['tenure_bins']

0        NaN
1       0-12
2        NaN
3       0-12
4        NaN
        ... 
7038     NaN
7039    0-12
7040     NaN
7041     NaN
7042    0-12
Name: tenure_bins, Length: 7043, dtype: category
Categories (7, object): ['0-12' < '13-24' < '25-36' < '37-48' < '49-60' < '61-72' < '72+']

In [26]:
# Fill missing values in 'TotalCharges' column with the mean value
df['TotalCharges'].fillna(df['TotalCharges'].mean(), inplace=True)

# Fill missing values in 'TotalCharges_Log' column with zero
df['TotalCharges_Log'].fillna(0, inplace=True)
most_frequent_category = df['tenure_bins'].mode().values[0]  # Get the most frequent category
df['tenure_bins'].fillna(most_frequent_category, inplace=True)  # Fill missing values with the most frequent category


In [27]:
df['tenure_bins'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 7043 entries, 0 to 7042
Series name: tenure_bins
Non-Null Count  Dtype   
--------------  -----   
7043 non-null   category
dtypes: category(1)
memory usage: 7.4 KB


In [28]:
class_distribution=df['Churn'].value_counts()
print(class_distribution)
imbalance_ratio = max(class_distribution) / min(class_distribution)
print("Imbalance Ratio:", imbalance_ratio)

0    5174
1    1869
Name: Churn, dtype: int64
Imbalance Ratio: 2.7683253076511503


In [35]:
label_encoder = LabelEncoder()
df['tenure_bins_encoded'] = label_encoder.fit_transform(df['tenure_bins'])
df = df.drop('tenure_bins', axis=1)

In [36]:
X=df.drop('Churn',axis=1)
y=df['Churn']

smote=SMOTE(random_state=42)

X_resamples,y_resampled=smote.fit_resample(X,y)
print("the class distribution before resample")
print(y.value_counts())
print("The class distribution after resample")
print("\nClass Distribution After Resampling:")
print(pd.Series(y_resampled).value_counts())

the class distribution before resample
0    5174
1    1869
Name: Churn, dtype: int64
The class distribution after resample

Class Distribution After Resampling:
0    5174
1    5174
Name: Churn, dtype: int64


In [39]:
X_train,X_test,y_train,y_test=train_test_split(X_resamples,y_resampled)

In [45]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
predicted_labels = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, predicted_labels)
print("Accuracy:", accuracy)

Accuracy: 0.8465403942790878


In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_classifers=RandomForestClassifier()

In [47]:
feature_importances = rf_classifier.feature_importances_

important_features = df.columns[:-1][feature_importances > 0.01]  # Select features with importance > 0.01


In [55]:
len(df.columns)

24

In [53]:
len(important_features)

21

In [49]:
# Select the final set of features for modeling
selected_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges',
       'TotalCharges', 'Churn', 'Tenure_MonthlyCharges', 'TotalCharges_Log']

In [51]:
df_final = df[selected_features]

In [52]:
df_final

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Tenure_MonthlyCharges,TotalCharges_Log
0,0,0,1,0,-1.277445,1,0,0,2,0,...,0,0,0,1,2,142,-0.994194,0,1.445769,-5.148879
1,1,0,0,0,0.066327,0,0,2,0,2,...,0,0,1,0,3,498,-0.173740,0,-0.025718,-0.190846
2,1,0,0,0,-1.236724,0,0,2,2,0,...,0,0,0,1,3,436,-0.959649,1,0.639779,-3.210142
3,1,0,0,0,0.514251,1,0,2,0,2,...,0,0,1,0,0,266,-0.195248,0,-0.448741,-0.217221
4,0,0,0,0,-1.236724,0,1,0,0,0,...,0,0,0,1,2,729,-0.940457,1,-0.117540,-2.821064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,-0.340876,2,0,2,0,2,...,2,2,1,1,3,991,-0.129180,0,-0.219051,-0.138320
7039,0,0,1,1,1.613701,2,1,0,2,2,...,2,2,1,1,1,1340,2.241056,0,2.214012,1.175899
7040,0,0,1,1,-0.870241,1,0,2,0,0,...,0,0,0,1,2,137,-0.854514,0,0.994003,-1.927676
7041,1,1,1,0,-1.155283,2,1,0,0,0,...,0,0,0,1,3,795,-0.872095,1,-0.269157,-2.056471
