In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pickle

In [10]:
# Read the raw churn export into a DataFrame and preview the first rows.
csv_path = 'Telco-Customer-Churn.csv'
df = pd.read_csv(csv_path)

df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [11]:
df.shape

(7043, 21)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [None]:
df.duplicated().sum()

In [12]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [13]:
# Remove the per-customer identifier before modelling.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because 'customerID' has already been dropped from `df`.
df = df.drop(columns=['customerID'], errors='ignore')

# TotalCharges is not loaded as a numeric column (it is absent from
# df.describe()), so coerce it; values that cannot be parsed become NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Number of rows whose TotalCharges could not be parsed (now NaN).
df['TotalCharges'].isna().sum()

np.int64(11)

In [14]:
df.head(2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


In [15]:
# Separate the feature columns from the Churn target.
X = df.drop(columns=['Churn'])
y = df['Churn']

# Hold out 20% of the rows for testing with a fixed seed.
# stratify=y keeps the Churn class proportions the same in both splits, so
# train and test accuracy are measured against the same class balance.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [16]:
# Learn the label -> integer mapping from the training labels only, then apply
# that same fitted mapping to both the training and the test labels.
encoder = LabelEncoder().fit(y_train)
y_train_encoded, y_test_encoded = (
    encoder.transform(labels) for labels in (y_train, y_test)
)

In [17]:
# Print the number of distinct values in every column, one "name - count" per line.
for col, n_unique in df.nunique().items():
    print(f"{col} - {n_unique}")

gender - 2
SeniorCitizen - 2
Partner - 2
Dependents - 2
tenure - 73
PhoneService - 2
MultipleLines - 3
InternetService - 3
OnlineSecurity - 3
OnlineBackup - 3
DeviceProtection - 3
TechSupport - 3
StreamingTV - 3
StreamingMovies - 3
Contract - 3
PaperlessBilling - 2
PaymentMethod - 4
MonthlyCharges - 1585
TotalCharges - 6530
Churn - 2


In [18]:
# Print the dtype of every column, one "name - dtype" per line.
for col, col_dtype in df.dtypes.items():
    print(f"{col} - {col_dtype}")

gender - object
SeniorCitizen - int64
Partner - object
Dependents - object
tenure - int64
PhoneService - object
MultipleLines - object
InternetService - object
OnlineSecurity - object
OnlineBackup - object
DeviceProtection - object
TechSupport - object
StreamingTV - object
StreamingMovies - object
Contract - object
PaperlessBilling - object
PaymentMethod - object
MonthlyCharges - float64
TotalCharges - float64
Churn - object


In [19]:
# Split the feature columns into two groups by dtype; each group is routed to
# its own branch of the preprocessing ColumnTransformer.
# int64/float64 columns are treated as numeric. Note that SeniorCitizen is an
# int64 column with only two distinct values, so it lands in the numeric group.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
# object-dtype columns are treated as categorical.
categorical_cols = X.select_dtypes(include=['object']).columns

In [25]:
categorical_cols

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Numeric branch: fill missing values, then standardise.
# TotalCharges contains NaNs after the numeric coercion earlier in the notebook.
# StandardScaler keeps NaN in its output, and not every scikit-learn release
# lets the forest accept NaN input, so impute explicitly before scaling.
numeric_cols_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: one-hot encode; categories not seen during fit are
# encoded as all zeros instead of raising at predict time.
categorical_cols_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its branch; columns in neither group are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_cols_transformer, numeric_cols),
        ('cat', categorical_cols_transformer, categorical_cols)
    ], remainder='drop')

# Full model: preprocessing followed by a random forest with a fixed seed.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=50, max_depth=7, min_samples_split=5))
])

In [27]:
# Fit the preprocessor on the training features and keep the transformed matrix
# for inspection. `preprocessor` is the same object that was passed as the
# 'preprocessor' step of `rf_pipeline`, so this call fits that shared instance.
dt = preprocessor.fit_transform(x_train)

In [None]:
from sklearn.metrics import accuracy_score

# Train the full pipeline (preprocessing + classifier) on the training split.
rf_pipeline.fit(x_train, y_train)

# Report accuracy on the training split first, then on the held-out split.
# `y_pred` is overwritten on each pass and ends up holding the test predictions.
for split_name, features, labels in (
    ("Train", x_train, y_train),
    ("Test", x_test, y_test),
):
    y_pred = rf_pipeline.predict(features)
    print(f"{split_name} Accuracy: {accuracy_score(labels, y_pred)}")


In [None]:
# Persist the pipeline object to disk with pickle so it can be reloaded later.
model_path = 'rf_pipeline.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(rf_pipeline, model_file)

In [5]:
import numpy as np

# Load a transformed training array written by a separate pipeline run.
# NOTE(review): this is an absolute, machine-specific Windows path tied to one
# timestamped artifact folder; the cell will not run anywhere else.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only load artifacts from a trusted source.
train_npy_path = "D:\\AI ML\\AI ML Projects\\ML Ops Project\\Telecom-Customer-Churn-MLOps\\artifact\\06_14_2025_20_46_57\\data_transformation\\transformed\\train.npy"
with open(train_npy_path, mode='rb') as npy_file:
    data = np.load(npy_file, allow_pickle=True)

In [6]:
data

array([[2.2709783480567705, 1.4732203180753647, -1.1853309419706208, ...,
        'Credit card (automatic)', 29.8, 2134.3],
       [-0.4403388525723636, 1.35290956479605, 1.359793804623659, ...,
        'Credit card (automatic)', 106.05, 6981.35],
       [-0.4403388525723636, 1.2325988115167352, -1.4957527143421199,
        ..., 'Mailed check', 20.5, 1328.15],
       ...,
       [-0.4403388525723636, 1.5935310713546795, 1.5166736250694703, ...,
        'Electronic check', 110.75, 7751.7],
       [-0.4403388525723636, -1.0132019163638073, 0.32839243190545275,
        ..., 'Electronic check', 75.15, 525.0],
       [2.2709783480567705, 1.5935310713546795, 1.4949774796886666, ...,
        'Bank transfer (automatic)', 110.1, 7746.7]],
      shape=(1761, 65), dtype=object)

In [7]:
type(data)

numpy.ndarray

In [32]:
# Inspect the first row of the loaded artifact.
# NOTE(review): each row has 65 entries, while the preprocessor built in this
# notebook yields 45 columns per row. The trailing entries look like raw,
# untransformed column values (including a customerID) -- presumably the saved
# array kept the original columns alongside the transformed ones; confirm
# against the transformation step that wrote train.npy.
data[0]

array([2.2709783480567705, 1.4732203180753647, -1.1853309419706208,
       -0.08248575535860293, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0,
       1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,
       1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0,
       0.0, 1.0, 0.0, 1.0, 0.0, 0.0, '9309-BZGNT', 'Male', 1, 'Yes', 'No',
       69, 'No', 'No phone service', 'DSL', 'No', 'No', 'Yes', 'No', 'No',
       'No', 'One year', 'Yes', 'Credit card (automatic)', 29.8, 2134.3],
      dtype=object)

In [28]:
type(dt)

numpy.ndarray

In [30]:
len(dt[0])

45