# Step 1: Import packages

In [1]:
import numpy as np
import pandas as pd


# Step 2: Load Dataset

In [2]:
import kagglehub

# Download the latest version of the Telco customer-churn dataset.
# `path` receives whatever kagglehub returns; the printed output shows it is
# the local directory holding the downloaded files.
path = kagglehub.dataset_download("blastchar/telco-customer-churn")

print("Path to dataset files:", path)

Path to dataset files: /home/catpc/.cache/kagglehub/datasets/blastchar/telco-customer-churn/versions/1


In [3]:
import os

# Reuse the directory returned by kagglehub.dataset_download() instead of a
# hard-coded, machine-specific cache path, so the notebook runs unchanged on
# any machine / user account.
dataset_path = path
files = os.listdir(dataset_path)
files

['WA_Fn-UseC_-Telco-Customer-Churn.csv']

In [4]:
# Build the CSV location from the download directory rather than repeating an
# absolute path that only exists on the original author's machine.
df = pd.read_csv(os.path.join(dataset_path, 'WA_Fn-UseC_-Telco-Customer-Churn.csv'))
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


# Step 3 : Data Preprocessing 
Perform data preprocessing tasks such as handling missing values, encoding categorical variables, and feature scaling.

In [5]:
df.shape

(7043, 21)

In [6]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

# Keeping important columns
columns_to_keep = ['gender','SeniorCitizen','Partner','Dependents','tenure','PhoneService','MultipleLines','Contract','TotalCharges','Churn']

In [7]:
# Columns retained for modelling: eight predictors, the charge total and the target.
columns_to_keep = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'tenure',
    'PhoneService',
    'MultipleLines',
    'Contract',
    'TotalCharges',
    'Churn',
]

In [8]:
# Take an explicit copy of the selected columns so that the column assignments
# made in later cells write to an independent frame rather than to a slice of
# the originally loaded data (avoids pandas' SettingWithCopyWarning).
df = df[columns_to_keep].copy()


In [9]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,Contract,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,Month-to-month,29.85,No
1,Male,0,No,No,34,Yes,No,One year,1889.5,No
2,Male,0,No,No,2,Yes,No,Month-to-month,108.15,Yes
3,Male,0,No,No,45,No,No phone service,One year,1840.75,No
4,Female,0,No,No,2,Yes,No,Month-to-month,151.65,Yes


# Encode binary variables (e.g. Yes/No columns)
binary_columns = ['Partner','Dependents','PhoneService','Churn']

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
label_encoder = LabelEncoder()


In [12]:
#df['gender'] = label_encoder.fit_transform(df['gender'])
#not a good practice

In [13]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,Contract,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,Month-to-month,29.85,No
1,Male,0,No,No,34,Yes,No,One year,1889.5,No
2,Male,0,No,No,2,Yes,No,Month-to-month,108.15,Yes
3,Male,0,No,No,45,No,No phone service,One year,1840.75,No
4,Female,0,No,No,2,Yes,No,Month-to-month,151.65,Yes


In [14]:
# Text-valued columns (including the target) that are integer-encoded below.
categorical_cols = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'Contract',
    'Churn',
]

In [15]:
# Fit ONE encoder per column and keep it. Re-fitting a single shared encoder in
# the loop would discard every column's category -> integer mapping except the
# last one, making it impossible to inverse-transform or to encode new data the
# same way later.
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])
    
    

In [16]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,Contract,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,29.85,0
1,1,0,0,0,34,1,0,1,1889.5,0
2,1,0,0,0,2,1,0,0,108.15,1
3,1,0,0,0,45,0,1,1,1840.75,0
4,0,0,0,0,2,1,0,0,151.65,1


# Jugaru Techniques

In [17]:
# binary_columns = ['Partner','Dependents','PhoneService','Churn']
# df[binary_columns] =  df[binary_columns].replace({'Yes':1,'No':0}).infer_objects(copy=False)

In [18]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,Contract,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,29.85,0
1,1,0,0,0,34,1,0,1,1889.5,0
2,1,0,0,0,2,1,0,0,108.15,1
3,1,0,0,0,45,0,1,1,1840.75,0
4,0,0,0,0,2,1,0,0,151.65,1


In [19]:
#df['gender'] = df['gender'].replace({'Female':0,'Male':1})

In [20]:
df['gender'].head()

0    0
1    1
2    1
3    1
4    0
Name: gender, dtype: int64

# Split the dataset into Training and testing sets

In [21]:
# Target vector is the encoded 'Churn' column; the feature matrix is every other column.
y = df['Churn']
x = df.drop(columns='Churn')

In [22]:
#split data into training and testing sets
from sklearn.model_selection import train_test_split

In [23]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
x_train,x_test, y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=42)

In [24]:
x_train.shape, y_train.shape

((5634, 9), (5634,))

In [25]:
x_test.shape,y_test.shape

((1409, 9), (1409,))

In [26]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5634 entries, 2142 to 860
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   gender         5634 non-null   int64 
 1   SeniorCitizen  5634 non-null   int64 
 2   Partner        5634 non-null   int64 
 3   Dependents     5634 non-null   int64 
 4   tenure         5634 non-null   int64 
 5   PhoneService   5634 non-null   int64 
 6   MultipleLines  5634 non-null   int64 
 7   Contract       5634 non-null   int64 
 8   TotalCharges   5634 non-null   object
dtypes: int64(8), object(1)
memory usage: 440.2+ KB


In [27]:
# Convert the TotalCharges column (read as object/str) to a numeric dtype;
# errors='coerce' replaces non-numeric values with NaN instead of raising.

x_train['TotalCharges'] = pd.to_numeric(x_train['TotalCharges'], errors='coerce')
x_test['TotalCharges'] = pd.to_numeric(x_test['TotalCharges'], errors='coerce')

In [28]:
#Replace missing values in the "TotalCharges" column with the mean of the column

In [29]:
x_train.isnull().sum()

gender            0
SeniorCitizen     0
Partner           0
Dependents        0
tenure            0
PhoneService      0
MultipleLines     0
Contract          0
TotalCharges     10
dtype: int64

In [30]:
# Replace missing values in the training 'TotalCharges' column with that column's mean
x_train['TotalCharges'] = x_train['TotalCharges'].fillna(x_train['TotalCharges'].mean())


In [31]:
x_train.isnull().sum()

gender           0
SeniorCitizen    0
Partner          0
Dependents       0
tenure           0
PhoneService     0
MultipleLines    0
Contract         0
TotalCharges     0
dtype: int64

In [32]:
x_test.isnull().sum()

gender           0
SeniorCitizen    0
Partner          0
Dependents       0
tenure           0
PhoneService     0
MultipleLines    0
Contract         0
TotalCharges     1
dtype: int64

In [33]:
# Impute the test set with the TRAINING mean: every preprocessing statistic
# must be learned from the training data only, otherwise information from the
# held-out rows leaks into the pipeline and the train/test treatment differs.
# (x_train was mean-filled above, which leaves its column mean unchanged.)
x_test['TotalCharges'] = x_test['TotalCharges'].fillna(x_train['TotalCharges'].mean())

In [34]:
x_test.isnull().sum()

gender           0
SeniorCitizen    0
Partner          0
Dependents       0
tenure           0
PhoneService     0
MultipleLines    0
Contract         0
TotalCharges     0
dtype: int64

# Standardize features (optional but often beneficial for logistic regression)

In [36]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit the scaler on the training features only, then apply that same fitted
# transformation to the test features. Both names are rebound to the returned
# arrays (see the array output of the next cell).
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [37]:
x_train

array([[-1.02516569, -0.4377492 , -0.96957859, ..., -1.00053704,
         0.37290835, -0.42210502],
       [-1.02516569, -0.4377492 , -0.96957859, ...,  1.10833901,
         1.5775905 ,  1.25536015],
       [ 0.97545208, -0.4377492 ,  1.03137591, ...,  0.05390099,
        -0.83177379, -1.00299144],
       ...,
       [ 0.97545208, -0.4377492 ,  1.03137591, ..., -1.00053704,
        -0.83177379, -0.87799925],
       [ 0.97545208,  2.28441306, -0.96957859, ...,  1.10833901,
        -0.83177379, -0.48254445],
       [ 0.97545208, -0.4377492 , -0.96957859, ..., -1.00053704,
         0.37290835, -0.81110232]])

# Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegression

In [39]:
lg  = LogisticRegression()

In [41]:
# Train the logistic-regression model on the scaled training data, then
# predict churn labels for the scaled test rows.
lg.fit(x_train,y_train)
y_pred = lg.predict(x_test)

In [42]:
y_pred

array([1, 0, 0, ..., 0, 0, 1])

# Accuracy score

In [43]:
from sklearn.metrics import accuracy_score
# Compare the predicted labels against the true test labels.
# NOTE(review): accuracy alone can be misleading if the churn classes are
# imbalanced — consider also inspecting precision/recall.
accuracy_score(y_test,y_pred)

0.7757274662881476

# save model

In [44]:
import pickle
# Use a context manager so the file handle is flushed and closed
# deterministically instead of being left open until garbage collection.
# NOTE(review): the fitted `scaler` is not persisted here; a reloaded model
# needs inputs scaled the same way, so consider saving it alongside the model.
with open('7_logistic_model.pkl', 'wb') as model_file:
    pickle.dump(lg, model_file)

# classification system

In [70]:
def predictive(gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,Contract,TotalCharges,
               model=None, feature_scaler=None):
    """Predict churn for a single customer.

    The raw inputs are encoded with the same category -> integer mapping that
    was applied to the training data (LabelEncoder assigns codes in sorted
    class order), scaled with the ALREADY-FITTED scaler, and passed to the
    trained classifier.

    Previously this function re-fitted the label encoder and the scaler on the
    single input row, which turned every feature into 0 (so the prediction did
    not depend on the input) and overwrote the scaler fitted on training data.

    Parameters
    ----------
    gender : 'Female' | 'Male' (or the already-encoded 0 / 1)
    SeniorCitizen, Partner, Dependents, PhoneService : 'No' | 'Yes' (or 0 / 1)
    tenure : number (anything accepted by ``float``)
    MultipleLines : 'No' | 'No phone service' | 'Yes' (or 0 / 1 / 2)
    Contract : 'Month-to-month' | 'One year' | 'Two year' (or 0 / 1 / 2)
    TotalCharges : number or numeric string
    model : optional object with ``predict``; defaults to the notebook's ``lg``.
    feature_scaler : optional object with ``transform``; defaults to the
        notebook's fitted ``scaler``.

    Returns
    -------
    1-element array containing the predicted label (0 = no churn, 1 = churn).

    Raises
    ------
    ValueError
        If a categorical value is not one of the known categories, or a
        numeric field cannot be converted to ``float``.
    """
    yes_no = {'No': 0, 'Yes': 1}
    # Codes mirror the sorted-class order LabelEncoder produced on the training frame.
    category_codes = {
        'gender': {'Female': 0, 'Male': 1},
        'SeniorCitizen': yes_no,  # stored as 0/1 in the dataset; text accepted for convenience
        'Partner': yes_no,
        'Dependents': yes_no,
        'PhoneService': yes_no,
        'MultipleLines': {'No': 0, 'No phone service': 1, 'Yes': 2},
        'Contract': {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
    }

    # Insertion order matches the column order of the training feature matrix.
    raw_values = {
        'gender': gender,
        'SeniorCitizen': SeniorCitizen,
        'Partner': Partner,
        'Dependents': Dependents,
        'tenure': tenure,
        'PhoneService': PhoneService,
        'MultipleLines': MultipleLines,
        'Contract': Contract,
        'TotalCharges': TotalCharges,
    }

    encoded = {}
    for column, value in raw_values.items():
        codes = category_codes.get(column)
        if codes is None:
            # Numeric features (tenure, TotalCharges) are passed through as floats.
            encoded[column] = [float(value)]
        elif value in codes:
            encoded[column] = [codes[value]]
        elif value in codes.values():
            # Caller supplied an already-encoded integer.
            encoded[column] = [int(value)]
        else:
            raise ValueError(
                f"Unknown value {value!r} for {column!r}; expected one of {sorted(codes)}"
            )

    features = pd.DataFrame(encoded)

    active_scaler = scaler if feature_scaler is None else feature_scaler
    active_model = lg if model is None else model

    # transform() only — never re-fit preprocessing on inference data.
    scaled = active_scaler.transform(features)
    result = active_model.predict(scaled).reshape(1,-1)
    return result[0]
    
    
    

In [76]:
# NOTE(review): `df1` is only assigned inside predictive(); no notebook-level
# assignment is visible, so this cell presumably relies on leftover kernel
# state and would raise NameError after Restart & Run All — confirm or remove.
df1

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,Contract,TotalCharges
0,Female,No,Yes,No,1,No,No phone service,Month-to-month,29.85


In [77]:
# Example customer used to exercise predictive().
gender, SeniorCitizen, Partner, Dependents = 'Female', "No", 'Yes', 'No'
tenure, PhoneService, MultipleLines = 1, 'No', "No phone service"
Contract, TotalCharges = 'Month-to-month', 29.85

result = predictive(
    gender, SeniorCitizen, Partner, Dependents, tenure,
    PhoneService, MultipleLines, Contract, TotalCharges,
)

# Label 0 is reported as "Not churn"; anything else as 'churn'.
print("Not churn" if result == 0 else 'churn')






Not churn
