In [1]:
#BHAVYA GUPTA ML TASK 1, CUSTOMER CHURN.
#Importing the necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score


In [2]:

# Load the dataset
data = pd.read_csv('Churn-Data.csv')


In [3]:

# Display the column names to verify
print(data.columns)


Index(['cID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'TV_Streaming',
       'Movie_Streaming', 'Contract', 'PaperlessBilling', 'Method_Payment',
       'Charges_Month', 'TotalCharges', 'Churn'],
      dtype='object')


In [6]:

#CHecking the datatypes of all the columns
data.dtypes


cID                  object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
TV_Streaming         object
Movie_Streaming      object
Contract             object
PaperlessBilling     object
Method_Payment       object
Charges_Month       float64
TotalCharges         object
Churn                object
dtype: object

In [8]:

# Handling the missing values
data = data.dropna()


In [9]:

# Converting the target column 'Churn' to numerical with the help of label encoder
label_encoder = LabelEncoder()
data['Churn'] = label_encoder.fit_transform(data['Churn'])


In [10]:

# Displaying the first 5 rows 
print(data.head())


          cID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  4223-BKEOR  Female              0      No        Yes      21          Yes   
1  6035-RIIOM  Female              0      No         No      54          Yes   
2  3797-VTIDR    Male              0     Yes         No       1           No   
3  2568-BRGYX    Male              0      No         No       4          Yes   
4  2775-SEFEE    Male              0      No        Yes       0          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0                No             DSL            Yes  ...              Yes   
1               Yes     Fiber optic             No  ...               No   
2  No phone service             DSL             No  ...               No   
3                No     Fiber optic             No  ...               No   
4               Yes             DSL            Yes  ...               No   

  TechSupport TV_Streaming Movie_Streaming        Contract Pap

In [11]:

# Converting the categorical features to numerical, 
obj_cols = data.select_dtypes(include=['object']).columns
data = pd.get_dummies(data, columns=obj_cols, drop_first=True)


In [12]:

# Normalizing numerical features with the help of standard scaler
num_cols = data.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
data[num_cols] = scaler.fit_transform(data[num_cols])


In [13]:

# Split the data into train and test sets
X = data.drop('Churn', axis=1)  # Drop the target column 'Churn'
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:

# Handle imbalanced data using SMOTE
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)


In [15]:

# Training a Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


In [16]:

# Predicting on the test set
y_pred = model.predict(X_test)


In [17]:

# Printing out the predicted valuse
y_pred


array([0, 1, 1, ..., 0, 0, 1])

In [18]:

# Evaluating the model using accuracy and f1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [19]:

#Final results of our model
print(f'Accuracy: {accuracy}')
print(f'F1-score: {f1}')


Accuracy: 0.7089618456078084
F1-score: 0.6029055690072639
