In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
import warnings

# Silence every warning category for the rest of the session so cell outputs stay clean.
# NOTE(review): a blanket filter also hides deprecation and data-conversion warnings.
warnings.simplefilter('ignore')

In [2]:
# Read the customer-churn CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="files_for_lab/Customer-Churn.csv")
data.head(n=5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [3]:
# Check data types
# TotalCharges is reported as `object` rather than a numeric dtype, so it has to be
# converted before it can be used as a numeric feature.
data.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [4]:
# Parse TotalCharges as numbers: entries that cannot be parsed become NaN (errors='coerce')
# and those NaNs are then replaced with 0.
data["TotalCharges"] = pd.to_numeric(data["TotalCharges"], errors='coerce').fillna(0)

In [5]:
# Separate the target column from the features, then one-hot encode the
# non-numeric feature columns so every feature is numeric.
y = data["Churn"]
X = pd.get_dummies(data.drop(columns="Churn"))

In [6]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [7]:
# Class counts of the full target column, before any resampling is applied.
y.value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [8]:
# Apply SMOTE for upsampling the data
# Only the training split is resampled; the test split is left untouched.
# random_state is pinned (same seed as the train/test split and the model) so the
# synthetic samples, and therefore the downstream score, are reproducible across runs.
# np.array() hands SMOTE a plain array, so X_sm carries no column names.
X_sm, y_sm = SMOTE(random_state=42).fit_resample(np.array(X_train), y_train)
# Confirm that both classes now have the same number of training rows.
y_sm.value_counts()

Yes    3892
No     3892
Name: Churn, dtype: int64

In [9]:
# Train a random forest on the SMOTE-resampled training data.
# fit() returns the fitted estimator, so construction and training are chained.
model = RandomForestClassifier(
    criterion='gini',
    max_depth=None,
    bootstrap=True,
    random_state=42,  # fixed seed for repeatable trees
    n_jobs=-1,  # use all available CPU cores
).fit(X_sm, y_sm)

# Mean accuracy on the held-out test split, which was not resampled.
model.score(X_test, y_test)

0.7677455990914254