In [204]:
# Import packages
import numpy as np
import pandas as pd
import joblib
from pathlib import Path

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Sampling
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Data Pre-processing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,FunctionTransformer,PowerTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Model Selection
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from imblearn.pipeline import Pipeline

# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix


import warnings
warnings.filterwarnings('ignore')

In [205]:
df = pd.read_csv('../data/processed/fe_data.csv')
df.drop(columns=['Unnamed: 0'],inplace=True)

In [206]:
df.head()

Unnamed: 0,gender,age,married,number_of_dependents,city,number_of_referrals,tenure_in_months,offer,phone_service,avg_monthly_long_distance_charges,...,unlimited_data,contract,paperless_billing,payment_method,monthly_charge,total_refunds,total_extra_data_charges,total_long_distance_charges,total_revenue,customer_status
0,Female,37,Yes,0,Frazier Park,2,9,no_offer,Yes,42.39,...,Yes,One Year,Yes,Credit Card,65.6,0.0,0,381.51,974.81,Stayed
1,Male,46,No,0,Glendale,0,9,no_offer,Yes,10.69,...,No,Month-to-Month,No,Credit Card,-4.0,38.33,10,96.21,610.28,Stayed
2,Male,50,No,0,Costa Mesa,0,4,Offer E,Yes,33.65,...,Yes,Month-to-Month,Yes,Bank Withdrawal,73.9,0.0,0,134.6,415.45,Churned
3,Male,78,Yes,0,Martinez,1,13,Offer D,Yes,27.82,...,Yes,Month-to-Month,Yes,Bank Withdrawal,98.0,0.0,0,361.66,1599.51,Churned
4,Female,75,Yes,0,Camarillo,3,3,no_offer,Yes,7.38,...,Yes,Month-to-Month,Yes,Credit Card,83.9,0.0,0,22.14,289.54,Churned


In [207]:
df.isnull().sum()

gender                                 0
age                                    0
married                                0
number_of_dependents                   0
city                                   0
number_of_referrals                    0
tenure_in_months                       0
offer                                  0
phone_service                          0
avg_monthly_long_distance_charges      0
multiple_lines                       682
internet_service                       0
internet_type                          0
avg_monthly_gb_download                0
online_security                        0
online_backup                          0
device_protection_plan                 0
premium_tech_support                   0
streaming_tv                           0
streaming_movies                       0
streaming_music                        0
unlimited_data                         0
contract                               0
paperless_billing                      0
payment_method  

In [208]:
numeric_data = df.select_dtypes(include=['number'])
skweed = []
for i in numeric_data:
    print(f"{i} skewness: {df[i].skew()}")
    if(df[i].skew() >0.5 or df[i].skew()<-0.5):
        skweed.append(i)


age skewness: 0.1621864486741778
number_of_dependents skewness: 2.1099319807722305
number_of_referrals skewness: 1.4460596247412694
tenure_in_months skewness: 0.24054261407486305
avg_monthly_long_distance_charges skewness: 0.04917589933841267
avg_monthly_gb_download skewness: 1.2165839031452528
monthly_charge skewness: -0.27539383432523457
total_refunds skewness: 4.328516700962804
total_extra_data_charges skewness: 4.091209238564625
total_long_distance_charges skewness: 1.2382819839547003
total_revenue skewness: 0.9194102679721669


In [209]:
skweed

['number_of_dependents',
 'number_of_referrals',
 'avg_monthly_gb_download',
 'total_refunds',
 'total_extra_data_charges',
 'total_long_distance_charges',
 'total_revenue']

In [210]:
x = df.drop(columns=['customer_status'])
y = df['customer_status']

In [211]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [212]:
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_features = list(set(X_train.columns.tolist()) - set(categorical_features))

In [213]:
imputer = SimpleImputer(strategy='most_frequent')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [214]:
X_train = pd.DataFrame(X_train, columns=x.columns)
X_test = pd.DataFrame(X_test, columns=x.columns)

In [215]:
# outlier data
def find_outliers_IQR(df):

   q1=df.quantile(0.25)

   q3=df.quantile(0.75)

   IQR=q3-q1

   outliers = df[((df<(q1-1.5*IQR)) | (df>(q3+1.5*IQR)))]

   return outliers

In [216]:
outlier_detection = numeric_data
for k, v in outlier_detection.items():
        outliers = find_outliers_IQR(v)
        perc = len(find_outliers_IQR(v)) * 100.0 / np.shape(outlier_detection)[0]
        print(f'Column :{k}')
        print(f"Number of Outliers = {len(outliers)}","||",f"Max Outlier Value = {outliers.max()}",
              "||", f"Min Outlier Value = {outliers.min()}", "||", f"Percentage of Outliers = {perc:.2f}%")
        print("\n")

Column :age
Number of Outliers = 0 || Max Outlier Value = nan || Min Outlier Value = nan || Percentage of Outliers = 0.00%


Column :number_of_dependents
Number of Outliers = 1627 || Max Outlier Value = 9 || Min Outlier Value = 1 || Percentage of Outliers = 23.10%


Column :number_of_referrals
Number of Outliers = 676 || Max Outlier Value = 11 || Min Outlier Value = 8 || Percentage of Outliers = 9.60%


Column :tenure_in_months
Number of Outliers = 0 || Max Outlier Value = nan || Min Outlier Value = nan || Percentage of Outliers = 0.00%


Column :avg_monthly_long_distance_charges
Number of Outliers = 0 || Max Outlier Value = nan || Min Outlier Value = nan || Percentage of Outliers = 0.00%


Column :avg_monthly_gb_download
Number of Outliers = 362 || Max Outlier Value = 85.0 || Min Outlier Value = 69.0 || Percentage of Outliers = 5.14%


Column :monthly_charge
Number of Outliers = 0 || Max Outlier Value = nan || Min Outlier Value = nan || Percentage of Outliers = 0.00%


Column :total_r

In [217]:
def handle_outliers(X_train, X_test, columns):
    for col in columns:
        Q1 = np.percentile(X_train[col], 25)
        Q3 = np.percentile(X_train[col], 75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Handling outliers in X_train
        X_train[col] = np.where(X_train[col] < lower_bound, lower_bound, X_train[col])
        X_train[col] = np.where(X_train[col] > upper_bound, upper_bound, X_train[col])

        # Handling outliers in X_test based on X_train IQR
        X_test[col] = np.where(X_test[col] < lower_bound, lower_bound, X_test[col])
        X_test[col] = np.where(X_test[col] > upper_bound, upper_bound, X_test[col])

    return X_train, X_test


# Define the columns to apply outlier handling
columns_to_handle = ['avg_monthly_gb_download', 'total_revenue', 'number_of_dependents',
                     'total_refunds', 'total_extra_data_charges','number_of_referrals']

# Apply the function to the specified columns
X_train, X_test = handle_outliers(X_train, X_test, columns_to_handle)

In [218]:
pt = PowerTransformer(method='yeo-johnson')
X_train[skweed] = pt.fit_transform(X_train[skweed])
X_test[skweed] = pt.transform(X_test[skweed])
X_train = pd.DataFrame(X_train, columns=x.columns)
X_test = pd.DataFrame(X_test, columns=x.columns)

In [219]:
numeric_data = df.select_dtypes(include=['number'])
for i in numeric_data:
    print(f"{i} skewness: {X_train[i].skew()}")


age skewness: 0.17485867050241055
number_of_dependents skewness: 0.0
number_of_referrals skewness: 0.43356661322859463
tenure_in_months skewness: 0.23985561011067047
avg_monthly_long_distance_charges skewness: 0.043319959016210864
avg_monthly_gb_download skewness: -0.18569373922255947
monthly_charge skewness: -0.27145577214589467
total_refunds skewness: 0.0
total_extra_data_charges skewness: 0.0
total_long_distance_charges skewness: -0.2181504249594066
total_revenue skewness: -0.15748008527977778


In [220]:
oe = OrdinalEncoder()
X_train[categorical_features] = oe.fit_transform(X_train[categorical_features])
X_test[categorical_features] = oe.fit_transform(X_test[categorical_features])


In [221]:
sc = StandardScaler()
X_train[numerical_features] = sc.fit_transform(X_train[numerical_features])
X_test[numerical_features] = sc.fit_transform(X_test[numerical_features])

In [222]:
y_train.value_counts()

customer_status
Stayed     3781
Churned    1496
Joined      357
Name: count, dtype: int64

In [223]:
le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [224]:
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train,y_train)

In [225]:
rf = RandomForestClassifier()

rf.fit(X_train,y_train)

In [226]:
y_pred = rf.predict(X_test)

In [227]:

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# ...existing code...

Accuracy: 0.8254080908445706
              precision    recall  f1-score   support

           0       0.67      0.67      0.67       373
           1       0.72      0.66      0.69        97
           2       0.90      0.90      0.90       939

    accuracy                           0.83      1409
   macro avg       0.76      0.75      0.75      1409
weighted avg       0.82      0.83      0.82      1409

