In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# Load the raw challenge dataset from the notebook-relative datasets folder.
df = pd.read_csv(filepath_or_buffer="./datasets/Data_Science_Challenge.csv")

In [3]:
# Hold out 30% of the rows for evaluation. Stratifying on the target keeps the
# churn ratio the same in both splits; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.3,
    random_state=77,
    shuffle=True,
    stratify=df["churn"].values,
)

In [4]:
# Class balance of the target in the training split, before any resampling.
df_train.loc[:, "churn"].value_counts()

churn
False    1995
True      338
Name: count, dtype: int64

### Oversampling the minority class (no undersampling is applied)

In [5]:
from imblearn.over_sampling import RandomOverSampler

# NOTE: despite its name, `rus` is a random *over*-sampler. It is fitted on the
# training split only, so the held-out test split is never resampled.
rus = RandomOverSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(
    df_train.drop(columns=["churn"]),
    df_train["churn"],
)

# Re-attach the target to the resampled features and renumber the rows.
df_train = pd.concat([X_resampled, y_resampled], axis="columns").reset_index(drop=True)

print(df_train["churn"].value_counts())


churn
False    1995
True     1995
Name: count, dtype: int64


In [6]:
def remove_columns(dataframe):
    """Collapse per-period usage columns into totals and drop the phone number.

    The day/eve/night minutes, charge and calls columns are each summed row-wise
    into ``total_minutes``, ``total_charge`` and ``total_calls``; the source
    columns are then removed. The international usage columns are not part of
    any total and are left in the frame unchanged.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the nine per-period usage columns and ``"phone number"``.
        It is not modified; the work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        New frame with the three total columns appended (in the order minutes,
        charge, calls) and without the per-period columns or ``"phone number"``.

    Raises
    ------
    KeyError
        If any of the expected columns is missing.
    """
    # Total column name -> the per-period columns it replaces (intl excluded).
    aggregates = {
        "total_minutes": ["total day minutes", "total eve minutes", "total night minutes"],
        "total_charge": ["total day charge", "total eve charge", "total night charge"],
        "total_calls": ["total day calls", "total eve calls", "total night calls"],
    }

    result = dataframe.copy()
    for total_name, parts in aggregates.items():
        result[total_name] = result[parts].sum(axis=1)
        result = result.drop(columns=parts)

    return result.drop(columns=["phone number"])

# Aggregate the usage columns of the (resampled) training split.
train_data = df_train.pipe(remove_columns)

In [7]:
from sklearn.preprocessing import OneHotEncoder

# Dense output so the encoded block can be wrapped in a DataFrame directly;
# categories not seen during fitting are ignored instead of raising.
one_hot_encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown="ignore",
)

In [8]:
# Object-dtype columns are the ones that get one-hot encoded.
object_cols = [col for col in train_data.columns if train_data[col].dtype == "O"]

# Fit the encoder on the training data only; the same fitted encoder is reused
# to transform the test split.
encoded_data = one_hot_encoder.fit_transform(train_data[object_cols])

# Let the fitted encoder name its own output columns ("<source column>_<category>").
# These names are unique, in the same order as the columns of `encoded_data`, and
# keep the originating feature visible, unlike a hand-built positional suffix.
cats = one_hot_encoder.get_feature_names_out(object_cols).tolist()

In [9]:
# Append the one-hot columns next to the existing features. Both frames carry a
# fresh 0..n-1 index, so rows line up by position.
train_data = pd.concat(
    [
        train_data.reset_index(drop=True),
        pd.DataFrame(encoded_data, columns=cats),
    ],
    axis=1,
)

In [10]:
from collections import Counter

from xgboost import XGBClassifier

# Negative-to-positive label ratio in the training data. The training split has
# already been balanced by oversampling, so this evaluates to 1.0 here.
cnt_y_train = Counter(train_data["churn"])
scale_pos_weight = cnt_y_train[False] / cnt_y_train[True]
print("scale_pos_weight ", scale_pos_weight)

clf = XGBClassifier(
    objective="binary:logistic",
    scale_pos_weight=scale_pos_weight,
)

scale_pos_weight  1.0


In [11]:
# Separate the target, then drop it together with the raw categorical columns
# (their one-hot encoded versions are already part of the frame).
y_train = train_data["churn"]
train_data = train_data.drop(columns=[*object_cols, "churn"])

clf.fit(train_data, y_train)

In [12]:
# Apply the same preprocessing to the held-out split. The encoder is only used
# to transform here; it was fitted on the training data.
test_data = remove_columns(df_test).reset_index(drop=True)
test_data = pd.concat(
    [
        test_data,
        pd.DataFrame(one_hot_encoder.transform(test_data[object_cols]), columns=cats),
    ],
    axis="columns",
).drop(columns=object_cols)

In [13]:
# Split the held-out frame into target and feature matrix.
y_test = test_data.loc[:, "churn"]
test_data = test_data.drop(columns="churn")

In [14]:
from sklearn.metrics import classification_report

# Predict on the untouched (non-resampled) test split and summarise per class.
y_pred = clf.predict(test_data)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.98      1.00      0.99       855
        True       0.97      0.87      0.92       145

    accuracy                           0.98      1000
   macro avg       0.97      0.93      0.95      1000
weighted avg       0.98      0.98      0.98      1000



### F1 Score

In [15]:
from sklearn.metrics import f1_score

# F1 of the positive (churn == True) class on the test split.
print("F1 SCORE: ", f1_score(y_test, y_pred))

F1 SCORE:  0.9163636363636363
