In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# Load the raw dataset from a CSV path relative to the current working directory.
df = pd.read_csv("./datasets/Data_Science_Challenge.csv")

In [3]:
# Hold out 30% of the rows as the test set. The split is shuffled, stratified
# on the "churn" target, and seeded (random_state=77) so it is repeatable.
df_train, df_test = train_test_split(df, test_size=0.3, shuffle=True, stratify=df["churn"], random_state=77)

In [4]:
# Column names, non-null counts and dtypes of the training split.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2333 entries, 906 to 2682
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   2333 non-null   object 
 1   account length          2333 non-null   int64  
 2   area code               2333 non-null   int64  
 3   phone number            2333 non-null   object 
 4   international plan      2333 non-null   object 
 5   voice mail plan         2333 non-null   object 
 6   number vmail messages   2333 non-null   int64  
 7   total day minutes       2333 non-null   float64
 8   total day calls         2333 non-null   int64  
 9   total day charge        2333 non-null   float64
 10  total eve minutes       2333 non-null   float64
 11  total eve calls         2333 non-null   int64  
 12  total eve charge        2333 non-null   float64
 13  total night minutes     2333 non-null   float64
 14  total night calls       2333 non-null   int

In [5]:
# Class balance of the "churn" target within the training split.
df_train["churn"].value_counts()

churn
False    1995
True      338
Name: count, dtype: int64

In [6]:
# Inspect five randomly sampled rows of these three columns.
# NOTE(review): sample() is called without random_state, so the displayed rows
# change from run to run.
df_train[["account length", "area code", "phone number"]].sample(5)

Unnamed: 0,account length,area code,phone number
3080,82,415,415-8200
895,106,415,343-2350
2185,126,415,386-9711
932,74,415,366-5918
2711,124,510,359-9223


In [7]:
def remove_columns(dataframe):
    """Collapse the per-period usage columns into overall totals.

    The day / eve / night / intl columns for minutes, charge and calls are
    each summed row-wise into ``total_minutes``, ``total_charge`` and
    ``total_calls``. The twelve per-period source columns and
    ``phone number`` are then dropped. The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing ``phone number`` and the twelve
        ``total <period> <minutes|charge|calls>`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame holding the remaining original columns (original order)
        followed by ``total_minutes``, ``total_charge`` and ``total_calls``.

    Raises
    ------
    KeyError
        If any of the expected columns is missing.
    """
    minute_cols = ["total day minutes", "total eve minutes", "total night minutes", "total intl minutes"]
    charge_cols = ["total day charge", "total eve charge", "total night charge", "total intl charge"]
    call_cols = ["total day calls", "total eve calls", "total night calls", "total intl calls"]

    # assign() returns a new frame, so the caller's data stays untouched.
    with_totals = dataframe.assign(
        total_minutes=dataframe[minute_cols].sum(axis=1),
        total_charge=dataframe[charge_cols].sum(axis=1),
        total_calls=dataframe[call_cols].sum(axis=1),
    )

    # Drop every aggregated source column plus "phone number" in one pass.
    obsolete = ["phone number", *minute_cols, *charge_cols, *call_cols]
    return with_totals.drop(columns=obsolete)

# Build the training frame with the aggregated usage totals. df_train is left
# as-is, since remove_columns returns a new frame.
train_data = remove_columns(df_train)


In [8]:
# Keep every row whose area code occurs more than once (keep=False marks all
# occurrences), then list the distinct states seen for each area code.
train_data[train_data.duplicated(subset="area code", keep=False)].groupby(by=["area code"])["state"].unique()
    

area code
408    [VT, CA, NC, KS, ND, ID, LA, ME, WI, NV, IA, F...
415    [RI, MT, VA, IL, NE, WV, PA, UT, CA, MO, SC, M...
510    [DE, MS, MN, AZ, ID, NH, MT, TN, KS, WV, PA, O...
Name: state, dtype: object

In [9]:
# Names of the object-dtype columns; these are the ones passed to the one-hot
# encoder further down.
object_cols = [name for name, dtype in train_data.dtypes.items() if dtype == "O"]
print(object_cols)

['state', 'international plan', 'voice mail plan']


In [10]:
from sklearn.preprocessing import OneHotEncoder
# handle_unknown="ignore": categories not seen during fit do not raise at
# transform time (per the scikit-learn docs they are encoded as all zeros).
# sparse_output=False: return a dense array rather than a sparse matrix.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

In [11]:
# Fit the encoder on the training split's categorical columns and encode them.
encoded_data = one_hot_encoder.fit_transform(train_data[object_cols])

# Take the output column names from the encoder itself. They have the form
# "<column>_<category>" (e.g. "international plan_yes"), so they are unique
# across columns that share category values such as "yes"/"no", and they stay
# readable in feature-importance output. The order matches encoded_data.
cats = one_hot_encoder.get_feature_names_out().tolist()


In [12]:

# Append the one-hot columns to the training frame. The index is reset first so
# both frames share the same 0..n-1 index and concat aligns them row by row.
# NOTE(review): this rebinds train_data to its own output, so re-running the
# cell appends the encoded columns a second time.
train_data = pd.concat([train_data.reset_index(drop=True), pd.DataFrame(encoded_data, columns=cats)], axis="columns")

In [13]:
from xgboost import XGBClassifier

# XGBoost classifier with the binary logistic objective; no other
# hyperparameters are set here.
clf = XGBClassifier(objective="binary:logistic")

In [14]:
# Split the target off, then remove it and the raw categorical columns (already
# one-hot encoded) so only model features remain.
# NOTE: train_data is rebound to the feature-only frame, so re-running this
# cell raises KeyError because "churn" is no longer present.
y_train = train_data.loc[:, "churn"]
train_data = train_data.drop(columns=[*object_cols, "churn"])

clf.fit(train_data, y_train)

In [15]:
# Prepare the test split the same way as the training split: aggregate the
# usage columns, then encode the categoricals with the already-fitted encoder
# (transform only, no refit on test data).
test_data = remove_columns(df_test).reset_index(drop=True)
test_data = pd.concat(
    [
        # Raw categoricals are replaced by their one-hot columns.
        test_data.drop(columns=object_cols),
        pd.DataFrame(one_hot_encoder.transform(test_data[object_cols]), columns=cats),
    ],
    axis="columns",
)

In [16]:
# Separate the target from the test features.
# NOTE: test_data is rebound without "churn", so re-running this cell alone
# raises KeyError.
y_test = test_data.loc[:, "churn"]
test_data = test_data.drop(columns="churn")

In [17]:
# Predict class labels for the held-out test features.
y_pred = clf.predict(test_data)

In [18]:
from sklearn.metrics import classification_report
# Per-class precision, recall, F1 and support on the held-out test set.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

       False       0.95      0.99      0.97       855
        True       0.93      0.70      0.80       145

    accuracy                           0.95      1000
   macro avg       0.94      0.85      0.89      1000
weighted avg       0.95      0.95      0.95      1000

