In [18]:
import zipfile
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
zf = zipfile.ZipFile("playground-series-s4e1.zip")
zf.namelist()

['sample_submission.csv', 'test.csv', 'train.csv']

In [4]:
df_train = pd.read_csv(zf.open("train.csv"))
df_test = pd.read_csv(zf.open("test.csv"))

In [6]:
#Checking to see if any rows/columns are missing any data
df_train.describe()

Unnamed: 0,id,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,165034.0,165034.0,165034.0,165034.0,165034.0,165034.0,165034.0,165034.0,165034.0,165034.0,165034.0
mean,82516.5,15692010.0,656.454373,38.125888,5.020353,55478.086689,1.554455,0.753954,0.49777,112574.822734,0.211599
std,47641.3565,71397.82,80.10334,8.867205,2.806159,62817.663278,0.547154,0.430707,0.499997,50292.865585,0.408443
min,0.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,41258.25,15633140.0,597.0,32.0,3.0,0.0,1.0,1.0,0.0,74637.57,0.0
50%,82516.5,15690170.0,659.0,37.0,5.0,0.0,2.0,1.0,0.0,117948.0,0.0
75%,123774.75,15756820.0,710.0,42.0,7.0,119939.5175,2.0,1.0,1.0,155152.4675,0.0
max,165033.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [11]:
#Checking if any columns have words that need to be converted to integers/floats for prediction calculations

df_train.head(5)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [10]:
df_dummy_train = pd.get_dummies(df_train, drop_first= False, dtype=float)

df_dummy_train.head(5)

Unnamed: 0,id,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,...,Surname_Zubarev,Surname_Zubareva,Surname_Zuev,Surname_Zuyev,Surname_Zuyeva,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,0,15674932,668,33.0,3,0.0,2,1.0,0.0,181449.97,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1,15749177,627,33.0,1,0.0,2,1.0,1.0,49503.5,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,2,15694510,678,40.0,10,0.0,2,1.0,0.0,184866.69,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,3,15741417,581,34.0,2,148882.54,1,1.0,1.0,84560.88,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,4,15766172,716,33.0,5,0.0,2,1.0,1.0,15068.83,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [16]:
y = df_dummy_train['Exited']
x = df_dummy_train.drop('Exited', axis =1)

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state=42)

In [19]:
model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')
model.fit(x_train, y_train)

In [20]:
y_pred = model.predict(x_test)

In [21]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 0.87

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.95      0.92     26052
           1       0.75      0.57      0.65      6955

    accuracy                           0.87     33007
   macro avg       0.82      0.76      0.78     33007
weighted avg       0.86      0.87      0.86     33007

