# GDATABT
## Ahmet Emre Usta
## Engincan Göçer
### Hacettepe University
### 25.04.2023

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold


In [3]:
# Load the training data from the CSV file in the working directory
train_csv_path = 'train.csv'
df_train = pd.read_csv(train_csv_path)

In [4]:
# Preview the first rows of the raw training frame
df_train.head()

Unnamed: 0,UserID,UserFirstTransaction,Gender,Location,Age,isChurn,TotalPrice,TotalSales,TotalDiscount,UserLastTransaction,...,Indoor Sports,Kitchen Electronics,Male Fashion,Male Shoes,Outdoor Sports,Smart Phones,Smart Watches,Sound Systems,Sport Shoes,TVs and TV Sets
0,500234730,2017-01-24 00:00:00.000,MALE,ISTANBUL,33,1,28.575,2,1,2017-01-29 18:49:51.470,...,0,0,0,1,0,0,0,0,0,0
1,500234752,2013-10-24 00:00:00.000,MALE,ISTANBUL,34,0,310.05,4,0,2017-06-25 18:57:54.810,...,0,0,0,0,0,1,0,0,1,1
2,500234829,2009-09-01 00:00:00.000,MALE,ESKISEHIR,42,0,44.85,1,0,2017-02-12 15:23:21.330,...,0,0,0,0,0,0,0,0,0,1
3,500234840,2017-05-31 19:20:26.970,FEMALE,IZMIR,39,1,69.0,2,0,2017-05-31 19:22:57.230,...,0,0,0,0,0,0,0,0,0,0
4,500235181,2010-03-25 00:00:00.000,FEMALE,ANKARA,40,0,845.1,21,0,2017-06-30 15:19:51.310,...,0,1,1,0,0,0,0,3,0,4


In [5]:
# Summarise column names, non-null counts and dtypes to spot columns that need encoding
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1674 entries, 0 to 1673
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UserID                   1674 non-null   int64  
 1   UserFirstTransaction     1674 non-null   object 
 2   Gender                   1674 non-null   object 
 3   Location                 1674 non-null   object 
 4   Age                      1674 non-null   int64  
 5   isChurn                  1674 non-null   int64  
 6   TotalPrice               1674 non-null   float64
 7   TotalSales               1674 non-null   int64  
 8   TotalDiscount            1674 non-null   int64  
 9   UserLastTransaction      1674 non-null   object 
 10  UserActiveDays           1674 non-null   int64  
 11  TotalWeb                 1674 non-null   int64  
 12  TotalMobile              1674 non-null   int64  
 13  TotalCashPayment         1674 non-null   int64  
 14  TotalMobilPayment       

In [6]:
# List the columns stored with the generic object dtype; these are the
# non-numeric columns that have to be encoded before modelling
object_columns = df_train.select_dtypes(include='object').columns
object_columns

Index(['UserFirstTransaction', 'Gender', 'Location', 'UserLastTransaction'], dtype='object')

In [7]:
# Drop the UserID column so it does not end up in the feature matrix.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of raising KeyError.
df_train = df_train.drop(columns=['UserID'], errors='ignore')

In [8]:
# Separate the feature columns (X) from the target column isChurn (y);
# no rows are held out here, the whole frame is kept for cross-validation
X = df_train.drop(columns=['isChurn'])
y = df_train['isChurn']

In [9]:
# Encode Gender as integer codes; the fitted encoder is kept in `le`
# so the same mapping can be reapplied to other data
le = LabelEncoder().fit(X["Gender"])
X["Gender"] = le.transform(X["Gender"])

In [10]:
# One-hot encode Location: the column is replaced by one
# Location_<value> indicator column per distinct location
X = pd.get_dummies(data=X, columns=['Location'])

In [11]:
# Parse both transaction timestamp columns from strings into datetime values
for date_col in ('UserFirstTransaction', 'UserLastTransaction'):
    X[date_col] = pd.to_datetime(X[date_col])

In [12]:
# Turn each transaction timestamp into its integer day ordinal so the model
# receives a plain numeric feature (the time-of-day part is discarded)
for date_col in ("UserFirstTransaction", "UserLastTransaction"):
    X[date_col] = X[date_col].apply(lambda ts: ts.toordinal())

In [13]:
# Inspect the fully encoded feature matrix before it is passed to the model
X.head()

Unnamed: 0,UserFirstTransaction,Gender,Age,TotalPrice,TotalSales,TotalDiscount,UserLastTransaction,UserActiveDays,TotalWeb,TotalMobile,...,TVs and TV Sets,Location_ADANA,Location_ANKARA,Location_ANTALYA,Location_BURSA,Location_ESKISEHIR,Location_ISTANBUL,Location_IZMIR,Location_KAYSERI,Location_TRABZON
0,736353,1,33,28.575,2,1,736358,5,0,2,...,0,0,0,0,0,0,1,0,0,0
1,735165,1,34,310.05,4,0,736505,1340,2,2,...,1,0,0,0,0,0,1,0,0,0
2,733651,1,42,44.85,1,0,736372,2721,1,0,...,1,0,0,0,0,1,0,0,0,0
3,736480,0,39,69.0,2,0,736480,0,2,0,...,0,0,0,0,0,0,0,1,0,0
4,733856,0,40,845.1,21,0,736510,2654,21,0,...,4,0,1,0,0,0,0,0,0,0


In [14]:
# Hyperparameter candidates; every combination is evaluated by the grid search
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01],
    'n_estimators': [50, 100, 200],
}

# Base estimator: XGBoost classifier with a fixed seed
clf = xgb.XGBClassifier(random_state=42)

# Stratified, shuffled 5-fold cross-validation with a fixed seed
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidates are scored by ROC AUC on the cross-validation folds
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv_splitter,
)

# Run the search on the full feature matrix and target
grid_search.fit(X, y)

# Report the best hyperparameter combination and its cross-validated score
print("Best parameters:", grid_search.best_params_)
print("Best score (ROC AUC):", grid_search.best_score_)


Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}
Best score (ROC AUC): 0.8201478426918849


In [15]:
# Save the fitted GridSearchCV object to disk.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; the original bare open() call leaked the file handle.
# NOTE: only unpickle this file from a trusted source - pickle.load can
# execute arbitrary code.
with open('xgboost.pkl', 'wb') as model_file:
    pickle.dump(grid_search, model_file)