## Import packages

In [14]:
import numpy as np
import pandas as pd 
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
import scipy.stats as stats

## Import data

In [5]:
# Read the customer-response table; fields in this file are separated by ';'.
customer_response = pd.read_csv(
    "customer_response_final.csv",
    sep=";",
)
# Show the first five rows as the cell output.
customer_response.head(n=5)

Unnamed: 0,individualnumber,discount_type1_count,discount_type1_mean,discount_type2_count,discount_type2_mean,discount_type3_count,discount_type3_mean,Hygiene_Quantity,Hygiene_Amount,Other_Quantity,...,PersonalCare_Amount,Beverage_Quantity,Beverage_Amount,isvirtual_ratio,gender,dateofbirth,category_number,deserved_amount,reward_amount,response
0,66365473,13,0.576231,9,2.438889,1,34.98,2.0,24.5,17.0,...,0.0,13.0,129.65,0.0,Female,1990,9020,8.0,1.0,0
1,82571608,53,1.303816,52,5.461923,28,6.682857,31.0,386.95,22.0,...,117.88,74.0,433.72,0.0,Male,1992,9035,48.0,4.0,0
2,37719829,9,2.710704,5,1.19,12,3.169167,15.0,301.41,0.0,...,0.0,6.0,34.4,0.0,Female,1978,9012,28.0,2.0,0
3,99398913,854,1.255426,99,5.23303,256,4.255898,159.0,2742.4,41.0,...,402.38,508.0,2236.3,0.0,Male,1997,9012,31.0,4.0,0
4,54766363,14,2.558548,12,5.929167,3,5.096667,18.0,261.65,20.0,...,0.0,16.0,155.71,0.0,Male,1968,9005,32.0,3.0,0


In [6]:
# True when at least one cell anywhere in the frame is missing.
customer_response.isna().to_numpy().any()

False

### Z-Score Normalization

In [7]:
# Columns that get z-score scaling below, listed in the order they are processed.
Columns_norm = [
    # reward-related columns
    'reward_amount',
    'deserved_amount',
    # one Quantity/Amount pair per product category
    'Hygiene_Quantity', 'Hygiene_Amount',
    'Other_Quantity', 'Other_Amount',
    'Food_Quantity', 'Food_Amount',
    'PersonalCare_Quantity', 'PersonalCare_Amount',
    'Beverage_Quantity', 'Beverage_Amount',
]

In [8]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched.
cr = customer_response.copy(deep=True)

In [9]:
# Replace every column named in Columns_norm with its z-score.
# NOTE(review): the statistics are computed over the whole table, i.e. before
# any cross-validation split is made further down — confirm this is intended.
cr = cr.assign(**{column: stats.zscore(cr[column]) for column in Columns_norm})

### Split X and y

In [10]:
# Feature matrix: every column except the target and three columns removed by name.
excluded_columns = ['response', 'individualnumber', 'gender', 'dateofbirth']
X = cr.drop(columns=excluded_columns)

# Target vector.
y = cr['response']

---

### XGBClassifier 

In [202]:
# Stratified 5-fold splitter; shuffling uses a fixed seed so the folds repeat.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Search space: single-element lists pin a setting, the others are searched
# exhaustively (2 seeds x 2 subsample x 2 colsample x 3 min_child_weight).
param_grid = dict(
    max_depth=[10],
    scale_pos_weight=[150],
    learning_rate=[0.01],
    n_estimators=[1200],
    random_state=[42, 58],
    subsample=[0.6, 0.8],
    colsample_bytree=[0.6, 0.8],
    min_child_weight=[6, 8, 10],
)

# Base estimator; every searched setting comes from param_grid.
model = XGBClassifier()

# Exhaustive search scored by F1, using all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
)

# Run the search on the full feature matrix / target.
grid_search.fit(X, y)

# Report the winning combination.
best_xgb_params = grid_search.best_params_
print("Best hyperparameters: {}".format(best_xgb_params))

Best hyperparameters: {'colsample_bytree': 0.6, 'learning_rate': 0.01, 'max_depth': 10, 'min_child_weight': 8, 'n_estimators': 1200, 'random_state': 42, 'scale_pos_weight': 150, 'subsample': 0.8}


In [203]:
# Per-fold F1 scores of the tuned configuration, in fold order.
f1_scores = []

# The same hyperparameters are used in every fold: the grid-search winner.
best_params = grid_search.best_params_

for train_idx, val_idx in cv.split(X, y):
    # Positional slices for this fold.
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_val = X.iloc[val_idx]
    y_val = y.iloc[val_idx]

    # Fresh model per fold, trained only on the training part.
    model = xgb.XGBClassifier(**best_params)
    model.fit(X_train, y_train)

    # Score the held-out part.
    y_pred = model.predict(X_val)
    fold_f1_score = f1_score(y_val, y_pred)

    f1_scores.append(fold_f1_score)
    print("Fold F1 score: {:.4f}".format(fold_f1_score))

Fold F1 score: 0.3953
Fold F1 score: 0.4301
Fold F1 score: 0.4773
Fold F1 score: 0.4545
Fold F1 score: 0.4898


In [205]:
# Average the per-fold F1 scores collected for the tuned XGBoost model.
mean_f1score_XGB = sum(f1_scores) / len(f1_scores)

# Print the value computed above. The previous version printed `mean_f1_score`,
# a name this notebook never defines, so the cell failed on a fresh kernel.
print("Mean F1 score: {:.4f}".format(mean_f1score_XGB))

Mean F1 score: 0.4494


---

### Logistic Regression

In [None]:
# Regularisation strengths tried for both penalties.
C_values = [0.1, 1, 10]

# Hyperparameters to tune. LogisticRegression's default solver ('lbfgs') does
# not support the l1 penalty, so a single {'C', 'penalty'} grid makes every l1
# candidate fail to fit. The l1 candidates are therefore paired with
# 'liblinear', which supports l1; the l2 candidates keep the default solver.
param_grid = [
    {'C': C_values, 'penalty': ['l2']},
    {'C': C_values, 'penalty': ['l1'], 'solver': ['liblinear']},
]

# Initialize the logistic regression model
log_reg = LogisticRegression()

# Stratified 5-fold splitter, shuffled with a fixed seed
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grid search scored by F1
grid_search = GridSearchCV(log_reg, param_grid, cv=kf, scoring='f1')

# Fit the grid search object to the data (last expression: shows the fitted object)
grid_search.fit(X, y)

In [24]:
# Report the parameter combination selected by the logistic-regression search.
best_logreg_params = grid_search.best_params_
print('Best Hyperparameters:', best_logreg_params)

Best Hyperparameters: {'C': 0.1, 'penalty': 'l2'}


In [25]:
# Print the best mean cross-validated score. The search above was built with
# scoring='f1', so this value is an F1 score, not an accuracy.
print('Best F1 Score:', grid_search.best_score_)

Best Accuracy: 0.16319936779743344


In [None]:
# Re-evaluate a fresh model built from the best parameters, using the same
# folds (kf) and the same F1 scoring as the grid search.
log_reg_best = LogisticRegression(**grid_search.best_params_)
cv_scores = cross_val_score(log_reg_best, X, y, cv=kf, scoring='f1')

# Print the per-fold scores (the closing parenthesis was missing, which made
# this cell a SyntaxError).
print('Cross-validation scores:', cv_scores)

Cross-validation scores: 0.16326531 0.15686275 0.19607843 0.11111111 0.18867925

In [27]:
# Average of the per-fold scores held in cv_scores.
mean_cv_score = np.mean(cv_scores)
print('Mean cross-validation score:', mean_cv_score)

Mean cross-validation score: 0.16319936779743344


---

### GradientBoostingClassifier

In [18]:
# Stratified 5-fold splitter, shuffled with a fixed seed
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hyperparameters to tune: number of boosting stages and tree depth
param_grid = {'n_estimators': [50, 100, 150], 'max_depth': [3, 5, 10]}

# Seed the classifier so repeated runs give the same trees. Without a
# random_state, refitting identical parameters on identical folds can produce
# slightly different scores from run to run.
gbc = GradientBoostingClassifier(random_state=42)

# Grid search scored by macro-averaged F1.
# NOTE(review): the XGBoost and logistic-regression searches in this notebook
# use scoring='f1', so their scores are not directly comparable with this one.
grid_search = GridSearchCV(gbc, param_grid, cv=kf, scoring='f1_macro')

# Fit the grid search object to the data (last expression: shows the fitted object)
grid_search.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=GradientBoostingClassifier(),
             param_grid={'max_depth': [3, 5, 10],
                         'n_estimators': [50, 100, 150]},
             scoring='f1_macro')

In [19]:
# Report the parameter combination selected by the gradient-boosting search.
best_gbc_params = grid_search.best_params_
print('Best Hyperparameters:', best_gbc_params)

Best Hyperparameters: {'max_depth': 3, 'n_estimators': 100}


In [20]:
# Best mean cross-validated score found by the search; the search was built
# with scoring='f1_macro', so this is a macro-averaged F1.
best_macro_f1 = grid_search.best_score_
print('Best F1 Score:', best_macro_f1)

Best F1 Score: 0.6377671010577799


In [21]:
# Re-evaluate a fresh model built from the best parameters on the same folds
# (kf) with the same macro-F1 scoring. The classifier is seeded so this cell
# prints the same scores on every run; unseeded, they can vary between runs.
gbc_best = GradientBoostingClassifier(random_state=42, **grid_search.best_params_)
cv_scores = cross_val_score(gbc_best, X, y, cv=kf, scoring='f1_macro')

# Print the per-fold scores
print('Cross-validation F1 Scores:', cv_scores)

Cross-validation F1 Scores: [0.64590962 0.61442932 0.63168345 0.60702551 0.68360136]


In [22]:
# Average of the per-fold macro-F1 scores held in cv_scores.
mean_cv_f1 = np.mean(cv_scores)
print('Mean cross-validation F1 score:', mean_cv_f1)

Mean cross-validation F1 score: 0.6365298520132576
