In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from scoring_code import incr_act_top10

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

pd.set_option('display.max_columns', None)

# Clean Data
Features with more than 70% NAs within the feature will be dropped

In [2]:
data = pd.read_parquet('data/training_data.parquet')
features = pd.read_csv('data/ordered_feature_dictionary.csv')
feature_names = features['Feature Name'].values
data = data[feature_names]

to_drop = data.columns[data.isna().sum() >= (len(data.index) * 0.7)]
data = data.drop(to_drop, axis=1)

# data.describe(include='all').to_csv('data/descriptive_stats.csv')

data.head()

Unnamed: 0,customer,merchant,ind_recommended,activation,customer_profile_01,customer_profile_02,customer_profile_03,customer_profile_04,customer_spend_01,customer_spend_02,customer_spend_03,customer_spend_04,customer_spend_05,customer_spend_06,customer_spend_07,customer_spend_13,customer_spend_16,customer_spend_18,customer_spend_19,customer_digital_activity_01,customer_digital_activity_02,customer_digital_activity_03,customer_digital_activity_05,customer_digital_activity_06,customer_digital_activity_10,customer_digital_activity_11,customer_digital_activity_12,customer_digital_activity_13,customer_digital_activity_14,customer_digital_activity_15,customer_digital_activity_16,customer_digital_activity_17,customer_digital_activity_20,customer_digital_activity_21,customer_digital_activity_22,customer_industry_spend_01,customer_industry_spend_02,customer_industry_spend_03,customer_industry_spend_04,customer_industry_spend_05,customer_merchant_03,distance_01,distance_02,distance_03,distance_04,distance_05,merchant_profile_01,merchant_profile_02,merchant_profile_03,merchant_spend_01,merchant_spend_02,merchant_spend_03,merchant_spend_04,merchant_spend_05,merchant_spend_06,merchant_spend_07,merchant_spend_08,merchant_spend_09,merchant_spend_10
0,168972,152285,0,0,5466.06,1700.0,58.434969,86.0,107.215862,14.0,133.0,4477.0,29719.09,782.0,306.0,3.0,1.0,0.714531,20.85,0.0,32.5,,,,,,,,,,,,0.444444,0.017921,0.000468,26.686594,74.0,3682.75,138.0,111.0,0.90551,0.307692,1.219756,13.0,4.0,15.856826,101.0,0.157534,65923.0,29.781042,43.0,0.0,0.0,0.0,32.0,1429.49,48.0,49466.0,29.18
1,212404,39032,0,0,781.56,597.41,5.392089,125.0,35.552,2.0,8.0,17577.0,1051.4,52.0,43.0,1.0,1.0,0.871597,24.81,0.419355,7.0,0.0,,,,,,,,,,,,,,50.928261,3.0,1171.35,23.0,17.0,0.961583,3.808333,6.998555,1.0,3.808333,6.998555,403.0,0.084416,7801.0,34.643313,97.0,0.0,0.0,0.0,15.0,5646.86,163.0,3638.0,28.465
2,225178,7439,0,0,1457.84,1200.0,33.780445,180.0,31.623103,11.0,62.0,49494.0,4695.22,196.0,136.0,1.0,2.0,0.076536,32.26,0.836364,0.0,1.0,,,,,,,,,,,,0.0,0.0,48.837872,19.0,2295.38,47.0,42.0,0.327672,,0.129853,13.5,,1.753009,406.0,0.24,12868.0,1731.0,2.0,1731.0,2.0,2.0,8.0,3462.0,2.0,3912.0,421.5
3,183948,485069,0,0,351.22,500.0,37.340085,134.0,112.277391,16.0,33.0,,5190.94,167.0,112.0,,,,,0.952381,28.666667,0.0,1.0,4.0,20.0,19.0,19.0,13.0,19.0,7.0,3.0,0.0,,0.0,0.0,,,,,,0.769936,,,,2.0,9.000063,326.0,0.1875,23553.0,54.8,4.0,0.0,0.0,0.0,62.0,274.0,5.0,28919.0,50.0
4,210107,536004,1,0,831.67,99.0,77.794164,114.0,448.427273,5.0,8.0,,11713.96,33.0,28.0,,,,,0.754386,15.0,0.0,1.0,2.0,15.0,15.0,15.0,7.0,15.0,4.0,3.0,10.0,,0.0,0.0,,,,,,,,,,6.5,1.767939,326.0,0.428571,308.0,166.0,1.0,0.0,0.0,0.0,74.0,166.0,1.0,1086.0,69.509


# Create new target
reco + activated = 1
<br>else = 0
- reco + not activated
- no reco + activated
- **no reco + not activated** --> *not sure if we should have a separate category for this since having this is not exactly undesirable*

In [3]:
data['new_target'] = ((data['ind_recommended'] == 1) & (data['activation'] == 1)).astype(int)

data['new_target'].value_counts()

0    12220135
1        9843
Name: new_target, dtype: int64

# XGBoost without sampling

In [11]:
# Proceed with the original architecture using 'new_target' as the variable to predict
X = data.drop(columns=['customer', 'merchant', 'ind_recommended', 'activation', 'new_target'])
y = data['new_target']

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=4287)

# Define XGBoost model
model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

# Train the model
model.fit(X_train, y_train)

# Predict the probability of activation on the validation set
y_valid_pred_proba = model.predict_proba(X_valid)[:, 1]

# Add predictions back to the validation data
X_valid['predicted_score'] = y_valid_pred_proba

# Add the customer and merchant columns back for scoring
X_valid['customer'] = data.loc[X_valid.index, 'customer']
X_valid['merchant'] = data.loc[X_valid.index, 'merchant']
X_valid['ind_recommended'] = data.loc[X_valid.index, 'ind_recommended']
X_valid['activation'] = y_valid

# Calculate the Incremental Activation Rate using the provided function
incremental_activation_rate = incr_act_top10(input_df=X_valid, pred_col='predicted_score')

print('Incremental Activation Rate:', incremental_activation_rate)

   ind_recommended  avg_30d_act
0                0     0.000000
1                1     0.006478
Incremental Activation Rate: 0.006477789251053346


# XGBoost with class weights

In [14]:
# Proceed with the original architecture using 'new_target' as the variable to predict
X = data.drop(columns=['customer', 'merchant', 'ind_recommended', 'activation', 'new_target'])
y = data['new_target']

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate class weights
weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = {0: weights[0], 1: weights[1]}

# Define XGBoost model with class weights
model = xgb.XGBClassifier(scale_pos_weight=class_weights[1], objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

# Train the model
print('Training the model...')
model.fit(X_train, y_train)

# Predict the probability of activation on the validation set
y_valid_pred_proba = model.predict_proba(X_valid)[:, 1]

# Add predictions back to the validation data
X_valid['predicted_score'] = y_valid_pred_proba

# Add the customer and merchant columns back for scoring
X_valid['customer'] = data.loc[X_valid.index, 'customer']
X_valid['merchant'] = data.loc[X_valid.index, 'merchant']
X_valid['ind_recommended'] = data.loc[X_valid.index, 'ind_recommended']
X_valid['activation'] = y_valid

# Calculate the Incremental Activation Rate using the provided function
incremental_activation_rate = incr_act_top10(input_df=X_valid, pred_col='predicted_score')

print('Incremental Activation Rate:', incremental_activation_rate)

Training the model...
   ind_recommended  avg_30d_act
0                0     0.000000
1                1     0.006559
Incremental Activation Rate: 0.006559340074507717


# Train & Predict Eval
Train on the full training data set

In [20]:
data = pd.read_parquet('data/training_data.parquet')
features = pd.read_csv('data/ordered_feature_dictionary.csv')
feature_names = features['Feature Name'].values
data = data[feature_names]

to_drop = data.columns[data.isna().sum() >= (len(data.index) * 0.7)]
data = data.drop(to_drop, axis=1)
data['new_target'] = ((data['ind_recommended'] == 1) & (data['activation'] == 1)).astype(int)

X = data.drop(columns=['customer', 'merchant', 'ind_recommended', 'activation', 'new_target'])
y = data['new_target']

In [21]:
eval_data = pd.read_parquet('data/evaluation_data.parquet')

eval_customer = eval_data['customer']
eval_merchant = eval_data['merchant']
eval_data = eval_data[X.columns] # X values for the evaluation data, final predictions made on this

eval_data.head()

Unnamed: 0,customer_profile_01,customer_profile_02,customer_profile_03,customer_profile_04,customer_spend_01,customer_spend_02,customer_spend_03,customer_spend_04,customer_spend_05,customer_spend_06,customer_spend_07,customer_spend_13,customer_spend_16,customer_spend_18,customer_spend_19,customer_digital_activity_01,customer_digital_activity_02,customer_digital_activity_03,customer_digital_activity_05,customer_digital_activity_06,customer_digital_activity_10,customer_digital_activity_11,customer_digital_activity_12,customer_digital_activity_13,customer_digital_activity_14,customer_digital_activity_15,customer_digital_activity_16,customer_digital_activity_17,customer_digital_activity_20,customer_digital_activity_21,customer_digital_activity_22,customer_industry_spend_01,customer_industry_spend_02,customer_industry_spend_03,customer_industry_spend_04,customer_industry_spend_05,customer_merchant_03,distance_01,distance_02,distance_03,distance_04,distance_05,merchant_profile_01,merchant_profile_02,merchant_profile_03,merchant_spend_01,merchant_spend_02,merchant_spend_03,merchant_spend_04,merchant_spend_05,merchant_spend_06,merchant_spend_07,merchant_spend_08,merchant_spend_09,merchant_spend_10
0,385.25,565.24,72.268283,423.0,112.334,4.0,41.0,104895.0,3638.63,101.0,70.0,1.0,1.0,1.711261,56.985,0.8,7.0,0.0,1.0,2.0,10.0,10.0,10.0,8.0,10.0,5.0,3.0,0.0,,,,80.5525,9.0,966.63,12.0,10.0,0.321133,,,,,1.621171,406.0,0.4375,4777.0,42.826923,12.0,42.826923,12.0,13.0,65.0,556.75,13.0,26299.0,33.3
1,385.25,565.24,72.268283,423.0,112.334,4.0,41.0,,3638.63,101.0,70.0,,,0.067151,53.27,0.8,7.0,0.0,1.0,2.0,10.0,10.0,10.0,8.0,10.0,5.0,3.0,0.0,1.0,0.0,0.000183,,,,,,0.884368,,,,5.5,2.441944,405.0,0.397059,4803.0,1591.655,2.0,487.02,1.0,1.0,4.0,3183.31,2.0,7122.0,793.29
2,385.25,565.24,72.268283,423.0,112.334,4.0,41.0,8990.0,3638.63,101.0,70.0,3.0,9.0,0.61,61.0,0.8,7.0,0.0,1.0,2.0,10.0,10.0,10.0,8.0,10.0,5.0,3.0,0.0,,0.0,0.0,71.1925,3.0,284.77,4.0,4.0,,4.4,1.950466,1.25,5.5,2.438082,202.0,,14860.0,,,,,,69.0,,,7222.0,100.0
3,385.25,565.24,72.268283,423.0,112.334,4.0,41.0,,3638.63,101.0,70.0,,,,,0.8,7.0,0.0,1.0,2.0,10.0,10.0,10.0,8.0,10.0,5.0,3.0,0.0,,,,,,,,,0.8089,,,,,2.072182,319.0,0.142857,11968.0,,,,,,-999.0,,,11410.0,252.38
4,385.25,565.24,72.268283,423.0,302.7925,3.0,37.0,,4069.02,92.0,66.0,,,,,0.8,7.0,0.0,1.0,2.0,10.0,10.0,10.0,8.0,10.0,5.0,3.0,0.0,,0.0,0.0,,,,,,0.725993,,,,10.0,2.380853,414.0,0.1,5842.0,410.0,4.0,410.0,4.0,4.0,33.0,1640.0,4.0,1847.0,87.5


In [22]:
# Calculate class weights
weights = class_weight.compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights = {0: weights[0], 1: weights[1]}

# Define XGBoost model with class weights
model = xgb.XGBClassifier(scale_pos_weight=class_weights[1], objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

# Train the model
print('Training the model...')
model.fit(X, y)

# Predict the probability of activation on the evaluation set
print('Making predictions...')
eval_pred_proba = model.predict_proba(eval_data)[:, 1]

# create dataframe with predictions according to submission guidelines (customer, merchant, predicted_score)
submission = pd.DataFrame({'customer': eval_customer, 'merchant': eval_merchant, 'predicted_score': eval_pred_proba})

# Save the submission to a CSV file
print('Saving the submission...')
submission.to_csv('data/submission_v1.csv', index=False)

Training the model...
Making predictions...
Saving the submission...
