# **Big Data Intelligence Project - TMALL Repeat Buyers**
### Armando Fortes, David Pissarra, Gabriele Oliaro

#### Imports

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from sklearn.decomposition import PCA
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import numpy as np
import pandas as pd

#### Constants and Hyperparameters

In [5]:
DATA_DIR_1 = '../data_format1/'
DATA_DIR_2 = '../data_format2/'
TRAIN_PATH = DATA_DIR_1 + 'train_format1.csv'
TEST_PATH = DATA_DIR_1 + 'test_format1.csv'
USER_INFO_PATH = DATA_DIR_1 + 'user_info_format1.csv'
USER_LOG_PATH = DATA_DIR_1 + 'user_log_format1.csv'

In [6]:
VALID_SET_SIZE = 0.2
PCA_COMPONENTS = 5
RANDOM_SEED = 42
EPSILON = 1e-10

### **Data Pre-Processing**

#### Read CSVs

In [8]:
df_train = pd.read_csv(TRAIN_PATH)
df_user_info = pd.read_csv(USER_INFO_PATH)
df_user_log = pd.read_csv(USER_LOG_PATH)
df_test = pd.read_csv(TEST_PATH)
df_test.drop('prob', axis=1, inplace=True)

#### Optimize memory usage and Data Cleaning

In [9]:
print(f'{round(df_user_log.memory_usage().sum() / 2**30, 2)} GB')

2.86 GB


In [10]:
df_user_log['user_id'] = df_user_log['user_id'].astype(np.int32)
df_user_log['item_id'] = df_user_log['item_id'].astype(np.int32)
df_user_log['cat_id'] = df_user_log['cat_id'].astype(np.int16)
df_user_log['seller_id'] = df_user_log['seller_id'].astype(np.int16)
df_user_log.rename(columns={'seller_id' : 'merchant_id'}, inplace=True)
df_user_log['brand_id'].fillna(0, inplace=True)
df_user_log['brand_id'] = df_user_log['brand_id'].astype(np.int16)
df_user_log['time_stamp'] = (pd.to_datetime(df_user_log['time_stamp'], format='%m%d') - pd.to_datetime(df_user_log['time_stamp'].min(), format='%m%d')).dt.days
df_user_log['time_stamp'] = df_user_log['time_stamp'].astype(np.int16)
df_user_log['action_type'] = df_user_log['action_type'].astype(np.int8)

In [11]:
print(f'{round(df_user_log.memory_usage().sum() / 2**30, 2)} GB')

0.87 GB


In [12]:
df_user_info['age_range'].fillna(0, inplace=True)
df_user_info['gender'].fillna(2, inplace=True)
df_user_info['age_range'] = df_user_info['age_range'].astype(np.int8)
df_user_info['gender'] = df_user_info['gender'].astype(np.int8)

#### Feature Engineering

In [13]:
users = df_user_log.groupby('user_id')
merchants = df_user_log.groupby('merchant_id')
users_merchants = df_user_log.groupby(['user_id', 'merchant_id'])

In [14]:
# transform age categorical features into different binary features
to_merge = pd.get_dummies(df_user_info, prefix='age', columns=['age_range'])
df_train = df_train.merge(to_merge, on='user_id')
df_test = df_test.merge(to_merge, on='user_id', how='left')

# count total number of unique values from each feature for a given user 
to_merge = users.nunique().reset_index().rename(columns={
    'item_id': 'items', 
    'cat_id': 'categories',
    'merchant_id': 'merchants',
    'brand_id': 'brands',
    'time_stamp': 'dates',
    'action_type': 'action_types'
    })
df_train = df_train.merge(to_merge, on='user_id')
df_test = df_test.merge(to_merge, on='user_id', how='left')

# count total number of unique values from each feature for a given user and merchant
to_merge = users_merchants.nunique().reset_index().rename(columns={
    'item_id': 'items_user_merchant', 
    'cat_id': 'categories_user_merchant',
    'brand_id': 'brands_user_merchant',
    'time_stamp': 'dates_user_merchant',
    'action_type': 'action_types_user_merchant'
    })
df_train = df_train.merge(to_merge, on=['user_id', 'merchant_id'])
df_test = df_test.merge(to_merge, on=['user_id', 'merchant_id'], how='left')

# count total actions by type for a given user
to_merge = users['action_type'].value_counts().unstack(fill_value=0).rename(columns={
    0: 'clicks_user',
    1: 'carts_user',
    2: 'purchases_user',
    3: 'favourites_user'
    })
df_train = df_train.merge(to_merge, on='user_id')
df_test = df_test.merge(to_merge, on='user_id', how='left')

# count total actions by type for a given merchant
to_merge = merchants['action_type'].value_counts().unstack(fill_value=0).rename(columns={
    0: 'clicks_merchant', 
    1: 'carts_merchant',
    2: 'purchases_merchant',
    3: 'favourites_merchant'
    })
df_train = df_train.merge(to_merge, on='merchant_id')
df_test = df_test.merge(to_merge, on='merchant_id', how='left')

# count total actions by type for a given pair (user, merchant)
to_merge = users_merchants['action_type'].value_counts().unstack(fill_value=0).rename(columns={
    0: 'clicks_user_merchant',
    1: 'carts_user_merchant',
    2: 'purchases_user_merchant',
    3: 'favourites_user_merchant'
    })
df_train = df_train.merge(to_merge, on=['user_id', 'merchant_id'])
df_test = df_test.merge(to_merge, on=['user_id', 'merchant_id'], how='left')

# ratio of actions in each merchant for a given user
df_train['clicks_in_merchant_ratio'] = df_train['clicks_user_merchant'] / (df_train['clicks_user'] + EPSILON)
df_train['carts_in_merchant_ratio'] = df_train['carts_user_merchant'] / (df_train['carts_user'] + EPSILON)
df_train['purchases_in_merchant_ratio'] = df_train['purchases_user_merchant'] / (df_train['purchases_user'] + EPSILON)
df_train['favourites_in_merchant_ratio'] = df_train['favourites_user_merchant'] / (df_train['favourites_user'] + EPSILON)

df_test['clicks_in_merchant_ratio'] = df_test['clicks_user_merchant'] / (df_test['clicks_user'] + EPSILON)
df_test['carts_in_merchant_ratio'] = df_test['carts_user_merchant'] / (df_test['carts_user'] + EPSILON)
df_test['purchases_in_merchant_ratio'] = df_test['purchases_user_merchant'] / (df_test['purchases_user'] + EPSILON)
df_test['favourites_in_merchant_ratio'] = df_test['favourites_user_merchant'] / (df_test['favourites_user'] + EPSILON)

# ratio of each action type for a given pair (user, merchant)
df_train['clicks_ratio'] = df_train['clicks_user_merchant'] / (df_train['clicks_user_merchant'] + df_train['carts_user_merchant'] + df_train['purchases_user_merchant'] + df_train['favourites_user_merchant'] + EPSILON)
df_train['carts_ratio'] = df_train['carts_user_merchant'] / (df_train['clicks_user_merchant'] + df_train['carts_user_merchant'] + df_train['purchases_user_merchant'] + df_train['favourites_user_merchant'] + EPSILON)
df_train['purchases_ratio'] = df_train['purchases_user_merchant'] / (df_train['clicks_user_merchant'] + df_train['carts_user_merchant'] + df_train['purchases_user_merchant'] + df_train['favourites_user_merchant'] + EPSILON)
df_train['favourites_ratio'] = df_train['favourites_user_merchant'] / (df_train['clicks_user_merchant'] + df_train['carts_user_merchant'] + df_train['purchases_user_merchant'] + df_train['favourites_user_merchant'] + EPSILON)

df_test['clicks_ratio'] = df_test['clicks_user_merchant'] / (df_test['clicks_user_merchant'] + df_test['carts_user_merchant'] + df_test['purchases_user_merchant'] + df_test['favourites_user_merchant'] + EPSILON)
df_test['carts_ratio'] = df_test['carts_user_merchant'] / (df_test['clicks_user_merchant'] + df_test['carts_user_merchant'] + df_test['purchases_user_merchant'] + df_test['favourites_user_merchant'] + EPSILON)
df_test['purchases_ratio'] = df_test['purchases_user_merchant'] / (df_test['clicks_user_merchant'] + df_test['carts_user_merchant'] + df_test['purchases_user_merchant'] + df_test['favourites_user_merchant'] + EPSILON)
df_test['favourites_ratio'] = df_test['favourites_user_merchant'] / (df_test['clicks_user_merchant'] + df_test['carts_user_merchant'] + df_test['purchases_user_merchant'] + df_test['favourites_user_merchant'] + EPSILON)

# double11 features
double11_day = 184
to_merge = (df_user_log[df_user_log['time_stamp'] == double11_day]).reset_index(drop=True)
to_merge = to_merge[to_merge['action_type'] == 2].reset_index(drop=True).groupby('user_id').size().reset_index()
df_train = df_train.merge(to_merge, on='user_id').rename(columns={0: 'double11_purchases'})
df_test = df_test.merge(to_merge, on='user_id', how='left').rename(columns={0: 'double11_purchases'})

df_train['double11_ratio'] = df_train['double11_purchases'] / df_train['purchases_user']
df_test['double11_ratio'] = df_test['double11_purchases'] / df_test['purchases_user']

# interval features
to_merge = (users['time_stamp'].max() - users['time_stamp'].min()).rename('interval')
df_train = df_train.merge(to_merge, on='user_id')
df_test = df_test.merge(to_merge, on='user_id', how='left')

# PCA features
pca = PCA(n_components=PCA_COMPONENTS)
pca.fit(df_train)
df_train = df_train.join(pd.DataFrame(pca.transform(df_train), index=df_train.index).add_prefix('pca_'))
pca.fit(df_test)
df_test = df_test.join(pd.DataFrame(pca.transform(df_test), index=df_test.index).add_prefix('pca_'))

In [15]:
X, y = df_train.drop(columns='label'), df_train['label']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=VALID_SET_SIZE, random_state=RANDOM_SEED)

In [19]:
X_train

Unnamed: 0,user_id,merchant_id,gender,age_0,age_1,age_2,age_3,age_4,age_5,age_6,...,purchases_ratio,favourites_ratio,double11_purchases,double11_ratio,interval,pca_0,pca_1,pca_2,pca_3,pca_4
13722,368333,2935,0,0,0,0,1,0,0,0,...,0.142857,0.00,1,0.500000,58,156439.956117,-42505.868683,-468.369732,-480.790671,-176.439013
45886,267410,152,1,0,0,0,1,0,0,0,...,0.250000,0.25,3,0.375000,180,55516.631360,-45647.665589,-1068.700125,2266.355466,54.199913
227878,360195,1677,0,1,0,0,0,0,0,0,...,0.333333,0.00,2,0.666667,72,148302.452404,-36601.342771,-997.071988,751.009307,-168.929978
196245,230560,2834,0,0,0,0,1,0,0,0,...,0.058824,0.00,4,0.210526,184,18667.780652,-33108.230253,-1696.963834,-448.509190,258.359744
188746,190854,1999,1,0,0,0,1,0,0,0,...,1.000000,0.00,1,0.500000,164,-21038.891894,-40576.435023,-412.777760,460.277370,-214.791120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,382405,4296,0,0,0,0,0,1,0,0,...,0.200000,0.00,1,0.142857,146,170512.621746,-35259.966705,-1489.472895,-1914.820720,-706.357871
103694,95544,1425,2,1,0,0,0,0,0,0,...,1.000000,0.00,1,0.200000,174,-116349.474203,-47091.192018,-1204.745274,975.049936,62.860210
131932,160416,4648,0,1,0,0,0,0,0,0,...,0.200000,0.00,1,0.500000,108,-51476.937093,-41601.644543,746.955579,-2110.414263,175.976621
146867,370774,798,0,0,0,0,0,1,0,0,...,0.142857,0.00,8,0.444444,176,158888.536443,44361.872124,3658.437193,2060.291578,48.875608


### **Models**

#### Model Setup

In [20]:
# TODO hyperparameter tuning
models = {
    'RandomForestClassifier': RandomForestClassifier(
        oob_score=True, 
        n_estimators=1000, 
        max_depth=10, 
        max_features='sqrt',
        n_jobs=-1
    ),
    'CatBoostClassifier': CatBoostClassifier(
        depth=6,
        learning_rate=0.05,
        iterations=1200,
        eval_metric='AUC',
        random_state=RANDOM_SEED,
        thread_count=8,
        silent=True
    ),
    'LGBMClassifier': LGBMClassifier(
        n_estimators=2000,
        max_depth=8,
        num_leaves=50,
        learning_rate=0.03,
        reg_lambda=1,
        objective='binary',
        metric=['auc'],
        random_state=RANDOM_SEED,
        n_jobs=-1
    ),
    'XGBClassifier': XGBClassifier(
        max_depth=7,
        n_estimators=1000,
        min_child_weight=200,
        colsample_bytree=0.8,
        subsample=0.8,
        eta=0.04,
        use_label_encoder=False,
        seed=RANDOM_SEED
    )
}

#### Training Parameters

In [21]:
params = {
    'RandomForestClassifier': {}, 
    'CatBoostClassifier': {},
    'LGBMClassifier': {},
    'XGBClassifier': {
        'eval_metric': 'auc',
        'eval_set': [(X_train.to_numpy(), y_train.to_numpy()), (X_valid.to_numpy(), y_valid.to_numpy())],
        'early_stopping_rounds': 400,
        'verbose': False
    }
}

#### Training

In [22]:
print('='*35 + ' ROC AUC Scores ' + '='*35)
for name, model in models.items():

    # cv = RepeatedStratifiedKFold(n_splits=SPLITS, n_repeats=REPEATS, random_state=0)
    # scores = cross_validate(model, X_train, y_train, scoring='roc_auc', cv=cv, return_estimator=True, n_jobs=-1)
    model.fit(X_train.to_numpy(), y_train.to_numpy(), **params[name])
    predictions = model.predict_proba(X_valid.to_numpy())[:,1]
    roc_auc = roc_auc_score(y_valid, predictions)

    print('% ' + name + ': ' + '{0:.4f}'.format(roc_auc))

% RandomForestClassifier: 0.6559
% CatBoostClassifier: 0.6769
% LGBMClassifier: 0.6669
% XGBClassifier: 0.6845


#### Submission

In [None]:
# prob_submission = model_xgb.predict_proba(df_test.to_numpy())[:,1]
# submission = df_test.iloc[:,:2].join(pd.DataFrame(prob_submission, index=df_test.index).rename(columns={0:'prob'}))
# submission.to_csv('submission.csv', index=False)