# **Big Data Intelligence Project - TMALL Repeat Buyers**
## Armando Fortes, David Pissarra, Gabriele Oliaro

### Imports

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from sklearn.decomposition import PCA
from lightgbm import LGBMClassifier
import xgboost as xgb
import numpy as np
import pandas as pd

### Constants and Hyperparameters

In [2]:
DATA_DIR_1 = '../data_format1/'
DATA_DIR_2 = '../data_format2/'
TRAIN_PATH = DATA_DIR_1 + 'train_format1.csv'
TEST_PATH = DATA_DIR_1 + 'test_format1.csv'
USER_INFO_PATH = DATA_DIR_1 + 'user_info_format1.csv'
USER_LOG_PATH = DATA_DIR_1 + 'user_log_format1.csv'

In [3]:
VALID_SET_SIZE = 0.2
PCA_COMPONENTS = 5
RANDOM_SEED = 42
EPSILON = 1e-10

## **Data Pre-Processing**

### Read CSVs

In [4]:
df_train = pd.read_csv(TRAIN_PATH)
df_user_info = pd.read_csv(USER_INFO_PATH)
df_user_log = pd.read_csv(USER_LOG_PATH)
df_test = pd.read_csv(TEST_PATH)
df_test.drop('prob', axis=1, inplace=True)

df_train['kind'] = 'train'
df_test['kind'] = 'test'
df = df_train.append(df_test)

### Optimize memory usage and Data Cleaning

In [5]:
print(f'{round(df_user_log.memory_usage().sum() / 2**30, 2)} GB')

2.86 GB


In [6]:
df_user_log['user_id'] = df_user_log['user_id'].astype(np.int32)
df_user_log['item_id'] = df_user_log['item_id'].astype(np.int32)
df_user_log['cat_id'] = df_user_log['cat_id'].astype(np.int16)
df_user_log['seller_id'] = df_user_log['seller_id'].astype(np.int16)
df_user_log.rename(columns={'seller_id' : 'merchant_id'}, inplace=True)
df_user_log['brand_id'].fillna(0, inplace=True)
df_user_log['brand_id'] = df_user_log['brand_id'].astype(np.int16)
df_user_log['time_stamp'] = (pd.to_datetime(df_user_log['time_stamp'], format='%m%d') - pd.to_datetime(df_user_log['time_stamp'].min(), format='%m%d')).dt.days
df_user_log['time_stamp'] = df_user_log['time_stamp'].astype(np.int16)
df_user_log['action_type'] = df_user_log['action_type'].astype(np.int8)

In [7]:
print(f'{round(df_user_log.memory_usage().sum() / 2**30, 2)} GB')

0.87 GB


In [8]:
df_user_info['age_range'].fillna(0, inplace=True)
df_user_info['gender'].fillna(2, inplace=True)
df_user_info['age_range'] = df_user_info['age_range'].astype(np.int8)
df_user_info['gender'] = df_user_info['gender'].astype(np.int8)

### Feature Engineering

In [9]:
users = df_user_log.groupby('user_id')
merchants = df_user_log.groupby('merchant_id')
users_merchants = df_user_log.groupby(['user_id', 'merchant_id'])

In [10]:
# transform age categorical features into different binary features
to_merge = pd.get_dummies(df_user_info, prefix='age', columns=['age_range'])
df = df.merge(to_merge, on='user_id', how='left')

# count total number of unique values from each feature for a given user 
to_merge = users.nunique().reset_index().rename(columns={
    'item_id': 'items', 
    'cat_id': 'categories',
    'merchant_id': 'merchants',
    'brand_id': 'brands',
    'time_stamp': 'dates',
    'action_type': 'action_types'
    })
df = df.merge(to_merge, on='user_id', how='left')

# count total number of unique values from each feature for a given user and merchant
to_merge = users_merchants.nunique().reset_index().rename(columns={
    'item_id': 'items_user_merchant', 
    'cat_id': 'categories_user_merchant',
    'brand_id': 'brands_user_merchant',
    'time_stamp': 'dates_user_merchant',
    'action_type': 'action_types_user_merchant'
    })
df = df.merge(to_merge, on=['user_id', 'merchant_id'], how='left')

# count total actions by type for a given user
to_merge = users['action_type'].value_counts().unstack(fill_value=0).rename(columns={
    0: 'clicks_user',
    1: 'carts_user',
    2: 'purchases_user',
    3: 'favourites_user'
    })
df = df.merge(to_merge, on='user_id', how='left')

# count total actions by type for a given merchant
to_merge = merchants['action_type'].value_counts().unstack(fill_value=0).rename(columns={
    0: 'clicks_merchant', 
    1: 'carts_merchant',
    2: 'purchases_merchant',
    3: 'favourites_merchant'
    })
df = df.merge(to_merge, on='merchant_id', how='left')

# count total actions by type for a given pair (user, merchant)
to_merge = users_merchants['action_type'].value_counts().unstack(fill_value=0).rename(columns={
    0: 'clicks_user_merchant',
    1: 'carts_user_merchant',
    2: 'purchases_user_merchant',
    3: 'favourites_user_merchant'
    })
df = df.merge(to_merge, on=['user_id', 'merchant_id'], how='left')

# ratio of actions in each merchant for a given user
df['clicks_in_merchant_ratio'] = df['clicks_user_merchant'] / (df['clicks_user'] + EPSILON)
df['carts_in_merchant_ratio'] = df['carts_user_merchant'] / (df['carts_user'] + EPSILON)
df['purchases_in_merchant_ratio'] = df['purchases_user_merchant'] / (df['purchases_user'] + EPSILON)
df['favourites_in_merchant_ratio'] = df['favourites_user_merchant'] / (df['favourites_user'] + EPSILON)

# ratio of each action type for a given pair (user, merchant)
df['clicks_ratio'] = df['clicks_user_merchant'] / (df['clicks_user_merchant'] + df['carts_user_merchant'] + df['purchases_user_merchant'] + df['favourites_user_merchant'] + EPSILON)
df['carts_ratio'] = df['carts_user_merchant'] / (df['clicks_user_merchant'] + df['carts_user_merchant'] + df['purchases_user_merchant'] + df['favourites_user_merchant'] + EPSILON)
df['purchases_ratio'] = df['purchases_user_merchant'] / (df['clicks_user_merchant'] + df['carts_user_merchant'] + df['purchases_user_merchant'] + df['favourites_user_merchant'] + EPSILON)
df['favourites_ratio'] = df['favourites_user_merchant'] / (df['clicks_user_merchant'] + df['carts_user_merchant'] + df['purchases_user_merchant'] + df['favourites_user_merchant'] + EPSILON)

# double11 features
double11_day = 184
to_merge = (df_user_log[df_user_log['time_stamp'] == double11_day]).reset_index(drop=True)
to_merge = to_merge[to_merge['action_type'] == 2].reset_index(drop=True).groupby('user_id').size().reset_index()
df = df.merge(to_merge, on='user_id', how='left').rename(columns={0: 'double11_purchases'})

df['double11_ratio'] = df['double11_purchases'] / df['purchases_user']

# interval features
to_merge = (users['time_stamp'].max() - users['time_stamp'].min()).rename('interval')
df = df.merge(to_merge, on='user_id', how='left')

### Max, mean, standard deviation and median on merchant-user actions (grouping by merchants)

In [11]:
clmns = ['clicks_user_merchant', 'carts_user_merchant', 'purchases_user_merchant', 'favourites_user_merchant']

to_merge = df.groupby('merchant_id')[clmns].max().rename(columns={
            'clicks_user_merchant': 'clicks_merchant_user_max',
            'carts_user_merchant': 'carts_merchant_user_max',
            'purchases_user_merchant': 'purchases_merchant_user_max',
            'favourites_user_merchant': 'favourites_merchant_user_max'
})
df = df.merge(to_merge, on='merchant_id', how='left')

to_merge = df.groupby('merchant_id')[clmns].mean().rename(columns={
            'clicks_user_merchant': 'clicks_merchant_user_mean',
            'carts_user_merchant': 'carts_merchant_user_mean',
            'purchases_user_merchant': 'purchases_merchant_user_mean',
            'favourites_user_merchant': 'favourites_merchant_user_mean'
})
df = df.merge(to_merge, on='merchant_id', how='left')

to_merge = df.groupby('merchant_id')[clmns].std().rename(columns={
            'clicks_user_merchant': 'clicks_merchant_user_std',
            'carts_user_merchant': 'carts_merchant_user_std',
            'purchases_user_merchant': 'purchases_merchant_user_std',
            'favourites_user_merchant': 'favourites_merchant_user_std'
}).fillna(0)
df = df.merge(to_merge, on='merchant_id', how='left')

to_merge = df.groupby('merchant_id')[clmns].median().rename(columns={
            'clicks_user_merchant': 'clicks_merchant_user_median',
            'carts_user_merchant': 'carts_merchant_user_median',
            'purchases_user_merchant': 'purchases_merchant_user_median',
            'favourites_user_merchant': 'favourites_merchant_user_median'
})
df = df.merge(to_merge, on='merchant_id', how='left')

### Max, mean, standard deviation and median on merchant-user actions (grouping by users)

In [12]:
to_merge = df.groupby('user_id')[clmns].max().rename(columns={
            'clicks_user_merchant': 'clicks_user_merchant_max',
            'carts_user_merchant': 'carts_user_merchant_max',
            'purchases_user_merchant': 'purchases_user_merchant_max',
            'favourites_user_merchant': 'favourites_user_merchant_max'
})
df = df.merge(to_merge, on='user_id', how='left')

to_merge = df.groupby('user_id')[clmns].mean().rename(columns={
            'clicks_user_merchant': 'clicks_user_merchant_mean',
            'carts_user_merchant': 'carts_user_merchant_mean',
            'purchases_user_merchant': 'purchases_user_merchant_mean',
            'favourites_user_merchant': 'favourites_user_merchant_mean'
})
df = df.merge(to_merge, on='user_id', how='left')

to_merge = df.groupby('user_id')[clmns].std().rename(columns={
            'clicks_user_merchant': 'clicks_user_merchant_std',
            'carts_user_merchant': 'carts_user_merchant_std',
            'purchases_user_merchant': 'purchases_user_merchant_std',
            'favourites_user_merchant': 'favourites_user_merchant_std'
}).fillna(0)
df = df.merge(to_merge, on='user_id', how='left')

to_merge = df.groupby('user_id')[clmns].median().rename(columns={
            'clicks_user_merchant': 'clicks_user_merchant_median',
            'carts_user_merchant': 'carts_user_merchant_median',
            'purchases_user_merchant': 'purchases_user_merchant_median',
            'favourites_user_merchant': 'favourites_user_merchant_median'
})
df = df.merge(to_merge, on='user_id', how='left')

### PCA Features

In [13]:
# PCA features
pca_df = df.drop(['kind', 'label'], axis=1)
pca = PCA(n_components=PCA_COMPONENTS)
pca.fit(pca_df)
df = df.join(pd.DataFrame(pca.transform(pca_df), index=pca_df.index).add_prefix('pca_'))

## Reorganizing and splitting data

In [14]:
df_train = df[df['kind'] == 'train'].drop('kind', axis=1)
df_test = df[df['kind'] == 'test'].drop(['kind', 'label'], axis=1)

X, y = df_train.drop(columns='label'), df_train['label']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=VALID_SET_SIZE, random_state=RANDOM_SEED)

In [15]:
X_train

Unnamed: 0,user_id,merchant_id,gender,age_0,age_1,age_2,age_3,age_4,age_5,age_6,...,favourites_user_merchant_std,clicks_user_merchant_median,carts_user_merchant_median,purchases_user_merchant_median,favourites_user_merchant_median,pca_0,pca_1,pca_2,pca_3,pca_4
13722,63912,542,0,1,0,0,0,0,0,0,...,0.0,7.0,0.0,1.0,0.0,-148433.345523,-50146.982692,-1261.093236,1851.176579,-86.175769
45886,199431,3844,1,1,0,0,0,0,0,0,...,0.0,11.0,0.0,1.0,0.0,-12852.218036,-41344.661873,-868.940036,-1418.404343,-119.627877
227878,37919,3335,0,1,0,0,0,0,0,0,...,0.0,3.0,0.0,1.0,0.0,-174416.754094,-48629.517763,-1131.962583,-933.832507,62.528662
196245,212162,120,0,0,0,0,0,0,1,0,...,0.0,3.0,0.0,1.0,0.0,-155.001757,-46423.406113,-782.319476,2317.582217,-14.655348
188746,380333,1203,0,0,0,0,0,0,1,0,...,0.0,6.0,0.0,1.0,0.0,168226.211050,-15674.419167,1370.096230,1397.988159,-648.984034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,295547,4655,0,0,0,0,1,0,0,0,...,0.0,34.0,0.0,1.0,0.0,83261.399934,-42019.828060,-1280.884078,-2252.894837,64.460273
103694,5809,4474,0,1,0,0,0,0,0,0,...,0.0,5.0,0.0,1.0,0.0,-206527.836909,-48683.463468,-1020.264432,-2064.441329,108.646286
131932,124933,2669,0,0,0,0,0,0,0,0,...,0.0,3.0,0.0,1.0,0.0,-87138.309848,-9526.412769,514.449475,-106.728900,-55.702251
146867,384049,3971,1,0,0,0,1,0,0,0,...,0.0,3.0,0.0,1.0,0.0,171852.108116,-29107.059212,566.845196,-1425.230217,246.570616


## **Models**

### Fetch best features
The xgboost object will train the model with all features, then the booster object (returned after training) can calculate which features best contribute for most information gain.

In [18]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
watchlist = [(dvalid, 'valid')]
params = {
    'max_depth': 7,
    'min_child_weight': 200, 
    'colsample_bytree': 0.8, 
    'subsample': 0.8, 
    'eta': 0.04,    
    'seed': RANDOM_SEED,
    'eval_metric': 'auc'
}
booster = xgb.train(params, dtrain, num_boost_round=2000, evals=watchlist, early_stopping_rounds=400)
best_features = pd.DataFrame(booster.get_score(importance_type='gain').items(), columns=['features', 'importance'])['features'].to_numpy()
opt_X_train, opt_X_valid = X_train[best_features], X_valid[best_features]

[0]	valid-auc:0.62959
[1]	valid-auc:0.64106
[2]	valid-auc:0.64511
[3]	valid-auc:0.64587
[4]	valid-auc:0.65013
[5]	valid-auc:0.65195
[6]	valid-auc:0.65367
[7]	valid-auc:0.65426
[8]	valid-auc:0.65437
[9]	valid-auc:0.65563
[10]	valid-auc:0.65667
[11]	valid-auc:0.65955
[12]	valid-auc:0.66207
[13]	valid-auc:0.66218
[14]	valid-auc:0.66278
[15]	valid-auc:0.66268
[16]	valid-auc:0.66330
[17]	valid-auc:0.66490
[18]	valid-auc:0.66568
[19]	valid-auc:0.66660
[20]	valid-auc:0.66716
[21]	valid-auc:0.66735
[22]	valid-auc:0.66797
[23]	valid-auc:0.66829
[24]	valid-auc:0.66912
[25]	valid-auc:0.66957
[26]	valid-auc:0.66977
[27]	valid-auc:0.66995
[28]	valid-auc:0.66994
[29]	valid-auc:0.67033
[30]	valid-auc:0.67032
[31]	valid-auc:0.67056
[32]	valid-auc:0.67094
[33]	valid-auc:0.67126
[34]	valid-auc:0.67171
[35]	valid-auc:0.67201
[36]	valid-auc:0.67254
[37]	valid-auc:0.67320
[38]	valid-auc:0.67319
[39]	valid-auc:0.67331
[40]	valid-auc:0.67326
[41]	valid-auc:0.67340
[42]	valid-auc:0.67342
[43]	valid-auc:0.6733

### Model Setup

In [19]:
# TODO hyperparameter tuning
models = {
    'RandomForestClassifier': RandomForestClassifier(
        oob_score=True, 
        n_estimators=1000, 
        max_depth=10, 
        max_features='sqrt',
        n_jobs=-1
    ),
    'CatBoostClassifier': CatBoostClassifier(
        depth=6,
        learning_rate=0.05,
        iterations=1200,
        eval_metric='AUC',
        random_state=RANDOM_SEED,
        thread_count=8,
        silent=True
    ),
    'LGBMClassifier': LGBMClassifier(
        n_estimators=2000,
        max_depth=8,
        num_leaves=50,
        learning_rate=0.03,
        reg_lambda=1,
        objective='binary',
        metric=['auc'],
        random_state=RANDOM_SEED,
        n_jobs=-1
    ),
    'XGBClassifier': xgb.XGBClassifier(
        max_depth=7,
        n_estimators=1000,
        min_child_weight=200,
        colsample_bytree=0.8,
        subsample=0.8,
        eta=0.04,
        use_label_encoder=False,
        seed=RANDOM_SEED
    )
}

### Training Parameters (only including the best features)

In [20]:
params = {
    'RandomForestClassifier': {}, 
    'CatBoostClassifier': {},
    'LGBMClassifier': {},
    'XGBClassifier': {
        'eval_metric': 'auc',
        'eval_set': [(opt_X_train.to_numpy(), y_train.to_numpy()), (opt_X_valid.to_numpy(), y_valid.to_numpy())],
        'early_stopping_rounds': 400,
        'verbose': False
    }
}

### Training

In [21]:
print('='*35 + ' ROC AUC Scores ' + '='*35)
for name, model in models.items():

    # cv = RepeatedStratifiedKFold(n_splits=SPLITS, n_repeats=REPEATS, random_state=0)
    # scores = cross_validate(model, X_train, y_train, scoring='roc_auc', cv=cv, return_estimator=True, n_jobs=-1)
    model.fit(opt_X_train.to_numpy(), y_train.to_numpy(), **params[name])
    predictions = model.predict_proba(opt_X_valid.to_numpy())[:,1]
    roc_auc = roc_auc_score(y_valid, predictions)

    print('% ' + name + ': ' + '{0:.4f}'.format(roc_auc))

% RandomForestClassifier: 0.6660
% CatBoostClassifier: 0.6850
% LGBMClassifier: 0.6728
% XGBClassifier: 0.6899


### Submission

In [24]:
prob_submission = model.predict_proba(df_test[best_features].to_numpy())[:,1]
submission = df_test.iloc[:,:2].join(pd.DataFrame(prob_submission, index=df_test.index).rename(columns={0:'prob'}))
submission.to_csv('submission.csv', index=False)