# Big Data Intelligence Project
## TMall Repeat Buyers Prediction

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
import pandas as pd
import xgboost as xgb

### Data Pre-Processing

#### Read CSVs

In [2]:
df_train = pd.read_csv('data_format1/train_format1.csv')

df_test = pd.read_csv('data_format1/test_format1.csv')
df_test.drop('prob', axis=1, inplace=True)

df_user_info = pd.read_csv('data_format1/user_info_format1.csv')

In [3]:
df_user_log = pd.read_csv('data_format1/user_log_format1.csv')

#### Optimize memory usage and Data Cleaning

In [4]:
print(f'{round(df_user_log.memory_usage().sum() / 2**30, 2)} GB')

2.86 GB


In [5]:
df_user_log['user_id'] = df_user_log['user_id'].astype(np.int32)
df_user_log['item_id'] = df_user_log['item_id'].astype(np.int32)
df_user_log['cat_id'] = df_user_log['cat_id'].astype(np.int16)
df_user_log['seller_id'] = df_user_log['seller_id'].astype(np.int16)
df_user_log.rename(columns={'seller_id' : 'merchant_id'}, inplace=True)
df_user_log['brand_id'].fillna(0, inplace=True)
df_user_log['brand_id'] = df_user_log['brand_id'].astype(np.int16)
#df_user_log['time_stamp'] = (pd.to_datetime(df_user_log['time_stamp'], format='%m%d') - pd.to_datetime(df_user_log['time_stamp'].min(), format='%m%d')).dt.days
df_user_log['time_stamp'] = df_user_log['time_stamp'].astype(np.int16)
df_user_log['action_type'] = df_user_log['action_type'].astype(np.int8)

In [6]:
print(f'{round(df_user_log.memory_usage().sum() / 2**30, 2)} GB')

0.87 GB


In [7]:
df_user_info['age_range'].fillna(0, inplace=True)
df_user_info['gender'].fillna(2, inplace=True)
df_user_info['age_range'] = df_user_info['age_range'].astype(np.int8)
df_user_info['gender'] = df_user_info['gender'].astype(np.int8)

Add features

In [8]:
users = df_user_log.groupby('user_id')
users_merchants = df_user_log.groupby(['user_id', 'merchant_id'])

In [9]:
df_train = df_train.merge(pd.get_dummies(df_user_info, prefix='age', columns=['age_range']), on='user_id')

count_unique = df_user_log.groupby('user_id').nunique().reset_index().rename(columns={
    'item_id': 'items', 
    'cat_id': 'categories',
    'merchant_id': 'merchants',
    'brand_id': 'brands',
    'time_stamp': 'dates',
    'action_type': 'action_types'
    })
df_train = df_train.merge(count_unique)

actions_user = users['action_type'].value_counts().unstack(fill_value=0).rename(columns={
    0: 'clicks_user',
    1: 'carts_user',
    2: 'purchases_user',
    3: 'favourites_user'
    })
df_train = df_train.merge(actions_user, on='user_id')

actions_merchant = users_merchants['action_type'].value_counts().unstack(fill_value=0).rename(columns={
    0: 'clicks_merchant', 
    1: 'carts_merchant',
    2: 'purchases_merchant',
    3: 'favourites_merchant'
    })
df_train = df_train.merge(actions_merchant, on=['user_id', 'merchant_id'])

df_train['clicks_ratio'] = df_train['clicks_merchant'] / (df_train['purchases_merchant'] + df_train['clicks_merchant'] + df_train['carts_merchant'] + df_train['favourites_merchant'])

In [56]:
X, y = df_train.drop(columns='label'), df_train['label']

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [58]:
X_train

Unnamed: 0,user_id,merchant_id,gender,age_0,age_1,age_2,age_3,age_4,age_5,age_6,...,carts_merchant,purchases_merchant,favourites_merchant,clicks_ratio,items,categories,merchants,brands,dates,action_types
197883,123335,3392,1,0,0,0,1,0,0,0,...,0,1,0,0.750000,10,5,5,6,6,2
169721,72820,474,1,0,0,0,0,1,0,0,...,0,1,0,0.666667,104,34,36,37,17,3
149495,233785,4269,1,0,0,0,1,0,0,0,...,0,1,0,0.000000,2,2,2,2,2,2
246223,415316,3734,1,0,0,0,1,0,0,0,...,0,1,1,0.600000,14,6,12,11,5,3
61563,3381,4287,2,1,0,0,0,0,0,0,...,0,1,0,0.500000,14,7,14,13,5,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,295547,4655,0,0,0,0,1,0,0,0,...,0,1,0,0.971429,64,27,21,20,15,3
103694,5809,4474,0,1,0,0,0,0,0,0,...,0,1,0,0.833333,502,62,189,187,61,3
131932,124933,2669,0,0,0,0,0,0,0,0,...,0,1,0,0.500000,64,23,44,44,25,2
146867,384049,3971,1,0,0,0,1,0,0,0,...,0,1,0,0.750000,58,20,43,35,20,3


In [59]:
rfc = RandomForestClassifier(
    oob_score=True, 
    n_jobs=-1, 
    n_estimators=1000, 
    max_depth=10, 
    max_features='sqrt')

rfc.fit(X_train.to_numpy(), y_train.to_numpy())

RandomForestClassifier(max_depth=10, max_features='sqrt', n_estimators=1000,
                       n_jobs=-1, oob_score=True)

In [60]:
prob = rfc.predict_proba(X_test.to_numpy())[:,1]

In [61]:
round(metrics.roc_auc_score(y_test, prob), 3)

0.627

In [62]:
model_xgb = xgb.XGBClassifier(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42,
    use_label_encoder=False
)

model_xgb.fit(
    X_train.to_numpy(), 
    y_train.to_numpy(),
    eval_metric='auc',
    eval_set=[(X_train.to_numpy(), y_train.to_numpy()), (X_test.to_numpy(), y_test.to_numpy())],
    verbose=True,
    early_stopping_rounds=10
)

[0]	validation_0-auc:0.61234	validation_1-auc:0.61318
[1]	validation_0-auc:0.62305	validation_1-auc:0.61519
[2]	validation_0-auc:0.62914	validation_1-auc:0.61839
[3]	validation_0-auc:0.63389	validation_1-auc:0.61930
[4]	validation_0-auc:0.63537	validation_1-auc:0.62090
[5]	validation_0-auc:0.63661	validation_1-auc:0.62195
[6]	validation_0-auc:0.63733	validation_1-auc:0.62305
[7]	validation_0-auc:0.63814	validation_1-auc:0.62332
[8]	validation_0-auc:0.63850	validation_1-auc:0.62359
[9]	validation_0-auc:0.63891	validation_1-auc:0.62421
[10]	validation_0-auc:0.63957	validation_1-auc:0.62546
[11]	validation_0-auc:0.64237	validation_1-auc:0.62882
[12]	validation_0-auc:0.64268	validation_1-auc:0.62863
[13]	validation_0-auc:0.64484	validation_1-auc:0.63010
[14]	validation_0-auc:0.64567	validation_1-auc:0.63029
[15]	validation_0-auc:0.64691	validation_1-auc:0.63073
[16]	validation_0-auc:0.64883	validation_1-auc:0.63309
[17]	validation_0-auc:0.64988	validation_1-auc:0.63352
[18]	validation_0-au

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8,
              enable_categorical=False, eta=0.3, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=8,
              min_child_weight=300, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=4, num_parallel_tree=1,
              predictor='auto', random_state=42, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, seed=42, subsample=0.8, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, ...)

In [63]:
prob = model_xgb.predict_proba(X_test.to_numpy())[:,1]

In [64]:
round(metrics.roc_auc_score(y_test, prob), 3)

0.64