In [1]:
# !pip -q install git+https://github.com/eaedk/testing-zindi-package.git
# from zindi.user import Zindian
# USERNAME = "adetoromichael346@gmail.com" #@param {type : "string"}
# user = Zindian(username=USERNAME)
# user.select_a_challenge(reward='all', kind='competition', active='true')
# user.download_dataset(destination="dataset")

In [2]:
# dependencies
import re
import pandas as pd
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from catboost import Pool, CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.cluster import KMeans
# from collections import Counter
from sklearn.metrics import log_loss
from imblearn.over_sampling import SMOTE, RandomOverSampler

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Seed shared by the clustering step and the CatBoost models below.
random_seed = 101

# Competition files, read from the working directory.
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
extra = pd.read_csv('extra_data.csv')
sub = pd.read_csv('SampleSubmission.csv')

# Origin flag used to split the combined frame again after feature engineering.
# NOTE: the flag is inverted relative to its name: 0 marks training rows, 1 marks test rows.
train['train'] = 0
test['train'] = 1

# Stack both sets so every feature is computed over train and test together.
data = pd.concat([train, test], sort = False, ignore_index = True)

In [4]:
def process(data):
    """Engineer model features on the combined train/test frame.

    Steps, in order: derive time features from the two timestamp columns,
    frequency-encode ``USER_ID`` and ``month_day``, label-encode
    ``MERCHANT_NAME`` and ``IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY``, one-hot
    encode ``USER_GENDER`` and ``period``, add group-mean aggregates and an
    income-per-household ratio, drop unused columns, and finally attach a
    KMeans cluster id.

    Parameters
    ----------
    data : pandas.DataFrame
        Combined frame holding the raw columns referenced below. It is not
        modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered features and a ``cluster`` column.

    Notes
    -----
    Reads the module-level ``random_seed`` for the KMeans initialisation.
    """
    # Work on a copy so the caller's frame is left untouched.
    data = data.copy()

    time_cols = ['MERCHANT_CATEGORIZED_AT', 'PURCHASED_AT']
    for i, col in enumerate(time_cols):
        data[f'created_at_{i}'] = pd.to_datetime(data[col])
        data[f'hour_{i}'] = data[f'created_at_{i}'].dt.hour
        data[f'minute_{i}'] = data[f'created_at_{i}'].dt.minute
    # Index 0 is the categorisation timestamp, index 1 the purchase timestamp.
    data['hour'] = data.hour_0 - data.hour_1
    data['minute'] = data.minute_0 - data.minute_1
    # NOTE(review): `hour` is the difference between the two timestamps' hours,
    # not a clock hour -- confirm that binning it into day periods is intended.
    data['period'] = pd.cut(data['hour'], bins = [0, 4, 8, 12, 16, 20, 23], labels=['Midnight', 'Early_Morning', 'Late_Morning', 'Afternoon', 'Evening', 'Night'])
    data['month'] = data['created_at_0'].dt.month
    data['day'] = data['created_at_0'].dt.day
    data['dayofweek'] = data['created_at_0'].dt.dayofweek
    # Gap between categorisation and purchase, expressed in days.
    data['delta_period'] = (data['created_at_0'] - data['created_at_1']).dt.total_seconds()/(60*60*24)
    data['month_day'] = data['month'].astype(str) + '-' + data['day'].astype(str)
    # `columns=` instead of a positional axis: pandas >= 2.0 rejects `drop(labels, 1)`.
    data = data.drop(columns = ['MERCHANT_CATEGORIZED_AT', 'PURCHASED_AT', 'created_at_0', 'created_at_1', 'day'])

    # Frequency encoding: replace each value by its share of all rows.
    freq_coll = ['USER_ID', 'month_day']
    for col in freq_coll:
        data[col] = data[col].map(data.groupby(col).size() / len(data))

    le = LabelEncoder()
    LE_cols = ['MERCHANT_NAME', 'IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY']
    for le_col in LE_cols:
        data[le_col] = le.fit_transform(data[le_col])

    cols2dum = ['USER_GENDER', 'period']
    data = pd.get_dummies(data, prefix_sep = '_', columns = cols2dum)

    for agg in (['mean']):
    # add more aggregation parameters if you want
        # `month_day` is already frequency-encoded here, so rows are grouped by that share.
        data['date_by_name_' + agg] = data['MERCHANT_NAME'].groupby(data['month_day']).transform(agg)
        data['name_by_purchase_' + agg] = data['PURCHASE_VALUE'].groupby(data['MERCHANT_NAME']).transform(agg)
        data['name_by_income_' + agg] = data['USER_INCOME'].groupby(data['MERCHANT_NAME']).transform(agg)
    data['income_per_household'] = data.USER_INCOME / data.USER_HOUSEHOLD

    data = data.drop(columns = ['Transaction_ID', 'USER_AGE', 'period_Night'])
    # Cluster on every remaining column except the target.
    # NOTE(review): any origin-flag column present in `data` is clustered on too -- confirm intended.
    data_km = data.drop(columns = ['MERCHANT_CATEGORIZED_AS'])
    km = KMeans(n_clusters = 13, random_state = random_seed)
    data['cluster'] = km.fit_predict(data_km)
    return data

In [5]:
# Engineer features on the combined frame, then split it back by origin flag
# (0 = training rows, 1 = test rows).
data = process(data)
# `.copy()` makes each split an independent frame, so later column changes on
# `train` / `test` never act on a slice of `data` (no SettingWithCopyWarning).
train = data[data.train == 0].copy()
test = data[data.train == 1].copy()

In [6]:
# Remove the target (test only) and the origin flag. Reassigning instead of
# dropping in place avoids mutating a filtered slice, and `errors='ignore'`
# keeps the cell safe to re-run once the columns are already gone.
test = test.drop(columns = ['MERCHANT_CATEGORIZED_AS', 'train'], errors = 'ignore')
train = train.drop(columns = 'train', errors = 'ignore')
print('shape', train.shape, test.shape)

shape (373, 29) (558, 28)


In [17]:
# Original (non-resampled) target and feature matrix.
y_ = train["MERCHANT_CATEGORIZED_AS"]
X_ = train.drop(columns = 'MERCHANT_CATEGORIZED_AS')

# Class-balanced version of the same data, built by random oversampling.
# Alternative kept for reference:
# X, y = SMOTE(k_neighbors = 1, random_state = 1).fit_resample(X_, y_)
X, y = RandomOverSampler(random_state = 1).fit_resample(X_, y_)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .1, random_state = random_seed)

# Shapes of the original data (displayed as the cell output).
X_.shape, y_.shape

((373, 28), (373,))

In [8]:
# CatBoost model fitted on the full, non-resampled training data (X_, y_).
cb_model_ = CatBoostClassifier(
    loss_function = 'MultiClass',
    n_estimators = 2950,
    learning_rate = 0.01712339213540557,
    depth = 5,
    l2_leaf_reg = 9.441413522475084,
    bootstrap_type = 'MVS',
    leaf_estimation_iterations = 1,
    random_strength = 0.18095032711212016,
    random_state = random_seed,
    verbose = 0,
)
# Hold-out evaluation variant, kept for reference:
# cb_model_.fit(X_train, y_train, eval_set = [(X_test, y_test)], verbose = 0, early_stopping_rounds = 200)
# preds_ = cb_model_.predict_proba(X_test)
# print("Model training finished")
# log_loss(y_test, preds_).round(5)
cb_model_.fit(X_, y_)
print("Model training finished")

Model training finished


In [14]:
# feat_imp_df = pd.DataFrame(cb_model_.feature_importances_, columns = ['Importance'])
# feat_imp_df['Features'] = X_.columns
# feat_imp_df.sort_values(by = 'Importance', ascending = False)
# # print(feat_imp_df[feat_imp_df.Importance < 1].Features.to_list(), end = " ")

In [18]:
# 5-fold stratified cross-validation of a second CatBoost configuration on the
# oversampled data (X, y). The model from the last fold stays bound to
# `_cb_model_` after the loop.
# NOTE(review): X, y were oversampled before splitting, so duplicated rows can
# land in both the training and validation part of a fold, and the validation
# part also drives early stopping -- these scores are likely optimistic.
kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 1)
cat_scores = []
for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
    print(f"= Fold {fold + 1} =")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[valid_idx], y.iloc[valid_idx]

    _cb_model_ = CatBoostClassifier(
        loss_function = 'MultiClass',
        n_estimators = 1000,
        learning_rate = 0.1,
        depth = 7,
        bootstrap_type = 'Bayesian',
        leaf_estimation_iterations = 1,
        random_strength = 0.2,
        use_best_model = True,
        random_state = random_seed,
        verbose = 0,
    )
    _cb_model_.fit(X_train, y_train, eval_set = [(X_test, y_test)], verbose = 0, early_stopping_rounds = 200)

    preds_ = _cb_model_.predict_proba(X_test)
    # Score the fold once and reuse the value for both the log line and the list.
    fold_score = log_loss(y_test, preds_).round(5)
    print(fold_score)
    cat_scores.append(fold_score)

print('cat_score: ', np.mean(cat_scores))

= Fold 1 =
0.22323
= Fold 2 =
0.34167
= Fold 3 =
0.21406
= Fold 4 =
0.29925
= Fold 5 =
0.31891
cat_score:  0.279424


In [19]:
def predict_and_submit(test_, filename):
    """Write class probabilities to ``<filename>.csv`` in submission layout.

    Parameters
    ----------
    test_ : 2-D array
        Probability matrix indexed as ``test_[:, k]``; column ``k`` is written
        under the k-th class name listed below (columns 0 through 12 are read).
    filename : str
        Output path without the ``.csv`` extension.

    Returns
    -------
    tuple
        Shape of the frame that was written.

    Notes
    -----
    ``Transaction_ID`` is taken from the module-level ``sub`` frame.
    NOTE(review): this assumes the rows of ``test_`` are in the same order as
    ``sub`` and that the model's class order matches the list below -- confirm.
    """
    class_names = (
        'Bills & Fees', 'Data & WiFi', 'Education', 'Emergency fund',
        'Family & Friends', 'Going out', 'Groceries', 'Health',
        'Loan Repayment', 'Miscellaneous', 'Rent / Mortgage', 'Shopping',
        'Transport & Fuel',
    )

    columns = {"Transaction_ID": sub["Transaction_ID"]}
    for position, class_name in enumerate(class_names):
        columns[class_name] = test_[:, position]

    submission = pd.DataFrame(data = columns)
    submission = submission[["Transaction_ID", *class_names]]
    submission.to_csv(f'{filename}.csv', index = False)
    return submission.shape

In [23]:
# Class probabilities from the full-data model and from the last CV-fold model.
y_a = cb_model_.predict_proba(test)
y_b = _cb_model_.predict_proba(test)

# Equal-weight blend of the two probability matrices.
pred1 = 0.5 * y_a + 0.5 * y_b

# Writes `_Manager.csv`; the returned shape is shown as the cell output.
predict_and_submit(pred1, '_Manager')

(558, 14)