In [1]:
# !pip -q install git+https://github.com/eaedk/testing-zindi-package.git
# from zindi.user import Zindian
# USERNAME = "adetoromichael346@gmail.com" #@param {type : "string"}
# user = Zindian(username=USERNAME)
# user.select_a_challenge(reward='all', kind='competition', active='true')
# user.download_dataset(destination="dataset")

In [270]:
# dependencies
import re
import pandas as pd
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from catboost import Pool, CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier
from sklearn.cluster import KMeans
from collections import Counter
from sklearn.metrics import log_loss
from imblearn.over_sampling import SMOTE
# from imblearn.combine import SMOTEENN

import warnings
warnings.filterwarnings('ignore')

In [271]:
random_seed = 42  # random seed for all computations

# Read the four competition CSVs (same order as before: train, test, extra, sample submission).
# NOTE(review): `extra` is not referenced by any later cell visible here.
Train, Test, extra, sub = (
    pd.read_csv(csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'extra_data.csv', 'SampleSubmission.csv')
)

# Stack Train on top of Test under a fresh 0..n-1 index so the feature engineering
# runs once over both; the rows are separated again later by whether
# MERCHANT_CATEGORIZED_AS is null.
data = pd.concat([Train, Test], sort=False, ignore_index=True)

In [272]:
# --- Calendar features derived from the purchase timestamp ---
data['PURCHASED_AT'] = pd.to_datetime(data['PURCHASED_AT'])
data['MERCHANT_CATEGORIZED_AT'] = pd.to_datetime(data['MERCHANT_CATEGORIZED_AT'])
data['year'] = data['PURCHASED_AT'].dt.year
data['month'] = data['PURCHASED_AT'].dt.month
data['day'] = data['PURCHASED_AT'].dt.day
data['hour'] = data['PURCHASED_AT'].dt.hour
data['minute'] = data['PURCHASED_AT'].dt.minute
data['second'] = data['PURCHASED_AT'].dt.second
data['dayofweek'] = data['PURCHASED_AT'].dt.dayofweek
data['ismonthend'] = data['PURCHASED_AT'].dt.is_month_end
data['ismonthstart'] = data['PURCHASED_AT'].dt.is_month_start

# Part of day: [0, 5] Midnight, (5, 11] Morning, (11, 17] Afternoon, (17, 23] Evening.
# pd.cut builds right-closed intervals by default, so without include_lowest=True
# the first bin is (0, 5] and every purchase made at hour 0 gets NaN instead of
# 'Midnight'.
data['period'] = pd.cut(
    data['hour'],
    bins=[0.0, 5.0, 11.0, 17.0, 23.0],
    labels=['Midnight', 'Morning', 'Afternoon', 'Evening'],
    include_lowest=True,
)

# "<month>-<day>" string key (label-encoded later in this cell).
data['month_day'] = data['month'].astype(str) + '-' + data['day'].astype(str)
# data['delta_period'] = (data['PURCHASED_AT'] - data['MERCHANT_CATEGORIZED_AT']).dt.total_seconds()/(60*60*24)

# Frequency encoding: replace each value with the fraction of all rows
# (train + test) that carry that value.
freq_coll = ['MERCHANT_NAME']
for freq_col in freq_coll:
    share_of_rows = data.groupby(freq_col).size() / len(data)
    data[freq_col] = data[freq_col].map(share_of_rows)

# Integer label encoding. A single encoder object is refit for every column,
# so after the loop `le` only remembers the classes of the last column.
LE_cols = ['USER_ID', 'IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY', 'month_day']
le = LabelEncoder()
for label_col in LE_cols:
    encoded_values = le.fit_transform(data[label_col])
    data[label_col] = encoded_values

# One-hot encode the remaining low-cardinality columns ("<column>_<value>" names).
cols2dum = ['period', 'ismonthend', 'ismonthstart', 'USER_GENDER']
data = pd.get_dummies(data, columns=cols2dum, prefix_sep='_')

# --- Group-level aggregate features ---
# At this point MERCHANT_NAME holds its row-frequency and USER_ID its integer
# label (both encoded earlier in this cell), so the group means below operate
# on those encoded values.
# data['_date_by_purchase'] = data['PURCHASE_VALUE'].groupby(data['month_day']).transform('mean')
data['income_by_name'] = data['MERCHANT_NAME'].groupby(data['USER_ID']).transform('mean')
data['name_by_purchase'] = data['PURCHASE_VALUE'].groupby(data['MERCHANT_NAME']).transform('mean')
data['name_by_income'] = data['USER_INCOME'].groupby(data['MERCHANT_NAME']).transform('mean')
data['income_per_household'] = data.USER_INCOME / data.USER_HOUSEHOLD

# Drop identifiers, raw timestamps and columns already folded into other features.
# `columns=` is used because passing the axis positionally (`drop(labels, 1)`)
# is rejected by pandas >= 2.0.
data = data.drop(columns=['Transaction_ID', 'USER_AGE', 'MERCHANT_CATEGORIZED_AT',
                          'PURCHASED_AT', 'USER_HOUSEHOLD', 'day'])

# --- Unsupervised cluster id as an extra feature ---
# Fit k-means on every column except the target and store each row's cluster.
# n_init is pinned to 10 (the long-standing default) so the clusters do not
# silently change on scikit-learn releases where the default became 'auto'.
# NOTE(review): features are not scaled before clustering, so large-magnitude
# columns will dominate the distances; confirm this is intended.
data_km = data.drop(columns=['MERCHANT_CATEGORIZED_AS'])
km = KMeans(n_clusters=13, n_init=10, random_state=random_seed)
data['cluster'] = km.fit_predict(data_km)

In [273]:
# data.MERCHANT_CATEGORIZED_AS = le.fit_transform(data.MERCHANT_CATEGORIZED_AS)
# data.corr()['MERCHANT_CATEGORIZED_AS'][:]

In [274]:
# Split the combined frame back apart: rows that have a MERCHANT_CATEGORIZED_AS
# value become `train`, rows where it is missing become `test` (target column removed).
has_label = data.MERCHANT_CATEGORIZED_AS.notnull()
train = data[has_label].reset_index(drop=True)
test = (
    data[~has_label]
    .reset_index(drop=True)
    .drop('MERCHANT_CATEGORIZED_AS', axis=1)
)
print('shape', train.shape, test.shape)

shape (373, 28) (558, 27)


In [275]:
# Target vector and feature matrix built from the training rows.
y = train["MERCHANT_CATEGORIZED_AS"]
X = train.drop(columns=["MERCHANT_CATEGORIZED_AS"])

In [276]:
# Hyper-parameters for the multi-class CatBoost model fitted on all of X / y.
# NOTE(review): `cb_model` is not referenced by any later cell visible here;
# the submission cell predicts with `model_` instead. Confirm that is intended.
cb_params = dict(
    loss_function='MultiClass',
    n_estimators=3167,
    learning_rate=0.01772339213540557,
    depth=7,
    l2_leaf_reg=9.441413522475084,
    bootstrap_type='Bayesian',
    leaf_estimation_iterations=1,
    random_strength=0.17095032711212016,
    verbose=0,
    random_state=random_seed,
)
cb_model = CatBoostClassifier(**cb_params)
cb_model.fit(X, y)
print("training finished")

<catboost.core.CatBoostClassifier at 0x2356f104dc0>

In [282]:
# Name of the label column and the columns that must never be fed to a model.
TARGET_COL = 'MERCHANT_CATEGORIZED_AS'
remove_features = [TARGET_COL, 'folds']

# Every remaining column of `train`, in its existing order, is a model feature.
features_columns = list(train.columns[~train.columns.isin(remove_features)])

# Integer codes for the target classes, in the current row order of `train`
# (this refits the shared encoder `le` on the target).
cat_ = le.fit_transform(train[TARGET_COL])
def create_folds(data, target_col='MERCHANT_CATEGORIZED_AS', n_splits=5, random_state=42):
    """Return a shuffled copy of ``data`` with a stratified ``folds`` column.

    The target classes are integer-encoded, the codes are grouped into
    ``floor(1 + log2(n_rows))`` equal-width bins, and ``StratifiedKFold`` is
    run on those bins. Each row receives the index (``0 .. n_splits - 1``) of
    the fold in which it is held out for validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Labelled rows; must contain ``target_col``. The caller's frame is
        left unmodified.
    target_col : str, default 'MERCHANT_CATEGORIZED_AS'
        Column holding the class labels used for stratification.
    n_splits : int, default 5
        Number of folds.
    random_state : int or None, default 42
        Seed for the row shuffle (42 matches the notebook's ``random_seed``).
        Pass ``None`` for a different shuffle on every call.

    Returns
    -------
    pandas.DataFrame
        Shuffled rows with a fresh ``0..n-1`` index and an integer ``folds``
        column.
    """
    # Shuffle first; sample() returns a new frame, so the input is not mutated.
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    data["folds"] = -1

    # Encode the labels AFTER shuffling so every code belongs to the row it
    # sits next to. Encoding before the shuffle (or reading a pre-computed
    # global array) would stratify on labels that no longer match the rows.
    class_codes = LabelEncoder().fit_transform(data[target_col])

    # Group the class codes into coarser equal-width strata.
    # NOTE(review): presumably done so that rare classes can still be spread
    # over n_splits folds; confirm.
    num_bins = np.floor(1 + np.log2(len(data))).astype(int)
    strata = pd.cut(class_codes, bins=num_bins, labels=False)

    kf = StratifiedKFold(n_splits=n_splits)
    for fold_id, (_, valid_idx) in enumerate(kf.split(X=data, y=strata)):
        data.loc[valid_idx, "folds"] = fold_id
    return data
# Replace `train` with the shuffled frame returned by create_folds, which carries
# the validation-fold id of every row in a new 'folds' column.
train = create_folds(train)

In [278]:
# Train one default-parameter CatBoost model per fold, holding that fold out.
# NOTE(review): the held-out split (x_test_ / y_test_) is never scored, and each
# iteration rebinds `model_`, so only the model trained in the final iteration
# is available after the loop.
separator = "-" * 20
print(separator)
for fold in range(5):
    held_out = train['folds'] == fold

    x_train_ = train.loc[~held_out, features_columns]
    y_train_ = train.loc[~held_out, TARGET_COL]
    x_test_ = train.loc[held_out, features_columns]
    y_test_ = train.loc[held_out, TARGET_COL]

    model_ = CatBoostClassifier(loss_function='MultiClass', verbose=0, random_state=random_seed)
    model_.fit(x_train_, y_train_)

    print("fold ", fold + 1, " finished")
    print(separator)

--------------------
fold  1  finished
--------------------
fold  2  finished
--------------------
fold  3  finished
--------------------
fold  4  finished
--------------------
fold  5  finished
--------------------


In [279]:
def predict_and_submit(test_, filename):
    """Write a submission CSV from a matrix of class probabilities.

    Columns 0..12 of ``test_`` are written, in order, under the 13 category
    names listed below, next to the ``Transaction_ID`` column taken from the
    notebook-level ``sub`` frame.

    Parameters
    ----------
    test_ : 2-D array indexable as ``test_[:, i]``
        One row per submission row; must have at least 13 columns.
        NOTE(review): assumes the column order matches the category order
        below and the row order matches ``sub``; confirm against the model's
        class order.
    filename : str
        Output path without extension; ``.csv`` is appended.

    Returns
    -------
    tuple
        Shape of the frame that was written.
    """
    class_names = [
        'Bills & Fees', 'Data & WiFi', 'Education', 'Emergency fund',
        'Family & Friends', 'Going out', 'Groceries', 'Health',
        'Loan Repayment', 'Miscellaneous', 'Rent / Mortgage', 'Shopping',
        'Transport & Fuel',
    ]

    # Id column first, then one probability column per category.
    d = {"Transaction_ID": sub["Transaction_ID"]}
    for position, class_name in enumerate(class_names):
        d[class_name] = test_[:, position]

    df_ = pd.DataFrame(data=d)[["Transaction_ID"] + class_names]
    df_.to_csv(f'{filename}.csv', index=False)
    return df_.shape

In [281]:
# Class probabilities for the test rows.
# NOTE(review): `model_` is whatever the per-fold training loop bound last (a model
# fitted without one fold), not `cb_model`, which was fitted on all training rows.
# Confirm this is the intended model.
y_a = model_.predict_proba(test)
# Writes 'Manager_1.024.csv'; the returned frame shape is shown as the cell output.
predict_and_submit(y_a, 'Manager_1.024')

(558, 14)