In [None]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
#!pip install catboost
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

#!pip install shap
import shap


from hyperopt import fmin, hp, tpe

import matplotlib.pyplot as plt
import seaborn as sns


Collecting shap
  Downloading shap-0.40.0-cp37-cp37m-manylinux2010_x86_64.whl (564 kB)
[K     |████████████████████████████████| 564 kB 4.2 MB/s 
Collecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.40.0 slicer-0.0.7


In [None]:
# Mount Google Drive at /content/drive; the dataset read further down lives under this path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the pre-computed tweet features, then shuffle the rows with a fixed seed
# and renumber the index so the row order is reproducible across runs.
train_df = (
    pd.read_csv('/content/drive/MyDrive/Ensemble Learning/Tweets/all_features.csv', index_col=0)
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Names of the model inputs: every column except the raw text and the label.
train_x = list(train_df.columns.drop(['tweet_text', 'cyberbullying_type']))

train_df.head()

Unnamed: 0,tweet_text,cyberbullying_type,characters per tweet,words per tweet,nb_upper,nb_lower,nb_capitalized,mixed_upper_lower_not_capitalized,nb_len_1,nb_len_2,...,muslim,gay,round,good,radical,bad,mkr,rape,stupid,lot
0,"FYI the phrase is ""out your RABID mind"" dumb f...",ethnicity,85,16,2,14,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,U are Still scared to call out Jihad Still sca...,religion,283,54,2,42,8,0,7,8,...,0,0,0,0,0,0,0,0,0,0
2,"You can read Gibbon on Rome, Thucydides on Ath...",religion,284,50,0,36,12,0,2,9,...,1,0,0,0,0,0,0,0,0,0
3,I was never bullied. But in gr11 the popular g...,age,277,57,3,50,3,0,6,17,...,0,0,0,1,0,0,0,0,0,0
4,My dream is for one of the girls who bullied m...,age,274,59,0,55,4,0,1,14,...,0,0,0,0,0,0,0,0,0,0


# CatBoost


In [None]:
def CV_catboost(X_train,y_train,X_test,y_test):
    """Fit a CatBoost multi-class model on one fold and score it on the held-out fold.

    Parameters
    ----------
    X_train, X_test
        Feature tables for the training fold and the held-out fold. They are
        passed unchanged to ``Pool`` and ``model.predict``.
    y_train, y_test
        Labels for the two folds. Anything ``np.asarray`` accepts; the values
        are flattened to 1-D before use (a single-column DataFrame works).

    Returns
    -------
    list
        ``[macro_f1, accuracy]`` computed on the held-out fold.

    Side effects
    ------------
    Prints a per-class ``classification_report`` and, because ``plot=True`` is
    passed to ``fit``, asks CatBoost to draw its training plot.
    """
    # Labels arrive as single-column frames; CatBoost and sklearn want flat vectors.
    y_train = np.asarray(y_train).ravel()
    y_test = np.asarray(y_test).ravel()

    # Training
    train_dataset = Pool(data=X_train,
                         label=y_train,
                         # cat_features=cat_features
                         )

    eval_dataset = Pool(data=X_test,
                        label=y_test,
                        # cat_features=cat_features
                        )

    # set parameters of model
    # Logging is configured once, in fit() below; a constructor-level
    # `silent=True` would contradict `verbose_eval` there, so it is not set.
    model = CatBoostClassifier(
        iterations=1000,
        random_strength=1, #set to one to prevent overfitting
        depth=6, #relatively low to prevent overfitting
        l2_leaf_reg=2, #from grid search
        border_count=32, #from grid search
        rsm=1, #from grid search
        loss_function='MultiClass',
        eval_metric='Accuracy',
        boosting_type = 'Plain')

    # fit model and make predictions
    # The held-out fold is passed as eval_set only to monitor progress.
    # use_best_model=False keeps all trained iterations: letting CatBoost cut
    # the model at the iteration that scored best on this very fold would pick
    # the model using the test data and inflate the cross-validated scores.
    model.fit(
        train_dataset,
        verbose_eval=100,
        eval_set=eval_dataset,
        use_best_model=False,
        plot=True
    )

    # Flatten the predictions so they line up element-wise with y_test.
    y_pred = np.asarray(model.predict(X_test)).ravel()  # change to test_x if for real test

    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)

    report = classification_report(y_test, y_pred)
    print(report)

    return [f1, accuracy]

In [None]:
# Model inputs (feature column names) and the label column.
x_cols = train_x

y_col = ['cyberbullying_type']

# Cut the rows into nb_splits contiguous, near-equal folds.
# The split is done on row positions and the rows are then selected with iloc:
# calling np.array_split directly on a DataFrame goes through
# DataFrame.swapaxes, which is deprecated in recent pandas releases.
nb_splits = 5
fold_positions = np.array_split(np.arange(len(train_df)), nb_splits)
splits = [train_df.iloc[positions] for positions in fold_positions]

In [None]:
# Run CV_catboost once per fold: fold i is held out for scoring and the
# remaining folds are concatenated into the training set.
# The result lists are built from the folds actually run, so changing
# nb_splits cannot leave stale zero placeholders or overrun a fixed-size list.
scores = []
for i in range(nb_splits):
    not_test_indices = [x for x in range(nb_splits) if x != i]
    temp_df_X_y = pd.concat([splits[x] for x in not_test_indices], axis=0)
    X_train = temp_df_X_y[x_cols]
    y_train = temp_df_X_y[y_col]
    X_test = splits[i][x_cols]
    y_test = splits[i][y_col]

    # Each entry is [macro_f1, accuracy] for the held-out fold.
    scores.append(CV_catboost(X_train, y_train, X_test, y_test))

f1_all = [fold_scores[0] for fold_scores in scores]
accuracy_all = [fold_scores[1] for fold_scores in scores]

print("*"*100)
print("The cross validated f1 score is " + str(np.mean(f1_all)))
print("The cross validated accuracy is " + str(np.mean(accuracy_all)))

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6453228	test: 0.6391655	best: 0.6391655 (0)	total: 117ms	remaining: 1m 57s
100:	learn: 0.7715252	test: 0.7751337	best: 0.7751337 (100)	total: 13.5s	remaining: 1m 59s
200:	learn: 0.8064634	test: 0.8075270	best: 0.8075270 (200)	total: 23.7s	remaining: 1m 34s
300:	learn: 0.8175504	test: 0.8150750	best: 0.8152846 (299)	total: 34s	remaining: 1m 18s
400:	learn: 0.8247844	test: 0.8222036	best: 0.8223084 (399)	total: 45.1s	remaining: 1m 7s
500:	learn: 0.8323330	test: 0.8250341	best: 0.8252437 (487)	total: 56.3s	remaining: 56s
600:	learn: 0.8371819	test: 0.8255582	best: 0.8260824 (548)	total: 1m 7s	remaining: 44.9s
700:	learn: 0.8408513	test: 0.8278646	best: 0.8282839 (697)	total: 1m 19s	remaining: 33.9s
800:	learn: 0.8445207	test: 0.8284936	best: 0.8289129 (743)	total: 1m 30s	remaining: 22.6s
900:	learn: 0.8481902	test: 0.8298564	best: 0.8299612 (899)	total: 1m 42s	remaining: 11.3s
999:	learn: 0.8516761	test: 0.8301709	best: 0.8304854 (990)	total: 1m 54s	remaining: 0us

bestTest = 

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6382460	test: 0.6360205	best: 0.6360205 (0)	total: 113ms	remaining: 1m 52s
100:	learn: 0.7759809	test: 0.7710452	best: 0.7710452 (100)	total: 10.5s	remaining: 1m 33s
200:	learn: 0.8074070	test: 0.8017612	best: 0.8017612 (200)	total: 21s	remaining: 1m 23s
300:	learn: 0.8179960	test: 0.8120348	best: 0.8123493 (290)	total: 31.5s	remaining: 1m 13s
400:	learn: 0.8257804	test: 0.8179054	best: 0.8182199 (395)	total: 42.5s	remaining: 1m 3s
500:	learn: 0.8322543	test: 0.8226229	best: 0.8228326 (497)	total: 53.8s	remaining: 53.5s
600:	learn: 0.8368673	test: 0.8238809	best: 0.8246147 (593)	total: 1m 4s	remaining: 43.1s
700:	learn: 0.8410086	test: 0.8250341	best: 0.8256631 (675)	total: 1m 16s	remaining: 32.6s
800:	learn: 0.8450712	test: 0.8258727	best: 0.8262921 (779)	total: 1m 27s	remaining: 21.8s
900:	learn: 0.8491075	test: 0.8271307	best: 0.8278646 (840)	total: 1m 39s	remaining: 10.9s
999:	learn: 0.8520955	test: 0.8284936	best: 0.8289129 (987)	total: 1m 50s	remaining: 0us

bestTest 

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6364208	test: 0.6324177	best: 0.6324177 (0)	total: 107ms	remaining: 1m 46s
100:	learn: 0.7702207	test: 0.7666177	best: 0.7666177 (100)	total: 10.5s	remaining: 1m 33s
200:	learn: 0.8052891	test: 0.8010065	best: 0.8010065 (198)	total: 20.7s	remaining: 1m 22s
300:	learn: 0.8171882	test: 0.8137974	best: 0.8140071 (297)	total: 30.9s	remaining: 1m 11s
400:	learn: 0.8251822	test: 0.8193542	best: 0.8194590 (391)	total: 41.8s	remaining: 1m 2s
500:	learn: 0.8314987	test: 0.8236528	best: 0.8238624 (497)	total: 53s	remaining: 52.7s
600:	learn: 0.8359281	test: 0.8253303	best: 0.8254351 (590)	total: 1m 4s	remaining: 42.8s
700:	learn: 0.8404623	test: 0.8274271	best: 0.8279514 (696)	total: 1m 15s	remaining: 32.4s
800:	learn: 0.8444986	test: 0.8282659	best: 0.8285804 (788)	total: 1m 27s	remaining: 21.7s
900:	learn: 0.8480631	test: 0.8283707	best: 0.8296289 (829)	total: 1m 38s	remaining: 10.9s
999:	learn: 0.8518635	test: 0.8291046	best: 0.8296289 (829)	total: 1m 50s	remaining: 0us

bestTest 

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6462232	test: 0.6415391	best: 0.6415391 (0)	total: 104ms	remaining: 1m 43s
100:	learn: 0.7710070	test: 0.7666177	best: 0.7666177 (100)	total: 10.5s	remaining: 1m 33s
200:	learn: 0.8055774	test: 0.8062487	best: 0.8062487 (195)	total: 20.6s	remaining: 1m 21s
300:	learn: 0.8180794	test: 0.8152653	best: 0.8161040 (283)	total: 31.1s	remaining: 1m 12s
400:	learn: 0.8254180	test: 0.8201929	best: 0.8201929 (395)	total: 41.8s	remaining: 1m 2s
500:	learn: 0.8308434	test: 0.8223946	best: 0.8223946 (500)	total: 52.9s	remaining: 52.7s
600:	learn: 0.8361378	test: 0.8253303	best: 0.8258545 (595)	total: 1m 4s	remaining: 42.6s
700:	learn: 0.8406458	test: 0.8272174	best: 0.8272174 (700)	total: 1m 15s	remaining: 32.4s
800:	learn: 0.8454684	test: 0.8286853	best: 0.8286853 (798)	total: 1m 27s	remaining: 21.8s
900:	learn: 0.8491639	test: 0.8296289	best: 0.8298385 (861)	total: 1m 39s	remaining: 10.9s
999:	learn: 0.8530691	test: 0.8291046	best: 0.8304676 (925)	total: 1m 51s	remaining: 0us

bestTes

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6421607	test: 0.6479346	best: 0.6479346 (0)	total: 106ms	remaining: 1m 45s
100:	learn: 0.7738376	test: 0.7764731	best: 0.7764731 (100)	total: 10.7s	remaining: 1m 35s
200:	learn: 0.8062326	test: 0.8014259	best: 0.8014259 (199)	total: 21.3s	remaining: 1m 24s
300:	learn: 0.8195209	test: 0.8111763	best: 0.8119103 (296)	total: 32s	remaining: 1m 14s
400:	learn: 0.8266761	test: 0.8163137	best: 0.8164185 (397)	total: 43.4s	remaining: 1m 4s
500:	learn: 0.8330712	test: 0.8193542	best: 0.8194590 (498)	total: 54.7s	remaining: 54.5s
600:	learn: 0.8375531	test: 0.8214510	best: 0.8220801 (586)	total: 1m 6s	remaining: 44.1s
700:	learn: 0.8425853	test: 0.8221849	best: 0.8229189 (669)	total: 1m 18s	remaining: 33.5s
800:	learn: 0.8464905	test: 0.8250157	best: 0.8251206 (789)	total: 1m 30s	remaining: 22.4s
900:	learn: 0.8495308	test: 0.8240721	best: 0.8252254 (806)	total: 1m 41s	remaining: 11.2s
999:	learn: 0.8524663	test: 0.8254351	best: 0.8258545 (971)	total: 1m 53s	remaining: 0us

bestTest 