In [None]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
!pip install catboost
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

!pip install shap
import shap


from hyperopt import fmin, hp, tpe

import matplotlib.pyplot as plt
import seaborn as sns


Collecting catboost
  Downloading catboost-1.0.4-cp37-none-manylinux1_x86_64.whl (76.1 MB)
[K     |████████████████████████████████| 76.1 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.4
Collecting shap
  Downloading shap-0.40.0-cp37-cp37m-manylinux2010_x86_64.whl (564 kB)
[K     |████████████████████████████████| 564 kB 6.7 MB/s 
Collecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.40.0 slicer-0.0.7


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
train_df = pd.read_csv('/content/drive/MyDrive/Ensemble Learning/Tweets/df_all_features_tfidf.csv', index_col=0)
train_df = train_df.sample(frac=1,random_state=1).reset_index(drop=True)

train_x = list(train_df.drop(['tweet_text','cyberbullying_type'], axis=1).columns)
train_x

train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,muslim,gay,round,good,radical,bad,mkr,rape,stupid,lot
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


#CatBoost


In [1]:
def CV_catboost(X_train,y_train,X_test,y_test):

    y_train = np.transpose(np.array(y_train)).ravel()
    y_test = np.transpose(np.array(y_test)).ravel()

    # Training
    train_dataset = Pool(data=X_train,
                         label=y_train,
                         # cat_features=cat_features
                         )

    eval_dataset = Pool(data=X_test,
                        label=y_test,
                        # cat_features=cat_features
                        )

    # set parameters of model
    model = CatBoostClassifier(
        iterations=1000,
        random_strength=1, #set to one to prevent overfitting
        depth=6, #relatively low to prevent overfitting
        l2_leaf_reg=2, #from grid search
        border_count=32, #from grid search
        rsm=1, #from grid search
        loss_function='MultiClass',
        eval_metric='Accuracy',
        boosting_type = 'Plain',
        silent=True)


    # fit model and make predictions

    # model.fit(train_dataset, plot=True, eval_set=eval_dataset)
    model.fit(
        train_dataset,
        verbose_eval=100,
        eval_set=eval_dataset,
        plot=True
    );

    y_pred_train = model.predict(X_train)
    f1_train = f1_score(y_train, y_pred_train, average='macro')
    accuracy_train = accuracy_score(y_train, y_pred_train)
    y_pred = model.predict(X_test)  # change to test_x if for real test

    f1 = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)

    report = classification_report(y_test, y_pred)
    print(report)

    return [f1, accuracy, f1_train,accuracy_train]

In [None]:
x_cols = train_x

y_col = ['cyberbullying_type']

nb_splits = 5
splits =np.array_split(train_df, nb_splits)

In [None]:
# run the CV_catboost in a loop

f1_all = [0,0,0,0,0]
accuracy_all = [0,0,0,0,0]
scores = [0,0,0,0,0]
for i in range(nb_splits):
    not_test_indices = [x for x in range(nb_splits) if x != i]
    temp_df_X_y = (pd.concat([splits[x] for x in not_test_indices],axis=0))
    X_train = temp_df_X_y[x_cols]
    y_train = temp_df_X_y[y_col]
    X_test = splits[i][x_cols]
    y_test = splits[i][y_col]

    scores[i] =CV_catboost(X_train,y_train,X_test,y_test)

for i in range(len(scores)):
    f1_all[i] = scores[i][0]
    accuracy_all[i] = scores[i][1]

print("*"*100)
print("The cross validated f1 score is " + str(np.mean(f1_all)))
print("The cross validated accuracy is " + str(np.mean(accuracy_all)))

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6329253	test: 0.6349722	best: 0.6349722 (0)	total: 522ms	remaining: 8m 41s
100:	learn: 0.7944329	test: 0.7964147	best: 0.7965196 (99)	total: 43.8s	remaining: 6m 29s
200:	learn: 0.8209315	test: 0.8230422	best: 0.8230422 (200)	total: 1m 23s	remaining: 5m 31s
300:	learn: 0.8311797	test: 0.8302757	best: 0.8302757 (300)	total: 2m 2s	remaining: 4m 44s
400:	learn: 0.8402485	test: 0.8363560	best: 0.8367753 (392)	total: 2m 42s	remaining: 4m 3s
500:	learn: 0.8464603	test: 0.8413880	best: 0.8417025 (495)	total: 3m 22s	remaining: 3m 21s
600:	learn: 0.8508112	test: 0.8429605	best: 0.8433798 (597)	total: 4m 4s	remaining: 2m 42s
700:	learn: 0.8533012	test: 0.8433798	best: 0.8440088 (687)	total: 4m 44s	remaining: 2m 1s
800:	learn: 0.8554504	test: 0.8448475	best: 0.8448475 (789)	total: 5m 25s	remaining: 1m 20s
900:	learn: 0.8578356	test: 0.8458958	best: 0.8458958 (893)	total: 6m 4s	remaining: 40.1s
999:	learn: 0.8597751	test: 0.8465248	best: 0.8467345 (920)	total: 6m 45s	remaining: 0us

bes

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6468954	test: 0.6483908	best: 0.6483908 (0)	total: 646ms	remaining: 10m 45s
100:	learn: 0.7963463	test: 0.7914876	best: 0.7914876 (100)	total: 43.9s	remaining: 6m 31s
200:	learn: 0.8194899	test: 0.8140266	best: 0.8140266 (200)	total: 1m 24s	remaining: 5m 34s
300:	learn: 0.8303672	test: 0.8237761	best: 0.8237761 (298)	total: 2m 3s	remaining: 4m 46s
400:	learn: 0.8397767	test: 0.8341545	best: 0.8346787 (398)	total: 2m 43s	remaining: 4m 4s
500:	learn: 0.8458050	test: 0.8371947	best: 0.8371947 (499)	total: 3m 24s	remaining: 3m 23s
600:	learn: 0.8498152	test: 0.8399203	best: 0.8399203 (600)	total: 4m 5s	remaining: 2m 43s
700:	learn: 0.8529342	test: 0.8422266	best: 0.8422266 (700)	total: 4m 46s	remaining: 2m 2s
800:	learn: 0.8554504	test: 0.8426460	best: 0.8426460 (800)	total: 5m 28s	remaining: 1m 21s
900:	learn: 0.8570492	test: 0.8426460	best: 0.8436943 (832)	total: 6m 9s	remaining: 40.6s
999:	learn: 0.8594868	test: 0.8425411	best: 0.8436943 (832)	total: 6m 50s	remaining: 0us

b

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6414531	test: 0.6447893	best: 0.6447893 (0)	total: 940ms	remaining: 15m 39s
100:	learn: 0.7955653	test: 0.7932481	best: 0.7932481 (100)	total: 44.4s	remaining: 6m 35s
200:	learn: 0.8202810	test: 0.8164185	best: 0.8164185 (200)	total: 1m 24s	remaining: 5m 35s
300:	learn: 0.8301358	test: 0.8261690	best: 0.8261690 (300)	total: 2m 2s	remaining: 4m 45s
400:	learn: 0.8394926	test: 0.8348710	best: 0.8350807 (399)	total: 2m 42s	remaining: 4m 2s
500:	learn: 0.8459663	test: 0.8396939	best: 0.8400084 (497)	total: 3m 22s	remaining: 3m 22s
600:	learn: 0.8495571	test: 0.8430489	best: 0.8431537 (596)	total: 4m 3s	remaining: 2m 41s
700:	learn: 0.8526498	test: 0.8444118	best: 0.8445167 (690)	total: 4m 42s	remaining: 2m
800:	learn: 0.8551397	test: 0.8449360	best: 0.8453554 (782)	total: 5m 23s	remaining: 1m 20s
900:	learn: 0.8582062	test: 0.8460893	best: 0.8462990 (870)	total: 6m 3s	remaining: 40s
999:	learn: 0.8603554	test: 0.8475571	best: 0.8477668 (948)	total: 6m 43s	remaining: 0us

bestTe

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6335116	test: 0.6213043	best: 0.6213043 (0)	total: 624ms	remaining: 10m 23s
100:	learn: 0.7969544	test: 0.7945062	best: 0.7946110 (99)	total: 43.8s	remaining: 6m 29s
200:	learn: 0.8194947	test: 0.8201929	best: 0.8210317 (196)	total: 1m 23s	remaining: 5m 33s
300:	learn: 0.8316035	test: 0.8303628	best: 0.8303628 (298)	total: 2m 4s	remaining: 4m 48s
400:	learn: 0.8395712	test: 0.8380164	best: 0.8386454 (396)	total: 2m 44s	remaining: 4m 5s
500:	learn: 0.8438434	test: 0.8429440	best: 0.8430489 (497)	total: 3m 25s	remaining: 3m 25s
600:	learn: 0.8487446	test: 0.8451457	best: 0.8451457 (600)	total: 4m 6s	remaining: 2m 43s
700:	learn: 0.8525712	test: 0.8462990	best: 0.8468232 (693)	total: 4m 47s	remaining: 2m 2s
800:	learn: 0.8546941	test: 0.8483959	best: 0.8483959 (798)	total: 5m 28s	remaining: 1m 21s
900:	learn: 0.8571316	test: 0.8480814	best: 0.8488153 (814)	total: 6m 9s	remaining: 40.6s
999:	learn: 0.8593070	test: 0.8490250	best: 0.8493395 (979)	total: 6m 49s	remaining: 0us

be

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6532212	test: 0.6531768	best: 0.6531768 (0)	total: 550ms	remaining: 9m 9s
100:	learn: 0.7974000	test: 0.7995387	best: 0.7995387 (100)	total: 45.6s	remaining: 6m 46s
200:	learn: 0.8199140	test: 0.8192493	best: 0.8192493 (200)	total: 1m 26s	remaining: 5m 42s
300:	learn: 0.8320753	test: 0.8294192	best: 0.8297337 (299)	total: 2m 6s	remaining: 4m 53s
400:	learn: 0.8407506	test: 0.8338226	best: 0.8340323 (397)	total: 2m 46s	remaining: 4m 8s
500:	learn: 0.8462284	test: 0.8386454	best: 0.8390648 (493)	total: 3m 27s	remaining: 3m 26s
600:	learn: 0.8509462	test: 0.8397987	best: 0.8405326 (591)	total: 4m 8s	remaining: 2m 44s
700:	learn: 0.8542224	test: 0.8412665	best: 0.8413714 (698)	total: 4m 48s	remaining: 2m 3s
800:	learn: 0.8569744	test: 0.8427343	best: 0.8429440 (736)	total: 5m 29s	remaining: 1m 21s
900:	learn: 0.8589663	test: 0.8432585	best: 0.8436779 (832)	total: 6m 10s	remaining: 40.7s
999:	learn: 0.8604340	test: 0.8445167	best: 0.8448312 (986)	total: 6m 51s	remaining: 0us

be

In [None]:
 model = CatBoostClassifier(
        iterations=1000,
        random_strength=1, #set to one to prevent overfitting
        depth=6, #relatively low to prevent overfitting
        l2_leaf_reg=2, #from grid search
        border_count=32, #from grid search
        rsm=1, #from grid search
        loss_function='MultiClass',
        eval_metric='Accuracy',
        boosting_type = 'Plain',
        silent=True)

In [None]:
train_y = ['cyberbullying_type']
X = train_df[train_x]
y = train_df[train_y]

In [None]:
from yellowbrick.model_selection import learning_curve
## selecting which model to run 
print(learning_curve(model, X, y, cv=2, scoring='accuracy'))

YellowbrickTypeError: ignored