In [None]:
import pandas as pd
import numpy as np
!pip install catboost
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files under it can be read with ordinary paths (Colab-only; this call
# prompts for authorization when run interactively).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
## Load the feature table, shuffle it, and carve out an 80/20 train/test split.
df = pd.read_csv(
    '/content/drive/MyDrive/Ensemble Learning/Tweets/all_features.csv',
    index_col=0,
)
print(df.head(2))

# Shuffle every row with a fixed seed and discard the pre-shuffle index.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
print(df.head(2))

# The first 80% of the shuffled rows are used for training, the rest for testing.
section = round(len(df) * 8 / 10)
train_df, test_df = df.iloc[:section], df.iloc[section:]

# Feature columns = everything except the label and the raw tweet text.
train_cols_x = train_df.drop(columns=['cyberbullying_type', 'tweet_text']).columns.tolist()
test_cols_x = test_df.drop(columns=['cyberbullying_type', 'tweet_text']).columns.tolist()

train_x = train_df[train_cols_x]
# The test features get a fresh 0-based index; the training features keep theirs.
test_x = test_df[test_cols_x].reset_index(drop=True)

# Labels as flat 1-D numpy arrays.
train_y = np.array(train_df['cyberbullying_type']).ravel()
test_y = np.array(test_df['cyberbullying_type']).ravel()

In [None]:
# Preview the first five rows of the shuffled feature table.
df.head(n=5)

Unnamed: 0,tweet_text,cyberbullying_type,characters per tweet,words per tweet,nb_upper,nb_lower,nb_capitalized,mixed_upper_lower_not_capitalized,nb_len_1,nb_len_2,...,muslim,gay,round,good,radical,bad,mkr,rape,stupid,lot
0,@slainv_fr just because you closely associate ...,other_cyberbullying,123,20,0,19,1,0,0,2,...,0,0,0,0,0,0,0,0,0,0
1,"RT @PlayHearthstone: Roses are red,",other_cyberbullying,35,5,1,2,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,"What's happening in WB, why you idiots not wri...",religion,119,20,1,16,3,0,0,3,...,0,0,0,0,0,0,0,0,0,0
3,"“To every girl who bullied me in high school, ...",age,78,15,0,14,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
4,"Even if he has Parkinson’s, people on twitter ...",gender,125,22,0,19,3,0,0,5,...,0,1,0,0,0,0,0,1,0,0


In [None]:
# Dataset used to train the model: feature matrix plus class labels.
train_dataset = Pool(data=train_x, label=train_y)

# Held-out dataset. Labels are attached so CatBoost can score it while
# training; without them the pool cannot be used as an evaluation set.
eval_dataset = Pool(data=test_x, label=test_y)

# Model hyper-parameters.
model = CatBoostClassifier(
    iterations=2000,
    random_strength=0.5,  # reduce overfitting
    depth=6,  # depth of the tree
    l2_leaf_reg=2,
    border_count=32,
    rsm=1,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    boosting_type='Plain',
    verbose=200,  # log every 200 iterations
)

# Pass the held-out pool as `eval_set` so the log and the plot report
# held-out accuracy next to the training accuracy. `use_best_model=False`
# keeps this purely a monitoring aid: all `iterations` trees are retained and
# no early stopping is configured here, so the held-out rows do not influence
# which model is kept.
model.fit(train_dataset, eval_set=eval_dataset, use_best_model=False, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6417152	total: 200ms	remaining: 6m 39s
200:	learn: 0.8053939	total: 30.7s	remaining: 4m 34s
400:	learn: 0.8268334	total: 1m	remaining: 4m 1s
600:	learn: 0.8377365	total: 1m 27s	remaining: 3m 22s
800:	learn: 0.8459139	total: 1m 54s	remaining: 2m 50s
1000:	learn: 0.8525187	total: 2m 22s	remaining: 2m 22s
1200:	learn: 0.8595167	total: 2m 50s	remaining: 1m 53s
1400:	learn: 0.8644441	total: 3m 18s	remaining: 1m 25s
1600:	learn: 0.8698433	total: 3m 47s	remaining: 56.7s
1800:	learn: 0.8748755	total: 4m 17s	remaining: 28.5s
1999:	learn: 0.8792787	total: 4m 46s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fe0bf76e9d0>

In [None]:
## Grid search for CatBoost.
# Exhaustively tries every combination of tree depth, learning rate and
# iteration count, scoring each with 2-fold cross-validation on the training
# split.
# NOTE(review): 4 * 4 * 6 = 96 combinations x 2 folds = 192 fits (plus the
# final refit), several of them with thousands of iterations -- expect this
# cell to be slow.
param_grid = dict(
    depth=[2, 6, 8, 10],
    learning_rate=[0.01, 0.02, 0.03, 0.04],
    iterations=[500, 1000, 1500, 2000, 2500, 3000],
)

grid_search = GridSearchCV(model, param_grid, cv=2, n_jobs=-1)
grid_search.fit(train_x, train_y)


In [None]:
# Summarize the winning configuration found by the grid search.
# Requires the grid-search cell to have completed in this kernel session.
print(" Results from Grid Search " )
grid_search_summary = (
    ("\n The best estimator across ALL searched params:\n", grid_search.best_estimator_),
    ("\n The best score across ALL searched params:\n", grid_search.best_score_),
    ("\n The best parameters across ALL searched params:\n", grid_search.best_params_),
)
for message, value in grid_search_summary:
    print(message, value)

 Results from Grid Search 


NameError: ignored

In [None]:
# Score the fitted model on the held-out split: macro-averaged F1 and accuracy.
y_pred = model.predict(test_x)
f1 = f1_score(y_true=test_y, y_pred=y_pred, average='macro')
accuracy = accuracy_score(y_true=test_y, y_pred=y_pred)
print(f"f1 score is {f1!s}")
print(f"accuracy is {accuracy!s}")

f1 score is 0.8324936838241564
accuracy is 0.8313063535332355


In [None]:
# Pair every feature column with the importance reported by the fitted model,
# rank from most to least important, and show the top ten.
importances = model.feature_importances_
df_feat_imp = pd.DataFrame(
    {
        'Features': train_cols_x,
        'Importance': importances.tolist(),
    }
).sort_values(by='Importance', ascending=False)
df_feat_imp.head(10)

Unnamed: 0,Features,Importance
156,school,8.001475
0,characters per tweet,7.680031
123,bully,7.417006
135,nigger,6.820765
64,@,4.777468
174,rape,3.301068
4,nb_capitalized,2.998782
1,words per tweet,2.395927
117,dumb,2.326649
126,joke,2.32147
