In [26]:
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, f1_score
from sklearn.compose import ColumnTransformer


from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
import xgboost as xgb
from scipy.stats import skew

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
train_df = pd.read_csv('/content/drive/MyDrive/Ensemble Learning/Tweets/all_features.csv', index_col=0)
train_df.head()

Unnamed: 0,tweet_text,cyberbullying_type,characters per tweet,words per tweet,nb_upper,nb_lower,nb_capitalized,mixed_upper_lower_not_capitalized,nb_len_1,nb_len_2,...,muslim,gay,round,good,radical,bad,mkr,rape,stupid,lot
0,"In other words #katandandre, your food was cra...",not_cyberbullying,61,9,0,8,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,115,14,1,9,1,0,0,2,...,0,0,0,0,0,0,1,0,0,0
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,60,9,0,7,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,103,18,1,16,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,103,18,1,12,4,0,1,6,...,0,0,0,0,0,0,0,0,0,0


In [5]:
train_y = train_df[['cyberbullying_type']]
train_x = train_df.drop(['cyberbullying_type','tweet_text'], axis = 1)

In [6]:
## transforming the y column so that it works with the rest of the model
train_y = np.array(train_y)
train_y = np.transpose(train_y)
train_y = train_y.ravel()

In [7]:
## the number of numerical features
num_features_count = 76

## extract the column names from the dataframe.
train_x_columns = list(train_x.columns)

##selecting which columns are categorical and which are numeric.
numerical_columns = train_x_columns[0:num_features_count]
categorical_columns = train_x_columns[num_features_count:]

In [8]:
## splitting the training data into a training set (80%) and a test set (20%)
train_x, test_x, train_y, test_y = train_test_split(train_x, train_y, test_size=0.2, random_state=0)

In [9]:
## set this to true if transformations should be applied
transform = False

if transform == True:
    for k in numerical_columns:
        ## these parameter specify the skewness thresholds for transformations
        ## these were changed to investigate the impact
        skew_sqrt = 2
        skew_log = 5
        temp_skew = round(skew(train_x[k]),3)
        if temp_skew > skew_sqrt:
            train_x[k] = np.sqrt(train_x[k])
            trans_skew = round(skew(np.sqrt(train_x[k])),3)
        if temp_skew > skew_log:
            train_x[k] = np.log(1+train_x[k])
            trans_skew = round(skew(np.log(1+train_x[k])),3)
        else:
            trans_skew = temp_skew
        
        print(k, temp_skew, trans_skew)

## the results of the datatransformations had a neglible impact on the final results (f1 score incr of 0.02)
## as such, the transformations have not been considered further.

In [10]:
## selecting the features to use (numberic and categorical) - also seletcs how to fill in missing data and how to scale data
numeric_features = numerical_columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())])

categorical_features = categorical_columns

## how to handle missing data and also to use onehot encoder for categorical variables.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


## creating a preprocessor to combine the categorical and numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])



In [23]:
## different models to use 
def model(choice):
    
    if choice == 'AB':
        return AdaBoostClassifier(n_estimators=100, random_state=0)
    if choice == 'DT':
        return DecisionTreeClassifier(random_state=0)
    if choice == 'BG':
        return BaggingClassifier(base_estimator=SVC(),n_estimators=2, random_state=0)
    if choice == 'RF':
        return RandomForestClassifier(criterion = "gini", max_depth=25, min_samples_leaf = 1, n_estimators=300, random_state=0, class_weight = 'balanced_subsample', bootstrap=True, n_jobs=-1)
    

In [27]:
## selecting which model to run 
choice = "AB"
MLmodel = model(choice)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', MLmodel)])

## fitting the model to the train data split
fit_model = True
if fit_model:
    clf.fit(train_x, train_y)

## using the model to predict the y test values using the test data
y_pred = clf.predict(test_x)

In [28]:
## Test scores
print(classification_report(test_y, y_pred))
print('F1 score:',round(f1_score(test_y,y_pred, average = 'macro'),3))

                     precision    recall  f1-score   support

                age       0.87      0.98      0.92      1557
          ethnicity       0.94      0.86      0.90      1627
             gender       0.89      0.72      0.79      1626
  not_cyberbullying       0.51      0.45      0.48      1572
other_cyberbullying       0.54      0.66      0.60      1519
           religion       0.85      0.92      0.88      1638

           accuracy                           0.76      9539
          macro avg       0.77      0.76      0.76      9539
       weighted avg       0.77      0.76      0.76      9539

F1 score: 0.761


In [None]:
runCV = True
## run cross validation on fitted model
scoring = {'acc': 'accuracy',
           'f1_macro': 'f1_macro'}
if runCV:
    scores = cross_validate(clf, train_x, train_y, cv=5, scoring=scoring)
    #print(scores)
    #print(np.average(scores))
    print("-----------------------------------")
    print("CV Accuracy:", round(np.average(scores['test_acc']),4))
    print("CV f1 Score:", round(np.average(scores['test_f1_macro']),4))
    print("-----------------------------------")

-----------------------------------
CV Accuracy: 0.816
CV f1 Score: 0.82
-----------------------------------


In [None]:
## code to extract the most important features
get_imp = True
if get_imp:    
    importances = clf.steps[1][1].feature_importances_
    feature_names = clf['preprocessor'].transformers_[1][1]['onehot']\
                   .get_feature_names(categorical_features)               
    df_feat_imp = pd.DataFrame()
    df_feat_imp['Features'] = numeric_features + list(feature_names)
    df_feat_imp['Importance'] = importances.tolist()
    df_feat_imp = df_feat_imp.sort_values(by='Importance',ascending=False)
    print(df_feat_imp.head(10))
    df_feat_imp.to_csv("/content/drive/MyDrive/Ensemble Learning/feature_importance_RF.csv")  

                 Features  Importance
237            school_1.0    0.053506
236            school_0.0    0.047208
171             bully_1.0    0.039584
170             bully_0.0    0.035380
195            nigger_1.0    0.031114
158              dumb_0.0    0.030619
194            nigger_0.0    0.027424
159              dumb_1.0    0.021857
0    characters per tweet    0.021591
272              rape_0.0    0.021015




In [None]:
## grid search for random forest.
gs = False
if gs:
    param_grid = {
        'classifier__alpha': [0,0.5,1,1.5,2,10],
        'classifier__fit_prior': [True,False],
        'classifier__class_weight': ["balanced",'balanced_subsample'],
        'classifier__max_depth': [10,20,30],
        'classifier__n_estimators': [100,200],
        'classifier__min_samples_leaf': [1,2,4],
    }
    
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='f1_macro', verbose = 10,n_jobs = -1)
    grid_search.fit(train_x, train_y)

    print(("best RF from grid search: %.3f"
           % grid_search.score(train_x, train_y)))