In [None]:
import os
import pickle

import numpy as np
import pandas as pd
import xgboost
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost.sklearn import XGBClassifier

In [None]:
def calculate_metrics(actual, predicted):
    """
    Calculate performance metrics given the actual and predicted labels.

    Parameters
    ----------
    actual : array-like of numeric labels
        Ground-truth labels. Lists, numpy arrays (1-D or column vectors) and
        pandas Series with any index are accepted; values are read
        positionally.
    predicted : array-like of numeric labels
        Predicted labels, same number of elements as ``actual``.

    Returns
    -------
    tuple
        ``(f1, accuracy, flip_err, mae)`` where

        * ``f1`` is the macro-averaged F1 score, in percent,
        * ``accuracy`` is the accuracy, in percent,
        * ``flip_err`` is the flip error rate: the percentage of instances
          whose predicted label differs from the actual one by more than 1
          (i.e., left-vs-right or high-vs-low). When the labels only take
          the values 0 and 1 this is always 0,
        * ``mae`` is the mean absolute error, as a plain ``float``.

    Raises
    ------
    ValueError
        If the inputs are empty or do not contain the same number of labels.
    """
    # Flatten to plain 1-D arrays so that element access is positional.
    # Indexing a pandas Series with `series[i]` is label-based and breaks as
    # soon as the index is not 0..n-1 (e.g. after train_test_split).
    actual_arr = np.asarray(actual).ravel()
    predicted_arr = np.asarray(predicted).ravel()

    if actual_arr.size == 0:
        raise ValueError("calculate_metrics() requires at least one label")
    if actual_arr.shape != predicted_arr.shape:
        raise ValueError(
            "actual and predicted must contain the same number of labels "
            f"({actual_arr.size} != {predicted_arr.size})"
        )

    # calculate macro-f1
    f1 = f1_score(actual_arr, predicted_arr, average='macro') * 100

    # calculate accuracy
    accuracy = accuracy_score(actual_arr, predicted_arr) * 100

    # Absolute label distance; cast to float first so unsigned integer
    # labels cannot wrap around on subtraction.
    abs_diff = np.abs(actual_arr.astype(float) - predicted_arr.astype(float))

    # calculate the flip error rate
    flip_err = float(np.mean(abs_diff > 1) * 100)

    # calculate mean absolute error (mae)
    mae = float(np.mean(abs_diff))

    return f1, accuracy, flip_err, mae



In [None]:
def train_and_test_xgboost_gridsearch(X, y, featurename, prefix):
    """
    Grid-search, train, persist and evaluate an XGBoost classifier.

    Pipeline:
      1. split X / y into 80% train and 20% test (random_state=999),
      2. label-encode y (the encoder is fitted on the training labels only),
      3. min-max scale X (the scaler is fitted on the training rows only),
      4. 5-fold grid search over ``max_depth`` and ``min_child_weight``,
         scored by ROC AUC, on the training split,
      5. fit a fresh classifier with the best parameters on the whole
         training split,
      6. pickle that classifier to ``./models/{prefix}_{featurename}.plk``,
      7. evaluate it on the held-out test split with ``calculate_metrics``.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn (e.g. a DataFrame slice)
    y : labels aligned with the rows of ``X``
    featurename : str
        Name of the feature set; used in the model file name.
    prefix : str
        Prefix of the model file name.

    Returns
    -------
    tuple
        ``(clf, results)`` where ``clf`` is the fitted classifier and
        ``results`` is the ``(f1, accuracy, flip_err, mae)`` tuple returned by
        ``calculate_metrics`` for the test split.
    """
    X_train, X_test, y_train_str, y_test_str = train_test_split(X, y, test_size=0.20, random_state=999)

    encoder = preprocessing.LabelEncoder()
    y_train = encoder.fit_transform(y_train_str)
    # Reuse the mapping learned from the training labels. Refitting on the
    # test labels could assign different integers to the same class (e.g.
    # when a class is missing from the test split) and corrupt the metrics.
    y_test = encoder.transform(y_test_str)

    # normalize the feature values (statistics come from the training rows only)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Hyper-parameters shared by the grid-search estimator and the final
    # model, kept in one place so the two cannot drift apart.
    base_params = dict(use_label_encoder=False, learning_rate=0.1, n_estimators=140,
                       gamma=0.1, subsample=0.8, colsample_bytree=0.8,
                       objective='binary:logistic', nthread=4,
                       scale_pos_weight=1, seed=27)

    xgb_param_grid = {"max_depth"        : [4, 5, 6, 8, 10, 12],
                      "min_child_weight" : [3, 5, 7, 9, 11, 13]}

    # max_depth / min_child_weight given here are only starting values; the
    # grid search overrides them for every candidate.
    clf_cv = GridSearchCV(estimator=XGBClassifier(max_depth=5, min_child_weight=1, **base_params),
                          param_grid=xgb_param_grid, scoring='roc_auc',
                          n_jobs=8, cv=5,
                          return_train_score=True)
    clf_cv.fit(X_train, y_train)

    print(clf_cv.best_params_, clf_cv.best_score_)

    # train the final model on the whole training split with the best parameters
    clf = XGBClassifier(max_depth=clf_cv.best_params_["max_depth"],
                        min_child_weight=clf_cv.best_params_["min_child_weight"],
                        **base_params)
    clf.fit(X_train, y_train)

    # persist the model; create the target directory on first use and make
    # sure the file handle is closed even if pickling fails
    file_name = f"./models/{prefix}_{featurename}.plk"
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, "wb") as model_file:
        pickle.dump(clf, model_file)

    # generate predictions for the held-out test split
    pred = clf.predict(X_test)

    # calculate the performance metrics on the held-out test split
    results = calculate_metrics(y_test, pred)

    return clf, results


## Define features

In [None]:
### Profile Description encoded by Sentence BERT (SBERT): desc0 ... desc767
feat_description_sbert = [f"desc{idx}" for idx in range(768)]

### User Tweets encoded by SBERT: s0 ... s767
feat_precovid_tweets_sbert = [f"s{idx}" for idx in range(768)]

### NELA feature names
feat_nela = [
    "quotes", "exclaim", "allpunc", "allcaps", "stops",
    "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD",
    "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "PRP$",
    "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "WP$", "WRB",
    "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP",
    "$", "''", "(", ")", ",", "--", ".", ":", "``",
    "ttr", "avg_wordlen", "word_count", "flesch_kincaid_grade_level",
    "smog_index", "coleman_liau_index", "lix",
    "bias_words", "assertatives", "factives", "hedges", "implicatives",
    "report_verbs", "positive_opinion_words", "negative_opinion_words",
    "vadneg", "vadneu", "vadpos", "wneg", "wpos", "wneu", "sneg", "spos", "sneu",
    "HarmVirtue", "HarmVice", "FairnessVirtue", "FairnessVice",
    "IngroupVirtue", "IngroupVice", "AuthorityVirtue", "AuthorityVice",
    "PurityVirtue", "PurityVice", "MoralityGeneral",
]

### Following top users: followtop_0 ... followtop_94
feat_following_top = [f"followtop_{idx}" for idx in range(95)]

### LIWC feature names
feat_liwc = [
    'function (Function Words)', 'pronoun (Pronouns)', 'ppron (Personal Pronouns)',
    'i (I)', 'we (We)', 'you (You)', 'shehe (SheHe)', 'they (They)',
    'ipron (Impersonal Pronouns)', 'article (Articles)', 'prep (Prepositions)',
    'auxverb (Auxiliary Verbs)', 'adverb (Adverbs)', 'conj (Conjunctions)',
    'negate (Negations)', 'verb (Verbs)', 'adj (Adjectives)', 'compare (Comparisons)',
    'interrog (Interrogatives)', 'number (Numbers)', 'quant (Quantifiers)',
    'affect (Affect)', 'posemo (Positive Emotions)', 'negemo (Negative Emotions)',
    'anx (Anx)', 'anger (Anger)', 'sad (Sad)',
    'social (Social)', 'family (Family)', 'friend (Friends)', 'female (Female)', 'male (Male)',
    'cogproc (Cognitive Processes)', 'insight (Insight)', 'cause (Causal)',
    'discrep (Discrepancies)', 'tentat (Tentative)', 'certain (Certainty)',
    'differ (Differentiation)',
    'percept (Perceptual Processes)', 'see (See)', 'hear (Hear)', 'feel (Feel)',
    'bio (Biological Processes)', 'body (Body)', 'health (Health)', 'sexual (Sexual)',
    'ingest (Ingest)',
    'drives (Drives)', 'affiliation (Affiliation)', 'achieve (Achievement)',
    'power (Power)', 'reward (Reward)', 'risk (Risk)',
    'focuspast (Past Focus)', 'focuspresent (Present Focus)', 'focusfuture (Future Focus)',
    'relativ (Relativity)', 'motion (Motion)', 'space (Space)', 'time (Time)',
    'work (Work)', 'leisure (Leisure)', 'home (Home)', 'money (Money)',
    'relig (Religion)', 'death (Death)',
    'informal (Informal Language)', 'swear (Swear)', 'netspeak (Netspeak)',
    'assent (Assent)', 'nonflu (Nonfluencies)', 'filler (Filler Words)',
]

# Feature-set name -> list of column names. The insertion order matters: the
# prediction cells iterate over this dict and write one result row per entry.
features = {
    'stat': ['days_since_join', 'user_favourites_count', 'user_followers_count',
             'user_friends_count', 'user_statuses_count', 'user_verified'],
    'description': feat_description_sbert,
    'following': feat_following_top,
}


def _register(*parts):
    """Store the concatenation of the named feature sets under their '_'-joined name."""
    features['_'.join(parts)] = [col for part in parts for col in features[part]]


# combinations of the profile-based feature sets
_register('stat', 'description')
_register('description', 'following')
_register('stat', 'following')
_register('stat', 'description', 'following')

# content-based feature sets
features.update({
    'media': ['extremeleft', 'left', 'leftcenter', 'center', 'rightcenter', 'right',
              'extremeright', 'very high', 'high', 'mostly factual', 'mixed', 'low',
              'very low', 'questionable source'],
    'NELA': feat_nela,
    'LIWC': feat_liwc,
    'tweets': feat_precovid_tweets_sbert,
})

# pairwise combinations of the content-based feature sets
_register('media', 'NELA')
_register('media', 'LIWC')
_register('media', 'tweets')
_register('NELA', 'LIWC')
_register('NELA', 'tweets')
_register('LIWC', 'tweets')

# three- and four-way combinations of the content-based feature sets
_register('media', 'NELA', 'LIWC')
_register('NELA', 'LIWC', 'tweets')
_register('media', 'LIWC', 'tweets')
_register('media', 'NELA', 'tweets')
_register('media', 'NELA', 'LIWC', 'tweets')

# every feature set together
features['ALL'] = features['stat_description_following'] + features['media_NELA_LIWC_tweets']

## BEST combination for Hateful User Prediction
_register('stat', 'description', 'following', 'media', 'LIWC')
_register('stat', 'description', 'following', 'media', 'tweets')

## BEST combination for High-level Hateful User Prediction
_register('stat', 'following', 'media', 'NELA', 'LIWC')


## Load dataset (features) 

The column `y` is the label for hateful user prediction (Task 1): it indicates whether a user belongs to the hateful or the reference user group. 
`y = 1` is a hateful user and `y = 0` is a reference user. 

The column `y_level` is the label for high-level hateful user prediction (Task 2): it distinguishes high-level from low-level hateful users.
`y_level = 1` is a high-level hateful user, `y_level = 0` is a low-level hateful user, and `y_level = -1` is a reference user, which is excluded from high-level hateful user prediction (Task 2). 


In [None]:
# "user_features_hate_ref_SAMPLE.tsv" is only a sample file with 80 rows,
# provided for reference.
# It is tab-separated, which is the default delimiter of read_table.
df = pd.read_table("user_features_hate_ref_SAMPLE.tsv")
print(df.shape)
df.head()


## Predicting Hateful Users (Task 1)

In [None]:

# Train and evaluate one grid-searched XGBoost model per feature set, using
# the 'y' column as the target, and write one tab-separated result row per
# feature set.
with open("./pred_results_xgboost.tsv", "w") as output:

    header = ["Feature Set", "Macro-F1", "Accuracy", "Flip error-rate", "MAE"]
    output.write("\t".join(header) + "\n")

    for featurename, featurelist in features.items():
        print(featurename, "------")

        X = df[featurelist]
        y = df['y']
        print(X.shape)

        best_model, results = train_and_test_xgboost_gridsearch(X, y, featurename, "xgb")

        # feature-set name followed by the metrics, in the header's order
        row = [featurename] + [str(each_result) for each_result in results]
        output.write("\t".join(row) + "\n")


## Predicting High-level Hateful Users (Task 2) 

In [None]:
# Keep only the rows whose y_level is not -1 (the rows labelled -1 are left
# out of this task).
df_hate = df[df["y_level"] != -1]
print(df_hate["y_level"].value_counts())

# Train and evaluate one grid-searched XGBoost model per feature set, using
# the 'y_level' column as the target, and write one tab-separated result row
# per feature set.
with open("./pred_results_xgboost_level.tsv", "w") as output:

    header = ["Feature Set", "Macro-F1", "Accuracy", "Flip error-rate", "MAE"]
    output.write("\t".join(header) + "\n")

    for featurename, featurelist in features.items():
        print(featurename)

        X = df_hate[featurelist]
        y = df_hate['y_level']
        print(X.shape)

        best_model, results = train_and_test_xgboost_gridsearch(X, y, featurename, "level_xgb")

        # feature-set name followed by the metrics, in the header's order
        row = [featurename] + [str(each_result) for each_result in results]
        output.write("\t".join(row) + "\n")
