In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np

from random import uniform

from tqdm import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Prepare running configuration 

In [None]:
# Run configuration: the platform and topic to analyse, plus the global random seed.
seed = 42
topic = "elections"
social = "twitter"

# Path of the unified conversation-tree dataset for the selected topic and platform.
dataset_filename = f"./{topic}/{social}/trees/{social}_{topic}_all_graphs_unified.parquet"

# Column identifying a thread: 'video_id' when social is "youtube", 'conversation_id' otherwise.
thread_identifier = 'video_id' if social == "youtube" else 'conversation_id'

# Load Data

In [None]:
# Read the unified dataset and keep only the rows that carry a toxicity label
# (rows whose 'is_toxic' value is missing are dropped).
df_data = pd.read_parquet(dataset_filename).dropna(subset=['is_toxic'])

# Quick look at the loaded frame: dimensions, column types and first rows.
print(df_data.shape)
print(df_data.dtypes)
df_data.head()

# Data preprocessing

## Feature Engineering

We add the following features to each conversation:

- Is the root toxic?
- Time elapsed (in seconds) since the previous comment in the same thread
- Percentage of distinct users commenting

### Adding the feature regarding the toxicity of the root

In [None]:
# Is the root toxic?
# Rows with children_index == 1 are treated as thread roots; a root is flagged
# as toxic when its toxicity score is strictly above 0.6.
# .assign() builds a new frame, so the flag is never written onto a filtered
# slice of df_data (the .loc assignment on the slice raised SettingWithCopyWarning).
thread_roots = (
    df_data.loc[df_data.children_index == 1]
    .assign(is_root_toxic=lambda roots: roots.toxicity_score > 0.6)
    .loc[:, [thread_identifier, "is_root_toxic"]]
)

# Attach the root flag to every row of the same thread. The inner join drops
# rows whose thread has no root row in df_data.
# NOTE(review): if a thread has more than one row with children_index == 1, this
# merge duplicates that thread's rows -- confirm roots are unique per thread.
df_data = df_data.merge(thread_roots, how='inner', on=thread_identifier, suffixes=(None, "_y"))

### Adding the time elapsed (in seconds) since the previous comment in the thread

In [None]:
# Parse the creation timestamps once and store them back on the frame.
created_at = pd.to_datetime(df_data['created_at'])
df_data['created_at'] = created_at

# Seconds between each row and the preceding row of the same thread; the first
# row of every thread has no predecessor and gets 0.
# NOTE(review): the difference follows the current row order -- presumably rows
# are already chronological within each thread; confirm.
seconds_since_previous = created_at.groupby(df_data[thread_identifier]).diff().dt.total_seconds()
df_data['last_comments_diff_seconds'] = seconds_since_previous.fillna(0)

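We keep only the tweets with a confident toxicity score (at most 0.2 or at least 0.6) and balance the toxic and non-toxic classes by randomly resampling the majority class down to the size of the minority class.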

In [None]:
# Keep only comments with a confident toxicity score (<= 0.2 or >= 0.6);
# scores strictly between the two thresholds are discarded.
df_data_to_evaluate = df_data[(df_data["toxicity_score"] <= 0.2) | (df_data["toxicity_score"] >= 0.6)]

df_toxic_tweets = df_data_to_evaluate[df_data_to_evaluate['is_toxic'] == True]
df_non_toxic_tweets = df_data_to_evaluate[df_data_to_evaluate['is_toxic'] == False]

number_of_toxic_tweets = df_toxic_tweets.shape[0]
number_of_non_toxic_tweets = df_non_toxic_tweets.shape[0]

downsample_size = min(number_of_non_toxic_tweets, number_of_toxic_tweets)
minority_class = np.argmin([number_of_non_toxic_tweets, number_of_toxic_tweets])

if minority_class == 0:  # non toxic is the minority class (also chosen on a tie)
    df_minority, df_majority = df_non_toxic_tweets, df_toxic_tweets
else:  # toxic is the minority class
    df_minority, df_majority = df_toxic_tweets, df_non_toxic_tweets

# Downsample the majority class WITHOUT replacement. sklearn's resample defaults
# to replace=True (bootstrap), which would put duplicated rows in the balanced
# sample; duplicates can then fall on both sides of the train/test split.
df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=downsample_size,
                                   random_state=seed)

# Minority rows first, then the downsampled majority rows. Concatenating only
# the two real frames (no empty seed frame) keeps the original column dtypes.
df_paired_tweets = pd.concat([df_minority, df_majority_downsampled], ignore_index=True)

df_paired_tweets.is_toxic.value_counts()
    

In [None]:
# Split the balanced sample into three bins by children_index:
# [10, 100], (100, 1000] and (1000, 10000].
df_bin_10_100 = df_paired_tweets.loc[
    lambda frame: (frame.children_index >= 10) & (frame.children_index <= 100)
]
df_bin_100_1000 = df_paired_tweets.loc[
    lambda frame: (frame.children_index > 100) & (frame.children_index <= 1000)
]
df_bin_1000_10000 = df_paired_tweets.loc[
    lambda frame: (frame.children_index > 1000) & (frame.children_index <= 10000)
]

## Train/Test Split

# Create all-features model

In [None]:
def create_dataset(dataframe : pd.DataFrame, feature_labels, target_label):
    """Build the feature matrix and the integer-encoded target vector.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame holding both the feature columns and the target column.
    feature_labels
        Column labels selected as features.
    target_label
        Label of the target column.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is ``dataframe[feature_labels]`` converted to a NumPy
        array, ``y`` is the target column passed through
        ``LabelEncoder().fit_transform``.
    """
    features = dataframe[feature_labels].to_numpy()
    encoded_target = LabelEncoder().fit_transform(dataframe[target_label].to_numpy())
    return features, encoded_target

In [None]:
# Columns used as model inputs.
feature_labels = [
    'tree_size',
    'max_width',
    'max_depth',
    'number_of_unique_users',
    'toxicity_ratio',
    'assortativity',
    'avg_toxicity_distance',
    'wiener_index',
    'is_root_toxic',
    'last_comments_diff_seconds',
]

# Column the classifier predicts.
target_label = "is_toxic"

evaluation_metrics = ["accuracy", "roc_auc", "f1", "precision", "recall"]

number_of_folds = 10
# `seed` is deliberately NOT re-assigned here: the value set in the run
# configuration cell is the single source of truth, and re-assigning it in this
# cell would silently override any change made there.

# Preprocessing + classifier: mean-impute missing values, standardise the
# features, then fit a random forest.
RF_pipe = Pipeline([
    ("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
    ("std", StandardScaler(copy=True, with_mean=True, with_std=True)),
    ("clf", RandomForestClassifier(random_state=seed)),
])

# Shuffled, stratified folds used for model selection.
RF_stratified_k_fold = StratifiedKFold(n_splits=number_of_folds,
                                       shuffle=True,
                                       random_state=seed)

# Hyper-parameter grid for the "clf" step of the pipeline (4 x 2 combinations).
RF_grid = {
    "clf__n_estimators": [10, 50, 100, 1000],
    'clf__criterion': ['gini', 'entropy'],
}

# Exhaustive grid search scored by accuracy; with refit=True the best
# configuration is re-fitted on the data passed to fit().
RF_CV = GridSearchCV(
    estimator=RF_pipe,
    param_grid=RF_grid,
    cv=RF_stratified_k_fold,
    scoring="accuracy",
    refit=True,
)

### Train and evaluate the classifier on each of the three bins
Together, the three bins span the whole `children_index` range considered (10 to 10,000).

In [None]:
# Fit the grid-searched random forest on each children_index bin and report
# the metrics obtained on a held-out, stratified 20% test split.
bins = [df_bin_10_100, df_bin_100_1000, df_bin_1000_10000]
min_bin = 10

print(f"{social} {topic} Classification Results")
for df_bin in bins:  # `df_bin` rather than `bin`, which would shadow the builtin
    max_bin = min_bin * 10

    # Only the first bin includes its lower bound.
    if min_bin == 10:
        print(f"Bin [{min_bin}, {max_bin}]")
    else:
        print(f"Bin ({min_bin}, {max_bin}]")
    min_bin = max_bin  # lower bound of the next bin

    # A stratified split needs both classes; an empty or single-class bin
    # would otherwise raise inside train_test_split.
    if df_bin[target_label].nunique() < 2:
        print("Skipped: the bin does not contain both classes")
        print()
        continue

    X, Y = create_dataset(df_bin, feature_labels, target_label)

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                        random_state=seed,
                                                        test_size=0.20,
                                                        shuffle=True,
                                                        stratify=Y)

    print("Train size: " + str(X_train.shape[0]))
    print("Test size: " + str(X_test.shape[0]))

    RF_CV = RF_CV.fit(X_train, Y_train)

    Y_pred = RF_CV.predict(X_test)
    # ROC AUC needs a continuous score: use the predicted probability of the
    # positive class (encoded label 1) instead of the hard 0/1 predictions.
    Y_score = RF_CV.predict_proba(X_test)[:, 1]

    print()
    print('Test Accuracy: %.3f' % accuracy_score(Y_test, Y_pred))
    print('Test Precision: %.3f' % precision_score(Y_test, Y_pred))
    print('Test Recall: %.3f' % recall_score(Y_test, Y_pred))
    print('Test F1: %.3f' % f1_score(Y_test, Y_pred))
    print('Test AUC: %.3f' % roc_auc_score(Y_test, Y_score))
    # Inside a loop a bare expression is never displayed, so print the matrix.
    print(confusion_matrix(Y_test, Y_pred))
    print()

### Overall

In [None]:
# Fit the grid-searched random forest on the whole balanced sample and report
# the metrics obtained on a held-out, stratified 20% test split.
X, Y = create_dataset(df_paired_tweets, feature_labels, target_label)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    random_state=seed,
                                                    test_size=0.20,
                                                    shuffle=True,
                                                    stratify=Y)
print("Train size: " + str(X_train.shape[0]))
print("Test size: " + str(X_test.shape[0]))

RF_CV = RF_CV.fit(X_train, Y_train)
Y_pred = RF_CV.predict(X_test)
# ROC AUC needs a continuous score: use the predicted probability of the
# positive class (encoded label 1) instead of the hard 0/1 predictions.
Y_score = RF_CV.predict_proba(X_test)[:, 1]

print()
print('Test Accuracy: %.3f' % accuracy_score(Y_test, Y_pred))
print('Test Precision: %.3f' % precision_score(Y_test, Y_pred))
print('Test Recall: %.3f' % recall_score(Y_test, Y_pred))
print('Test F1: %.3f' % f1_score(Y_test, Y_pred))
print('Test AUC: %.3f' % roc_auc_score(Y_test, Y_score))
# The matrix is not the last expression of the cell, so it must be printed
# explicitly to appear in the output.
print(confusion_matrix(Y_test, Y_pred))
print()