In [164]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import resample
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance

import pandas as pd
import numpy as np

from random import uniform

from tqdm import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Prepare running configuration 

In [165]:
social = "twitter"
topic = "football"
adversarial_topic = "elections"

seed = 42

dataset_filename = f"/media/gabett/Volume/data-repository/panconesi-football-elections/{topic}/{social}/trees/{social}_{topic}_all_graphs_unified.parquet"

if social == "youtube":
    thread_identifier = 'video_id'
else:
    thread_identifier = 'conversation_id'

# Load Data

In [166]:
df_data = pd.read_parquet(dataset_filename)

# We remove all rows without a toxic label
df_data = df_data[df_data['is_toxic'].notna()]
print(df_data.shape)
print(df_data.dtypes)
df_data.head()

(930129, 18)
children_index                          int32
conversation_id                       float64
id                                     object
parent_id                              object
created_at                datetime64[ns, UTC]
root                                   object
toxicity_score                        float64
tree_size                             float64
max_width                             float64
max_depth                             float64
number_of_unique_users                float64
toxicity_ratio                        float64
assortativity                         float64
avg_toxicity_distance                 float64
wiener_index                          float64
is_toxic                               object
social                                 object
topic                                  object
dtype: object


Unnamed: 0,children_index,conversation_id,id,parent_id,created_at,root,toxicity_score,tree_size,max_width,max_depth,number_of_unique_users,toxicity_ratio,assortativity,avg_toxicity_distance,wiener_index,is_toxic,social,topic
0,1,1.563036e+18,1563037932386078720,1.563036027912e+18,2022-08-26 05:37:37+00:00,1.563036027912e+18,0.004869,2.0,1.0,1.0,2.0,0.0,,,1.0,False,twitter,football
1,1,1.563077e+18,1563077671764250624,1.563076923966e+18,2022-08-26 08:15:32+00:00,1.563076923966e+18,0.008733,2.0,1.0,1.0,2.0,0.0,,,1.0,False,twitter,football
2,2,1.563077e+18,1563082289977835520,1.563076923966e+18,2022-08-26 08:33:53+00:00,1.563076923966e+18,0.22579,3.0,2.0,1.0,3.0,0.0,,,1.0,False,twitter,football
3,1,1.563083e+18,1563084637131251712,1.563082790375e+18,2022-08-26 08:43:12+00:00,1.563082790375e+18,0.211073,2.0,1.0,1.0,2.0,0.0,,,1.0,False,twitter,football
4,1,1.563146e+18,1563146712042323968,1.56314636959e+18,2022-08-26 12:49:52+00:00,1.56314636959e+18,0.058985,2.0,1.0,1.0,2.0,0.0,,,1.0,False,twitter,football


# Data preprocessing

## Feature Engineering

We add the following features to each conversation:

- Is the root toxic?
- Distance (in seconds) from the last comment
- Percentage of distinct users commenting

### Adding the feature regarding the toxicity of the root

In [167]:
# Is the root toxic?
thread_roots = df_data[df_data.children_index == 1]
thread_roots.loc[:, "is_root_toxic"] = thread_roots.toxicity_score > 0.6
thread_roots = thread_roots[[thread_identifier, "is_root_toxic"]]

df_data = df_data.merge(thread_roots, 'inner', left_on= thread_identifier, right_on = thread_identifier,  suffixes= (None, "_y"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  thread_roots.loc[:, "is_root_toxic"] = thread_roots.toxicity_score > 0.6


### Adding the feature regarding the distance in seconds from the last comments

In [168]:
df_data['created_at'] = pd.to_datetime(df_data['created_at'])
df_data['last_comments_diff_seconds'] = df_data.groupby(thread_identifier)['created_at'].diff().dt.total_seconds().fillna(0)

For each conversation, we extract the one pair of toxic and non toxic tweet

In [169]:
df_paired_tweets = pd.DataFrame({}, 
                                columns=df_data.columns)

df_data_to_evaluate = df_data[(df_data["toxicity_score"] <= 0.2) | (df_data["toxicity_score"] >= 0.6)]
number_of_toxic_tweets = df_data_to_evaluate[df_data_to_evaluate['is_toxic'] == True].shape[0]
number_of_non_toxic_tweets = df_data_to_evaluate[df_data_to_evaluate['is_toxic'] == False].shape[0]

downsample_size = min(number_of_non_toxic_tweets, number_of_toxic_tweets)
minority_class = np.argmin([number_of_non_toxic_tweets, number_of_toxic_tweets])

if minority_class == 0: # non toxic
    df_toxic_tweets_resampled = resample(df_data_to_evaluate[df_data_to_evaluate['is_toxic'] == True],
                                         n_samples = number_of_non_toxic_tweets,
                                         random_state=seed)
    
    
    df_paired_tweets = pd.concat([df_paired_tweets,
                                  pd.concat([df_data_to_evaluate[df_data_to_evaluate['is_toxic'] == False], df_toxic_tweets_resampled], ignore_index=True)],
                                  ignore_index=True)
else:
    df_non_toxic_tweets_resampled = resample(df_data_to_evaluate[df_data_to_evaluate['is_toxic'] == False],
                                             n_samples = number_of_toxic_tweets,
                                             random_state=seed)
    
    
    df_paired_tweets = pd.concat([df_paired_tweets, 
                                  pd.concat([df_data_to_evaluate[df_data_to_evaluate['is_toxic'] == True], df_non_toxic_tweets_resampled], ignore_index=True)], 
                                  ignore_index=True)
    
df_paired_tweets.is_toxic.value_counts()
    

True     26356
False    26356
Name: is_toxic, dtype: int64

In [170]:
df_bin_10_100 = df_paired_tweets.query("children_index >= 10 & children_index <= 100")
df_bin_100_1000 = df_paired_tweets.query("children_index > 100 & children_index <= 1000")
df_bin_1000_10000 = df_paired_tweets.query("children_index > 1000 & children_index <= 10000")

## Train/Test Split

# Create all-features model

In [171]:
def create_dataset(dataframe : pd.DataFrame, feature_labels, target_label):
    
    X = dataframe[feature_labels].to_numpy()
    
    y = dataframe[target_label].to_numpy()
    y = LabelEncoder().fit_transform(y)
    
    return X, y

In [172]:
feature_labels =  [
 'tree_size',
 'max_width',
 'max_depth',
 'number_of_unique_users',
 'toxicity_ratio',
 'assortativity',
 'avg_toxicity_distance',
 'wiener_index',
 'is_root_toxic',
 'last_comments_diff_seconds']

target_label = "is_toxic"

evaluation_metrics = ["accuracy", "roc_auc", "f1", "precision", "recall"]

number_of_folds = 10
seed = 42

GBRT_pipe = Pipeline([
    ("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
    ("std", StandardScaler(copy=True, with_mean=True, with_std=True)),
    ("clf", LGBMClassifier(random_state = seed))
])

GBRT_stratified_k_fold = StratifiedKFold(n_splits = number_of_folds, 
                                       shuffle = True, 
                                       random_state = seed)

GBRT_grid = {
    "clf__n_estimators": [10, 50, 100, 1000]
}

GBRT_CV = GridSearchCV(
    estimator=GBRT_pipe, 
    param_grid=GBRT_grid, 
    cv=GBRT_stratified_k_fold,
    scoring= "accuracy",
    refit=True,
)

### Extract pairs of toxic/non toxic nodes from each conversation in the three bins
Moreover, we will be careful to cover the entire children index spectrum 

In [173]:
bins = [df_bin_10_100, df_bin_100_1000, df_bin_1000_10000]
# bins_to_test = [bin_10_100_test, bin_100_1000_test, bin_1000_10000_test]
min_bin = 10

df_complete_importances = pd.DataFrame()
print(f"{social} {topic} Classification Results")
for bin in bins:
    i = 0
    max_bin = min_bin * 10

    if min_bin == 10:
        print(f"Bin [{min_bin}, {max_bin}]")
    else:
        print(f"Bin ({min_bin}, {max_bin}]")   
    
    X, Y = create_dataset(bin, feature_labels, target_label)

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                        random_state=seed, 
                                                        test_size = 0.20, 
                                                        shuffle = True,
                                                        stratify=Y)

    print("Train size: " + str(X_train.shape[0]))
    print("Test size: " + str(X_test.shape[0]))

    GBRT_CV = GBRT_CV.fit(X_train, Y_train)

    Y_pred = GBRT_CV.predict(X_test)

    print()
    print('Test Accuracy: %3f' % accuracy_score(Y_test, Y_pred))
    print('Test Precision: %3f' % precision_score(Y_test, Y_pred))
    print('Test Recall: %3f' % recall_score(Y_test, Y_pred))
    print('Test F1: %3f' % f1_score(Y_test, Y_pred))
    print('Test AUC: %3f' % roc_auc_score(Y_test, Y_pred))
    confusion_matrix(Y_test, Y_pred)
    print()

    print(" Results from Grid Search " )
    print("\n The best estimator across ALL searched params:\n",GBRT_CV.best_estimator_)
    print("\n The best score across ALL searched params:\n",GBRT_CV.best_score_)
    print("\n The best parameters across ALL searched params:\n",GBRT_CV.best_params_)


    feature_importance = GBRT_CV.best_estimator_["clf"].feature_importances_
    sorted_idx = np.argsort(feature_importance)
    result = permutation_importance(
        GBRT_CV, X_test, Y_test, n_repeats=10, random_state=42, n_jobs=2
    )
    labels = np.array(feature_labels)[sorted_idx]

    df_bin_importance = pd.DataFrame(result.importances[sorted_idx].T, columns = labels)
    df_bin_importance["social"] = social
    df_bin_importance["interval"] = f"Bin [{min_bin}, {max_bin}]"
    
    df_complete_importances = pd.concat([df_complete_importances, df_bin_importance])
    min_bin *= 10
    i = i + 1

twitter football Classification Results
Bin [10, 100]
Train size: 17201
Test size: 4301

Test Accuracy: 0.848175
Test Precision: 0.776016
Test Recall: 0.949279
Test F1: 0.853948
Test AUC: 0.854334

 Results from Grid Search 

 The best estimator across ALL searched params:
 Pipeline(steps=[('imputer', SimpleImputer()), ('std', StandardScaler()),
                ('clf', LGBMClassifier(n_estimators=50, random_state=42))])

 The best score across ALL searched params:
 0.8400679702174235

 The best parameters across ALL searched params:
 {'clf__n_estimators': 50}
Bin (100, 1000]
Train size: 12801
Test size: 3201

Test Accuracy: 0.728835
Test Precision: 0.725869
Test Recall: 0.880791
Test F1: 0.795861
Test AUC: 0.690786

 Results from Grid Search 

 The best estimator across ALL searched params:
 Pipeline(steps=[('imputer', SimpleImputer()), ('std', StandardScaler()),
                ('clf', LGBMClassifier(random_state=42))])

 The best score across ALL searched params:
 0.7208036324160811


### Overall

In [174]:
X, Y = create_dataset(df_paired_tweets, feature_labels, target_label)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    random_state=seed, 
                                                    test_size=0.20, 
                                                    shuffle = True,
                                                    stratify=Y)
print("Train size: " + str(X_train.shape[0]))
print("Test size: " + str(X_test.shape[0]))
GBRT_CV = GBRT_CV.fit(X_train, Y_train)
Y_pred = GBRT_CV.predict(X_test)

print()
print('Test Accuracy: %3f' % accuracy_score(Y_test, Y_pred))
print('Test Precision: %3f' % precision_score(Y_test, Y_pred))
print('Test Recall: %3f' % recall_score(Y_test, Y_pred))
print('Test F1: %3f' % f1_score(Y_test, Y_pred))
print('Test AUC: %3f' % roc_auc_score(Y_test, Y_pred))

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",GBRT_CV.best_estimator_)
print("\n The best score across ALL searched params:\n",GBRT_CV.best_score_)
print("\n The best parameters across ALL searched params:\n",GBRT_CV.best_params_)

feature_importance = GBRT_CV.best_estimator_["clf"].feature_importances_
sorted_idx = np.argsort(feature_importance)
result = permutation_importance(
    GBRT_CV, X_test, Y_test, n_repeats=10, random_state=42, n_jobs=2
)
labels = np.array(feature_labels)[sorted_idx]

df_bin_importance = pd.DataFrame(result.importances[sorted_idx].T, columns = labels)
df_bin_importance["social"] = social
df_bin_importance["interval"] = "overall"
df_complete_importances = pd.concat([df_complete_importances, df_bin_importance])

confusion_matrix(Y_test, Y_pred)
print()

Train size: 42169
Test size: 10543

Test Accuracy: 0.836669
Test Precision: 0.781087
Test Recall: 0.935496
Test F1: 0.851347
Test AUC: 0.836678
 Results from Grid Search 

 The best estimator across ALL searched params:
 Pipeline(steps=[('imputer', SimpleImputer()), ('std', StandardScaler()),
                ('clf', LGBMClassifier(n_estimators=50, random_state=42))])

 The best score across ALL searched params:
 0.8331239293471487

 The best parameters across ALL searched params:
 {'clf__n_estimators': 50}



In [175]:
output_filename = f"/media/gabett/Volume/data-repository/panconesi-football-elections/ML/{social}_{topic}_feature_importances.csv"
df_complete_importances.to_csv(output_filename)

# Testing against other topic

## Load data

In [1]:
dataset_filename = f"/media/gabett/Volume/data-repository/panconesi-football-elections/{adversarial_topic}/{social}/trees/{social}_{adversarial_topic}_all_graphs_unified.parquet"

if social == "youtube":
    thread_identifier = 'video_id'
else:
    thread_identifier = 'conversation_id'

df_data = pd.read_parquet(dataset_filename)

# We remove all rows without a toxic label
df_data = df_data[df_data['is_toxic'].notna()]
print(df_data.shape)
print(df_data.dtypes)

df_data.head()

# Is the root toxic?
thread_roots = df_data[df_data.children_index == 1]
thread_roots.loc[:, "is_root_toxic"] = thread_roots.toxicity_score > 0.6
thread_roots = thread_roots[[thread_identifier, "is_root_toxic"]]

df_data = df_data.merge(thread_roots, 'inner', left_on= thread_identifier, right_on = thread_identifier,  suffixes= (None, "_y"))
df_data['created_at'] = pd.to_datetime(df_data['created_at'])
df_data['last_comments_diff_seconds'] = df_data.groupby(thread_identifier)['created_at'].diff().dt.total_seconds().fillna(0)

df_paired_tweets = pd.DataFrame({}, 
                                columns=df_data.columns)

df_data_to_evaluate = df_data[(df_data["toxicity_score"] <= 0.2) | (df_data["toxicity_score"] >= 0.6)]
number_of_toxic_tweets = df_data_to_evaluate[df_data_to_evaluate['is_toxic'] == True].shape[0]
number_of_non_toxic_tweets = df_data_to_evaluate[df_data_to_evaluate['is_toxic'] == False].shape[0]

downsample_size = min(number_of_non_toxic_tweets, number_of_toxic_tweets)
minority_class = np.argmin([number_of_non_toxic_tweets, number_of_toxic_tweets])

if minority_class == 0: # non toxic
    df_toxic_tweets_resampled = resample(df_data_to_evaluate[df_data_to_evaluate['is_toxic'] == True],
                                         n_samples = number_of_non_toxic_tweets,
                                         random_state=seed)
    
    
    df_paired_tweets = pd.concat([df_paired_tweets,
                                  pd.concat([df_data_to_evaluate[df_data_to_evaluate['is_toxic'] == False], df_toxic_tweets_resampled], ignore_index=True)],
                                  ignore_index=True)
else:
    df_non_toxic_tweets_resampled = resample(df_data_to_evaluate[df_data_to_evaluate['is_toxic'] == False],
                                             n_samples = number_of_toxic_tweets,
                                             random_state=seed)
    
    
    df_paired_tweets = pd.concat([df_paired_tweets, 
                                  pd.concat([df_data_to_evaluate[df_data_to_evaluate['is_toxic'] == True], df_non_toxic_tweets_resampled], ignore_index=True)], 
                                  ignore_index=True)
    
df_paired_tweets.is_toxic.value_counts()

df_bin_10_100 = df_paired_tweets.query("children_index >= 10 & children_index <= 100")
df_bin_100_1000 = df_paired_tweets.query("children_index > 100 & children_index <= 1000")
df_bin_1000_10000 = df_paired_tweets.query("children_index > 1000 & children_index <= 10000")

NameError: name 'adversarial_topic' is not defined

## Testing against adversarial topic test set

In [None]:
bins = [df_bin_10_100, df_bin_100_1000, df_bin_1000_10000]
# bins_to_test = [bin_10_100_test, bin_100_1000_test, bin_1000_10000_test]
min_bin = 10


print(f"{social} {adversarial_topic} Classification Results")
for bin in bins:
    i = 0
    max_bin = min_bin * 10

    if min_bin == 10:
        print(f"Bin [{min_bin}, {max_bin}]")
    else:
        print(f"Bin ({min_bin}, {max_bin}]")   
    
    X, Y = create_dataset(bin, feature_labels, target_label)

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                        random_state=seed, 
                                                        test_size = 0.20, 
                                                        shuffle = True,
                                                        stratify=Y)

    print("Test size: " + str(X_test.shape[0]))

    Y_pred = GBRT_CV.predict(X_test)

    print()
    print('Test Accuracy: %3f' % accuracy_score(Y_test, Y_pred))
    print('Test Precision: %3f' % precision_score(Y_test, Y_pred))
    print('Test Recall: %3f' % recall_score(Y_test, Y_pred))
    print('Test F1: %3f' % f1_score(Y_test, Y_pred))
    print('Test AUC: %3f' % roc_auc_score(Y_test, Y_pred))
    confusion_matrix(Y_test, Y_pred)
    print()

    print(" Results from Grid Search " )
    print("\n The best estimator across ALL searched params:\n",GBRT_CV.best_estimator_)
    print("\n The best score across ALL searched params:\n",GBRT_CV.best_score_)
    print("\n The best parameters across ALL searched params:\n",GBRT_CV.best_params_)

    min_bin *= 10
    i = i + 1

youtube elections Classification Results
Bin [10, 100]
Test size: 13596

Test Accuracy: 0.729185
Test Precision: 0.634314
Test Recall: 0.981913
Test F1: 0.770735
Test AUC: 0.746339

 Results from Grid Search 

 The best estimator across ALL searched params:
 Pipeline(steps=[('imputer', SimpleImputer()), ('std', StandardScaler()),
                ('clf', LGBMClassifier(n_estimators=50, random_state=42))])

 The best score across ALL searched params:
 0.8122439758723367

 The best parameters across ALL searched params:
 {'clf__n_estimators': 50}
Bin (100, 1000]
Test size: 22253

Test Accuracy: 0.626343
Test Precision: 0.575355
Test Recall: 0.965043
Test F1: 0.720908
Test AUC: 0.626297

 Results from Grid Search 

 The best estimator across ALL searched params:
 Pipeline(steps=[('imputer', SimpleImputer()), ('std', StandardScaler()),
                ('clf', LGBMClassifier(n_estimators=50, random_state=42))])

 The best score across ALL searched params:
 0.8122439758723367

 The best param

In [None]:
X, Y = create_dataset(df_paired_tweets, feature_labels, target_label)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    random_state=seed, 
                                                    test_size=0.20, 
                                                    shuffle = True,
                                                    stratify=Y)

print("Test size: " + str(X_test.shape[0]))

Y_pred = GBRT_CV.predict(X_test)

print()
print('Test Accuracy: %3f' % accuracy_score(Y_test, Y_pred))
print('Test Precision: %3f' % precision_score(Y_test, Y_pred))
print('Test Recall: %3f' % recall_score(Y_test, Y_pred))
print('Test F1: %3f' % f1_score(Y_test, Y_pred))
print('Test AUC: %3f' % roc_auc_score(Y_test, Y_pred))

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",GBRT_CV.best_estimator_)
print("\n The best score across ALL searched params:\n",GBRT_CV.best_score_)
print("\n The best parameters across ALL searched params:\n",GBRT_CV.best_params_)

confusion_matrix(Y_test, Y_pred)
print()

Test size: 44594

Test Accuracy: 0.681302
Test Precision: 0.614301
Test Recall: 0.974391
Test F1: 0.753538
Test AUC: 0.681302
 Results from Grid Search 

 The best estimator across ALL searched params:
 Pipeline(steps=[('imputer', SimpleImputer()), ('std', StandardScaler()),
                ('clf', LGBMClassifier(n_estimators=50, random_state=42))])

 The best score across ALL searched params:
 0.8122439758723367

 The best parameters across ALL searched params:
 {'clf__n_estimators': 50}

