### **SETUP**

In [99]:
!pip install emoji



In [211]:
import pandas as pd
import numpy as np
import nltk
import json
import torch
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder

from keras import optimizers, losses, activations, models
from keras.callbacks import ModelCheckpoint, EarlyStopping, LearningRateScheduler, ReduceLROnPlateau
from keras.layers import Dense, Input, Dropout, Convolution1D, MaxPool1D, GlobalMaxPool1D, GlobalAveragePooling1D, concatenate
from sklearn.metrics import f1_score, accuracy_score

import tensorflow as tf
from keras import Input, Model, layers
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras import losses
from tensorflow import keras
from keras.callbacks import EarlyStopping

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras import regularizers
from keras.utils import np_utils
from keras import utils
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import re

import emoji

import os

import datetime
from datetime import timezone

In [101]:
# Mount Google Drive (Colab only) so that files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [102]:
# Path template of the dataset JSON files; fill the placeholder with the split
# name, e.g. PATH.format('train').
PATH = '/content/drive/MyDrive/GESS/Twibot-20/{}.json'
# Reference date string. NOTE(review): not referenced by any of the cells
# visible in this part of the notebook -- confirm whether it is still needed.
DATE_REF = "2021-01-01"

### **READ AND PRE-PROCESS THE DATA**

In [6]:
def strip_emoji(text):
    """Return ``text`` with every emoji character removed.

    Works with both old and new releases of the ``emoji`` package:
    ``replace_emoji`` is used when the installed version provides it, and the
    legacy ``get_emoji_regexp`` helper (removed in emoji 2.0) is the fallback.
    """
    if hasattr(emoji, "replace_emoji"):
        return emoji.replace_emoji(text, replace="")
    # legacy path for emoji releases that predate replace_emoji
    return re.sub(emoji.get_emoji_regexp(), r"", text)
def remove_links(text):
    """Return ``text`` with every ``http...`` token (up to the next whitespace) removed."""
    link_pattern = re.compile(r'http\S+')
    return link_pattern.sub('', text)

In [254]:
def extract_following(neighbors: dict):
    """Return the 'following' entry of a neighbor dict.

    NaN is returned when ``neighbors`` is None or has no 'following' key.
    """
    return np.nan if neighbors is None else neighbors.get('following', np.nan)

def extract_followers(neighbors: dict):
    """Return the 'follower' entry of a neighbor dict.

    NaN is returned when ``neighbors`` is None or has no 'follower' key.
    """
    return np.nan if neighbors is None else neighbors.get('follower', np.nan)
    

def _min_max_scale(values):
    """Rescale a 1-D array-like to [0, 1] with a freshly fitted MinMaxScaler."""
    column = np.asarray(values).reshape(-1, 1)
    return preprocessing.MinMaxScaler().fit_transform(column).ravel()


def build_df(filename: str):
    """Load one dataset split and return a tweet-level feature DataFrame.

    The JSON file at ``PATH.format(filename)`` is read, the nested ``profile``
    records are flattened into columns, the numeric profile fields and the
    account creation time are min-max scaled to [0, 1], ``location`` is label
    encoded into ``location_enc`` and the ``tweet`` list is exploded so that
    the result holds one row per (user, tweet) pair.

    NOTE(review): the scalers and the label encoder are fitted inside this
    function, i.e. independently for every split it is called on. Scaled
    values and ``location_enc`` codes are therefore not comparable between
    the train and test frames.

    Parameters
    ----------
    filename : str
        Name of the split, inserted into ``PATH`` (e.g. ``'train'``).

    Returns
    -------
    pandas.DataFrame
        Columns: id, screen_name, location, followers_count, friends_count,
        statuses_count, created_at, tweet, label, location_enc.
    """
    with open(PATH.format(filename)) as f:
        data = json.load(f)

    df = pd.DataFrame.from_dict(data)
    df_profile = pd.DataFrame.from_dict(df['profile'].values.tolist())
    df = pd.concat([df, df_profile], axis=1)

    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the concatenated frame
    df = df[[
        'id', 'screen_name', 'location', 'followers_count',
        'friends_count', 'statuses_count', 'created_at',
        'tweet', 'label'
    ]].copy()

    for col in ('id', 'screen_name', 'location'):
        df[col] = df[col].astype(str).str.strip()

    # scale the follower, friends and statuses counts between 0 and 1
    for col in ('followers_count', 'friends_count', 'statuses_count'):
        df[col] = _min_max_scale(df[col].astype(int))

    # empty locations become NaN before being label encoded
    df.loc[df['location'] == "", 'location'] = np.nan
    le = LabelEncoder()
    le.fit(df['location'].unique())
    df['location_enc'] = le.transform(df['location'])

    # compute the number of seconds elapsed between 1 January 2005 and the
    # moment the account was created, then scale these values between 0 and 1
    reference = datetime.datetime(2005, 1, 1, tzinfo=timezone.utc)
    created = pd.to_datetime(df['created_at'], utc=True)
    df['created_at'] = _min_max_scale((created - reference).dt.total_seconds())

    # one row per tweet
    df = df.explode('tweet').reset_index(drop=True)
    df['tweet'] = df['tweet'].astype(str).str.strip()
    df['label'] = df['label'].astype(int)

    print(df.shape)
    return df

In [255]:
# build the training and test sets from their JSON files
# (each build_df call prints the shape of the frame it returns)
df_train = build_df('train')
df_test = build_df('test')

(1398465, 10)
(199863, 10)


In [257]:
# preview the first rows of the training frame (one row per tweet after the explode)
df_train.head()

Unnamed: 0,id,screen_name,location,followers_count,friends_count,statuses_count,created_at,tweet,label,location_enc
0,17461978,SHAQ,"Orlando, FL",0.136558,0.000159,0.001906,0.184182,RT @CarnivalCruise: 🎉 Are you ready to see wha...,0,1872
1,17461978,SHAQ,"Orlando, FL",0.136558,0.000159,0.001906,0.184182,Who has time for receipts? Not me. @epson rece...,0,1872
2,17461978,SHAQ,"Orlando, FL",0.136558,0.000159,0.001906,0.184182,Steady wants to encourage you to invest in you...,0,1872
3,17461978,SHAQ,"Orlando, FL",0.136558,0.000159,0.001906,0.184182,"Good one, @rishid. But let’s see if y'all can ...",0,1872
4,17461978,SHAQ,"Orlando, FL",0.136558,0.000159,0.001906,0.184182,#lsunationalchamps,0,1872


In [258]:
# Training set: remove the text/identifier columns that are not used as model
# inputs, then collapse the repeated per-tweet rows into unique feature rows.
df_train_users = (
    df_train
    .drop(columns=['tweet', 'location', 'screen_name', 'id'])
    .drop_duplicates()
    .reset_index(drop=True)
)
print(df_train_users.shape)
df_train_users.head()

(8278, 6)


Unnamed: 0,followers_count,friends_count,statuses_count,created_at,label,location_enc
0,0.136558,0.000159,0.001906,0.184182,0,1872
1,0.0,1e-05,0.0,0.997725,1,3130
2,0.006787,0.000109,0.001074,0.185952,0,876
3,0.002914,0.001106,0.037524,0.164524,0,2440
4,0.000119,0.000149,2e-05,0.510992,1,1185


In [259]:
# Test set: remove the text/identifier columns that are not used as model
# inputs, then collapse the repeated per-tweet rows into unique feature rows.
df_test_users = (
    df_test
    .drop(columns=['tweet', 'location', 'screen_name', 'id'])
    .drop_duplicates()
    .reset_index(drop=True)
)
print(df_test_users.shape)
df_test_users.head()

(1183, 6)


Unnamed: 0,followers_count,friends_count,statuses_count,created_at,label,location_enc
0,0.0001900731,0.021061,0.05787,0.937419,1,283
1,1.0,6.2e-05,0.004151,0.248942,0,479
2,0.001853396,0.000449,0.085817,0.244743,0,330
3,1.030765e-07,0.000675,2e-06,0.997052,1,409
4,0.326558,0.001052,0.013893,0.456113,0,356


In [260]:
# Convert the user-level frames to numpy: X holds every column except the
# label, y holds the label column.
X_train = df_train_users.drop(columns=['label']).to_numpy()
y_train = df_train_users["label"].to_numpy()

X_test = df_test_users.drop(columns=['label']).to_numpy()
y_test = df_test_users["label"].to_numpy()

print(X_train.shape)
print(X_test.shape)

(8278, 5)
(1183, 5)


### **MODELS**

In [269]:
def predict_and_evaluate(model, y_test=None, X_test=None):
  """Score a fitted model on the test split.

  Parameters
  ----------
  model : object exposing ``predict(X)``
      A fitted classifier.
  y_test : array-like, optional
      Ground-truth labels. When omitted, the notebook-level ``y_test`` is used,
      so the one-argument calls ``predict_and_evaluate(model)`` keep working.
  X_test : array-like, optional
      Test features. When omitted, the notebook-level ``X_test`` is used.

  Returns
  -------
  (f1, acc) : tuple of float
      Macro-averaged F1 score and accuracy.
  """
  # the parameters shadow the notebook globals of the same name, so the
  # fallback has to go through globals()
  if X_test is None:
    X_test = globals()["X_test"]
  if y_test is None:
    y_test = globals()["y_test"]

  y_pred = np.asarray(model.predict(X_test))
  # models that output one score per class (e.g. a Keras network) are reduced
  # to hard class labels; 1-D label predictions pass through unchanged
  if y_pred.ndim == 2 and y_pred.shape[1] > 1:
    y_pred = y_pred.argmax(axis=-1)

  f1 = f1_score(y_test, y_pred, average="macro")
  acc = accuracy_score(y_test, y_pred)

  return f1, acc

  

In [262]:
from sklearn.ensemble import RandomForestClassifier

# random forest baseline; the seed is fixed so the reported scores are reproducible
model = RandomForestClassifier(n_estimators=200, random_state=0)

model.fit(X_train, y_train)
# pass the ground-truth labels explicitly: predict_and_evaluate takes them as
# its second argument
f1, acc = predict_and_evaluate(model, y_test)

print("Test f1 score : %s "% f1)
print("Test accuracy score : %s "% acc)

Test f1 score : 0.7105102165667443 
Test accuracy score : 0.7295012679628065 


In [263]:
from sklearn.ensemble import GradientBoostingClassifier

# gradient boosting baseline built from depth-1 trees (decision stumps)
model = GradientBoostingClassifier(n_estimators=150, learning_rate=0.3, max_depth=1, random_state=0)

model.fit(X_train, y_train)
# pass the ground-truth labels explicitly: predict_and_evaluate takes them as
# its second argument
f1, acc = predict_and_evaluate(model, y_test)

print("Test f1 score : %s "% f1)
print("Test accuracy score : %s "% acc)

Test f1 score : 0.7124983849433654 
Test accuracy score : 0.7328825021132713 


In [265]:
from sklearn import svm
# NOTE(review): the two gaussian_process imports below are not used in this
# cell; they are kept in case later cells rely on the names they bring in.
from sklearn.gaussian_process.kernels import *
import sklearn.gaussian_process as gp

# support vector classifier with an RBF kernel
model = svm.SVC(kernel='rbf', C=30)

model.fit(X_train, y_train)
# pass the ground-truth labels explicitly: predict_and_evaluate takes them as
# its second argument
f1, acc = predict_and_evaluate(model, y_test)

print("Test f1 score : %s "% f1)
print("Test accuracy score : %s "% acc)

Test f1 score : 0.3510696653867252 
Test accuracy score : 0.5409974640743872 


In [266]:
from xgboost import XGBClassifier

# XGBoost baseline with the library's default hyper-parameters
model = XGBClassifier()

model.fit(X_train, y_train)
# pass the ground-truth labels explicitly: predict_and_evaluate takes them as
# its second argument
f1, acc = predict_and_evaluate(model, y_test)

print("Test f1 score : %s "% f1)
print("Test accuracy score : %s "% acc)

Test f1 score : 0.7200703295087578 
Test accuracy score : 0.738799661876585 


In [289]:
from tensorflow.keras import utils

n_classes = 2
# one-hot encode the labels to match the two-unit output layer
y_train_nn = utils.to_categorical(y_train, n_classes)

# create the neural network: a stack of fully connected ReLU layers with
# dropout, sized from the feature matrix instead of a hard-coded input width
model = Sequential()
model.add(Dense(50, activation='relu', input_dim=X_train.shape[1]))
for units in (100, 150, 200, 200, 200, 200, 200, 200):
    model.add(Dense(units, activation='relu'))
    model.add(Dropout(0.2))
model.add(Dense(200, activation='relu'))
model.add(Dense(2, activation='sigmoid'))

model.compile(loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


# val_loss improves when it DEcreases, so both callbacks must use mode="min";
# with mode="max" they would react when the loss stops increasing
callbacks = [ReduceLROnPlateau(monitor="val_loss", mode="min", patience=3, verbose=2),
             EarlyStopping(monitor="val_loss", mode="min", patience=10, verbose=1)]
model.fit(X_train, y_train_nn, epochs=100, callbacks=callbacks, validation_split=0.2, batch_size=64)

# the network outputs one score per class; take the arg-max as the predicted label
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)

f1 = f1_score(y_test, y_pred, average="macro")
acc = accuracy_score(y_test, y_pred)

print("Test f1 score : %s "% f1)
print("Test accuracy score : %s "% acc)

Model: "sequential_28"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_106 (Dense)           (None, 50)                300       
                                                                 
 dense_107 (Dense)           (None, 100)               5100      
                                                                 
 dropout_29 (Dropout)        (None, 100)               0         
                                                                 
 dense_108 (Dense)           (None, 150)               15150     
                                                                 
 dropout_30 (Dropout)        (None, 150)               0         
                                                                 
 dense_109 (Dense)           (None, 200)               30200     
                                                                 
 dropout_31 (Dropout)        (None, 200)             