In [None]:
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install emoji
!pip install unidecode
!pip install emojis

## Install the necessary tools 🧰

In [None]:
import os
import random
import re
import gc
import emoji
import emojis
import zipfile
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
from datasets import Dataset
from transformers import *
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from unidecode import unidecode

In [None]:
np.random.seed(42)

## Utility functions to process the tweets 🔧 See Florian's notebook for more [link]

In [None]:
def strip_emoji(text):
    """Return `text` with every emoji removed.

    `emoji.get_emoji_regexp()` no longer exists in emoji >= 2.0, so the
    dedicated `emoji.replace_emoji` helper is used when the installed
    release provides it; older releases fall back to the regex helper.
    """
    if hasattr(emoji, "replace_emoji"):
        return emoji.replace_emoji(text, replace="")
    # Legacy path for emoji releases that predate replace_emoji.
    return re.sub(emoji.get_emoji_regexp(), r"", text)
def remove_links(text):
    """Delete every run of non-whitespace characters that starts with "http"."""
    link_pattern = r'http\S+'
    return re.sub(link_pattern, '', text)

In [None]:
def extract_reply(text: str):
    """Strip a leading run of "@user " mentions from a tweet.

    Returns:
        (remaining_text, 1, [user, ...]) when the tweet starts with one or
        more "@user " tokens, otherwise (text, 0, np.nan).
    """
    leading = re.match(r'(@\w+ )+', text)
    if leading is None:
        return text, 0, np.nan
    prefix = leading.group()
    # Drop the first "@" and the trailing space, then split on the " @"
    # separators that remain between consecutive handles.
    handles = prefix[1:-1].split(" @")
    return text[leading.end():], 1, handles

def extract_retweet(text: str):
    """Strip a leading "RT @user: " header (plus one optional ".") from a tweet.

    Returns:
        (remaining_text, 1, user) when the header is present, otherwise
        (text, 0, np.nan).
    """
    header = re.match(r'RT @(\w+): \.?', text)
    if header is None:
        return text, 0, np.nan
    author = header.group(1)
    body = text[header.end(0):]
    return body, 1, author

def extract_modified_tweet(text: str):
    """Strip a leading "MT : " marker from a tweet.

    Returns:
        (remaining_text, 1) when the marker is present, otherwise (text, 0).
    """
    # NOTE(review): only the literal prefix "MT : " is recognised; a header
    # of the form "MT @user: " does not match — confirm this is intended.
    marker = re.match(r'MT : ', text)
    if marker is None:
        return text, 0
    return text[marker.end(0):], 1

def extract_symbols(text: str, symbol: str):
    """Collect the words prefixed by `symbol` and remove the prefix from the text.

    Args:
        text: The tweet text to scan.
        symbol: Literal prefix to look for, e.g. "#", "@" or "$".

    Returns:
        (text_without_prefixes, words): the words themselves stay in the
        text; only the prefix in front of each of them is removed.
    """
    # The prefix must be escaped: "$" is a regex anchor, so interpolating it
    # raw produces a pattern that can never match a cashtag.
    prefix = re.escape(symbol)
    found = [match[len(symbol):] for match in re.findall(fr"{prefix}\w+\b", text)]
    text = re.sub(fr"{prefix}(?=\w+\b)", "", text)
    return text, found

def extract_links(text: str):
    """Remove http(s) links from the text and return them.

    Returns:
        (text_without_links, links) where `links` lists the matched URLs in
        order of appearance.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    link_pattern = r"https?://.+?(?=(?:\s|$))"
    links = re.findall(link_pattern, text)
    # Remove exactly the matched spans. Replacing each link string in turn
    # would leave residue when one link is a prefix of another.
    text = re.sub(link_pattern, "", text)
    return text, links

def extract_ellipsis(text: str):
    """Strip a trailing "...", "…" or "(n/m)" marker and report whether one was found.

    Returns:
        (text_without_marker, has_ellipsis)
    """
    trailing_marker = r'\s*(?:\.{3}|…|\(\d+\/\d+\))\s*$'
    stripped = re.sub(trailing_marker, "", text)
    # The substitution only ever removes characters, so a length change
    # means a marker was present.
    return stripped, len(stripped) != len(text)

# TODO: Get emojis category? (use emojis, or advertools)
def extract_emojis(text: str):
    """Pull ":alias:" tokens out of the text after running `emojis.decode` on it.

    Returns:
        (text_without_tokens, aliases) where `aliases` holds the text found
        between each pair of colons, in order of appearance.
    """
    # Doesn't get all emojis...
    decoded = emojis.decode(text)
    # NOTE(review): this pattern matches ANY text between two colons, not
    # only emoji aliases, so ordinary "a: b: c" text is captured too.
    aliases = re.findall(r":(.+?):", decoded)
    for alias in aliases:
        decoded = decoded.replace(f":{alias}:", "")
    return decoded, aliases
    
def clean_text(text: str):
    """Transliterate the text with `unidecode` and collapse whitespace runs.

    Returns:
        (cleaned_text, unk_chars_ratio) where the ratio is the length of the
        transliterated text divided by the length of the input (0 for empty
        input). It is computed before whitespace is collapsed.
    """
    transliterated = unidecode(text)
    if text:
        unk_chars_ratio = len(transliterated) / len(text)
    else:
        unk_chars_ratio = 0
    collapsed = re.sub(r'\s+', ' ', transliterated)
    return collapsed, unk_chars_ratio


def extract_artefacts(text: str):
    """Run every extraction helper over one tweet, in a fixed order.

    Each helper receives the text left over by the previous one, so the
    order of the calls below matters.

    Returns:
        A 14-tuple: (text, text_sat, is_reply, replies, is_rt, retweet,
        is_mt, hashtags, mentions, cashtags, links, has_ellipsis,
        emoji_list, unk_chars_ratio).
    """
    remaining, is_reply, replies = extract_reply(text)
    # Measured right after the reply prefix is removed, before anything else.
    text_sat = len(remaining) / 240

    remaining, is_rt, retweet = extract_retweet(remaining)
    remaining, is_mt = extract_modified_tweet(remaining)
    remaining, hashtags = extract_symbols(remaining, "#")
    remaining, mentions = extract_symbols(remaining, "@")
    remaining, cashtags = extract_symbols(remaining, "$")
    remaining, links = extract_links(remaining)

    remaining, has_ellipsis = extract_ellipsis(remaining)
    remaining, emoji_list = extract_emojis(remaining)
    cleaned, unk_chars_ratio = clean_text(remaining)

    artefacts = (
        cleaned, text_sat, is_reply, replies,
        is_rt, retweet, is_mt,
        hashtags, mentions, cashtags,
        links, has_ellipsis, emoji_list,
        unk_chars_ratio,
    )
    return artefacts


def get_artefacts(df: pd.DataFrame):
    """Add the artefact columns and their "<name>_count" columns to `df`.

    The frame is modified in place (its 'tweet' column is overwritten with
    the cleaned text) and also returned.

    Args:
        df: Frame with a 'tweet' column of strings.

    Returns:
        The same frame, with the extracted artefact columns and the five
        count columns added.
    """
    new_cols = [
        'tweet', 'text_sat', 'is_reply', 'replies',
        'is_rt', 'retweet', 'is_mt',
        'hashtags', 'mentions', 'cashtags',
        'links', 'has_ellipsis', 'emojis',
        'unk_chars_ratio'
    ]
    # Reuse df's own index: a fresh RangeIndex would be aligned by label on
    # assignment and scramble or blank the values whenever df does not carry
    # the default 0..n-1 index (for example after explode()).
    artefacts = pd.DataFrame(
        df['tweet'].apply(extract_artefacts).to_list(),
        columns=new_cols,
        index=df.index,
    )
    df[new_cols] = artefacts

    for list_col in ('hashtags', 'mentions', 'cashtags', 'links', 'emojis'):
        df[f'{list_col}_count'] = df[list_col].apply(len)

    return df

In [None]:
# Maximum length per tweet. It can be relatively short because, after
# stripping, none of the tweets are very long.
max_length = 128

## Here we extract, unpack and load the dataset 🎁

In [None]:
zip_path = 'Twibot-20.zip'
twibot_path = 'twibot/'
# exist_ok=True makes the cell safe to re-run, while real failures (for
# example a permission error) still surface instead of being swallowed.
os.makedirs(twibot_path, exist_ok=True)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(twibot_path)

In [None]:
# Read the file explicitly as UTF-8 so the result does not depend on the
# platform's default encoding.
with open("twibot/Twibot-20/train.json", encoding="utf-8") as f:
    train_json = json.load(f)
# Flatten nested objects into columns named "<parent>_<child>".
train_df = pd.json_normalize(train_json, sep='_')

## Each entry is a list of 200 tweets, but we want one tweet per row; the following call accomplishes this. 💥

In [None]:
# One row per tweet. Every produced row keeps the index label of the user
# row it came from, so the index contains repeats after this step.
train_df = train_df.explode('tweet')

## Look inside 👀

In [None]:
# Count distinct values per column: there are fewer distinct profile names
# than IDs, i.e. some accounts share the same profile name.
train_df[['ID', 'profile_id', 'profile_name']].nunique()

ID              8278
profile_id      8278
profile_name    8213
dtype: int64

In [None]:
train_df.head()

Unnamed: 0,ID,tweet,neighbor,domain,label,profile_id,profile_id_str,profile_name,profile_screen_name,profile_location,...,profile_profile_link_color,profile_profile_sidebar_border_color,profile_profile_sidebar_fill_color,profile_profile_text_color,profile_profile_use_background_image,profile_has_extended_profile,profile_default_profile,profile_default_profile_image,neighbor_following,neighbor_follower
0,17461978,RT @CarnivalCruise: 🎉 Are you ready to see wha...,,"[Politics, Business, Entertainment]",0,17461978,17461978,SHAQ,SHAQ,"Orlando, FL",...,2FC2EF,181A1E,252429,666666,True,False,False,False,,
0,17461978,Who has time for receipts? Not me. @epson rece...,,"[Politics, Business, Entertainment]",0,17461978,17461978,SHAQ,SHAQ,"Orlando, FL",...,2FC2EF,181A1E,252429,666666,True,False,False,False,,
0,17461978,Steady wants to encourage you to invest in you...,,"[Politics, Business, Entertainment]",0,17461978,17461978,SHAQ,SHAQ,"Orlando, FL",...,2FC2EF,181A1E,252429,666666,True,False,False,False,,
0,17461978,"Good one, @rishid. But let’s see if y'all can ...",,"[Politics, Business, Entertainment]",0,17461978,17461978,SHAQ,SHAQ,"Orlando, FL",...,2FC2EF,181A1E,252429,666666,True,False,False,False,,
0,17461978,#lsunationalchamps\n,,"[Politics, Business, Entertainment]",0,17461978,17461978,SHAQ,SHAQ,"Orlando, FL",...,2FC2EF,181A1E,252429,666666,True,False,False,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8277,50471224,@jjcoop36 Players/coaches/staff are going to a...,,[Sports],1,50471224,50471224,Tim Carroll,timcarrollart,"Conway, SC",...,9266CC,000000,000000,000000,True,True,False,False,"[4202878276, 637216245, 129878018, 302066953, ...","[262113579, 160208100, 247813054, 41537856, 92..."
8277,50471224,"@MDMays62 Appreciate it, Mark!\n",,[Sports],1,50471224,50471224,Tim Carroll,timcarrollart,"Conway, SC",...,9266CC,000000,000000,000000,True,True,False,False,"[4202878276, 637216245, 129878018, 302066953, ...","[262113579, 160208100, 247813054, 41537856, 92..."
8277,50471224,@JPMXVI Thank you! They vary depending on amou...,,[Sports],1,50471224,50471224,Tim Carroll,timcarrollart,"Conway, SC",...,9266CC,000000,000000,000000,True,True,False,False,"[4202878276, 637216245, 129878018, 302066953, ...","[262113579, 160208100, 247813054, 41537856, 92..."
8277,50471224,@TwinsAlmanac @OldTimeHardball @RedSox @barsto...,,[Sports],1,50471224,50471224,Tim Carroll,timcarrollart,"Conway, SC",...,9266CC,000000,000000,000000,True,True,False,False,"[4202878276, 637216245, 129878018, 302066953, ...","[262113579, 160208100, 247813054, 41537856, 92..."


## Let's define the meta columns we need and make sure our tweets are strings without leading or trailing spaces

In [None]:
meta = ['ID', 'profile_name', 'profile_description', 'profile_followers_count', 'profile_friends_count']
# Drop missing values BEFORE casting to str: astype(str) turns None/NaN into
# the literal strings "None"/"nan", which dropna() can no longer detect.
train_df = train_df[meta + ['tweet', 'label']].dropna()
# assign() returns a new frame, which avoids writing into a filtered slice.
train_df = train_df.assign(tweet=train_df['tweet'].astype(str).str.strip())

## We also load the Ukraine dataset and keep only the tweets that the text-only model classified with high confidence
The dataset is filtered to predictions with near-100% confidence and then preprocessed in the same way as Twibot.
The result is a little over 300 tweets.

In [None]:
# NOTE(review): the path is absolute (filesystem root); adjust it to wherever
# the parquet file actually lives in your environment.
ukraine = pd.read_parquet("/new_dataset.pq")
ukraine.head()

Unnamed: 0,userid,username,acctdesc,location,following,followers,totaltweets,usercreatedts,tweetid,tweetcreatedts,retweetcount,text,hashtags,coordinates,favorite_count,extractedts,bot,no_bot,cryptoflag
0,1506957115495178252,240teabug2,,Україна,3,0,2,2022-03-24 11:32:57.000000,1507280557239009281,2022-03-25 08:57:43,0,.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n#art #s...,"[{'text': 'art', 'indices': [26, 30]}, {'text'...",,0,2022-03-25 09:22:38.117606,0.991931,0.008069,False
1,1508378411349196806,911_essays,"For online classes,research papers,term papers...",,6,0,5,2022-03-28 09:40:25.000000,1508389861107146754,2022-03-28 10:25:42,0,A+ assurance in your essay(s).\nWe deliver the...,"[{'text': 'assignments', 'indices': [65, 77]},...",,1,2022-03-28 10:45:41.615909,0.99661,0.00339,False
2,1495998812053254144,ABigFuckOffChi1,,,4,0,33,2022-02-22 05:48:20.000000,1507601286048460801,2022-03-26 06:12:11,0,@GBNEWS #gbnews @AJENews #ajenews @RT_com isn'...,"[{'text': 'gbnews', 'indices': [8, 15]}, {'tex...",,0,2022-03-26 06:33:51.600393,0.977394,0.022606,False
3,1495998812053254144,ABigFuckOffChi1,,,4,0,40,2022-02-22 05:48:20.000000,1508014242930298881,2022-03-27 09:33:07,0,@GBNEWS #gbnews @AJENews #ajenews @RT_com I ca...,"[{'text': 'gbnews', 'indices': [8, 15]}, {'tex...",,0,2022-03-27 09:45:27.080434,0.988801,0.011199,False
4,1497526750967996416,ABorodii,,,3,0,293,2022-02-26 10:59:47.000000,1508091264604028943,2022-03-27 14:39:11,0,@jreichelt Your country was selling weapons to...,"[{'text': 'cancelrussia', 'indices': [212, 225...",,0,2022-03-27 14:47:33.037389,0.958309,0.041691,False


In [None]:
ukraine = ukraine.drop(['acctdesc', 'location', 'totaltweets', 'usercreatedts', 'coordinates', 'hashtags', 'retweetcount', 'tweetcreatedts'], axis=1)
ukraine = ukraine.dropna()
# Turn the bot probability into a hard 0/1 label.
ukraine['bot'] = ukraine['bot'].apply(round)
ukraine_bots = ukraine[ukraine.bot == 1]
ukraine_non_bots = ukraine[ukraine.bot == 0]
# Draw as many non-bot rows as there are bot rows. The size is capped because
# sample() raises when asked for more rows than exist, and random_state is
# pinned so the draw does not depend on prior use of the global RNG.
n_sample = min(ukraine_bots.shape[0], ukraine_non_bots.shape[0])
ukraine_sample = ukraine_non_bots.sample(n_sample, random_state=42)
ukraine_sample = pd.concat([ukraine_sample.reset_index(drop = True), ukraine_bots], axis =0)
ukraine = ukraine_sample

In [None]:
# Column names for the 14-tuple returned by extract_artefacts, in the same
# order as the tuple.
new_cols = [ 
        'tweet', 'text_sat', 'is_reply', 'replies', 
        'is_rt', 'retweet', 'is_mt', 
        'hashtags', 'mentions', 'cashtags',
        'links', 'has_ellipsis', 'emojis', 
        'unk_chars_ratio'
    ]

In [None]:
# Extract the artefacts for every tweet of both datasets into frames with a
# fresh 0..n-1 index.
meta_df = pd.DataFrame(train_df['tweet'].apply(extract_artefacts).to_list(), columns = new_cols).reset_index(drop = True)
meta_uk = pd.DataFrame(ukraine['text'].apply(extract_artefacts).to_list(), columns = new_cols).reset_index(drop = True)
# Both sides are re-indexed to 0..n-1 so the column-wise concat pairs rows by
# position.
ukraine = pd.concat([ukraine.reset_index(drop=True), meta_uk], axis=1)
# NOTE: train_df already has a 'tweet' column and meta_df adds another one,
# so after this line train_df holds TWO columns named 'tweet' (raw first,
# cleaned second).
train_df = pd.concat([train_df.reset_index(drop=True), meta_df], axis=1)

In [None]:
# Add one "<name>_count" column per list-valued artefact column.
for artefact in ('hashtags', 'mentions', 'cashtags', 'links', 'emojis'):
    train_df[f'{artefact}_count'] = train_df[artefact].apply(len)

In [None]:
# Rename by name rather than by position: the mapping stays correct if the
# upstream column order changes, and re-running the cell is a no-op instead
# of a length-mismatch error. All other columns keep their names.
ukraine = ukraine.rename(columns={
    'userid': 'ID',
    'username': 'profile_name',
    'following': 'profile_friends_count',
    'followers': 'profile_followers_count',
    'bot': 'label',
})

In [None]:
# Add one "<name>_count" column per list-valued artefact column.
for artefact in ('hashtags', 'mentions', 'cashtags', 'links', 'emojis'):
    ukraine[f'{artefact}_count'] = ukraine[artefact].apply(len)

In [None]:
# Columns copied into the metadata table (X_meta) that is saved next to the
# text data. "ID" is included alongside the numeric/boolean features.
meta_cols = ["ID",
             "profile_followers_count",
             "profile_friends_count",
             "is_mt",
             "is_reply",
             "text_sat",
             "has_ellipsis",
             "unk_chars_ratio",
             "cashtags_count",
             "hashtags_count",
             "links_count",
             "mentions_count",
             "emojis_count"]

In [None]:
# train_df carries two columns named 'tweet' at this point (the raw one and
# the cleaned one added with the artefacts), so this selection yields three
# columns: tweet, tweet, label.
X = train_df[['tweet', 'label']]
# Drop the first column (the raw tweet), keeping the cleaned tweet and label.
X = X.iloc[:, 1:]
X_meta = train_df[meta_cols]
# Append the Ukraine rows below the Twibot rows. The Ukraine index is not
# reset here, so the combined index contains repeated labels.
X = pd.concat([X.reset_index(drop = True), ukraine[['tweet', 'label']]], axis=0)
uk_meta = ukraine[meta_cols]
X_meta = pd.concat([X_meta.reset_index(drop = True), uk_meta], axis=0)

In [None]:
X.columns = ['texts', 'labels']
# Labels become plain Python ints; texts lose emoji first and links second.
X['labels'] = X['labels'].apply(int)
X['texts'] = X['texts'].apply(strip_emoji).apply(remove_links)

## We save everything so that we can access it again later

In [None]:
# Written to the current working directory, without the index column.
ukraine.to_csv("ukraine_data.csv", index = False)

In [None]:
# NOTE(review): both paths are absolute (filesystem root); adjust them to a
# writable location in your environment.
X.to_parquet('/twibot_ukraine.pq')
X_meta.to_csv("/twibot_ukraine_meta.csv")

## Next we create the tokens and attention masks that the transformer needs; we also save a one-hot encoded version of the labels

In [None]:
# Checkpoint whose tokenizer is loaded; DistilBertTokenizerFast is brought
# into scope by the wildcard `from transformers import *` import.
model_ckpt = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_ckpt)

In [None]:
input_ids = []
attention_masks = []
tweets = X["texts"].to_numpy()
for tweet in tqdm(tweets):
    # padding='max_length' replaces the deprecated pad_to_max_length=True.
    # truncation=True is explicit so over-long tweets are cut to max_length
    # and every row has the same width when stacked into an array below.
    bert_inp = tokenizer(
        tweet,
        add_special_tokens=False,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    input_ids.append(bert_inp['input_ids'])
    attention_masks.append(bert_inp['attention_mask'])

input_ids = np.asarray(input_ids)
attention_masks = np.array(attention_masks)
# One-hot labels. The dtype is pinned because the default dtype of
# get_dummies differs between pandas versions (uint8 before 2.0, bool after).
target = np.array(pd.get_dummies(X['labels'], dtype=np.uint8))

In [None]:
# Persist the three arrays as .npy files.
# NOTE(review): the target directory is an absolute path; make sure it exists
# and is writable in your environment.
np.save('/datasocio/input_ids_twi_uk.npy', input_ids, allow_pickle = True)
np.save('/datasocio/attention_masks_twi_uk.npy', attention_masks, allow_pickle = True)
np.save('/datasocio/target_twi_uk.npy', target, allow_pickle = True)