Install some necessary libraries

In [None]:
!pip install datasets
!pip install transformers
!pip install sentencepiece

In [None]:
import os
import numpy as np
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
import pandas as pd
from datasets import Dataset
from transformers import *
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import gc

# 💾 Here we load the preprocessed meta dataset, with features created and explained in Florian's notebook [insert link].

In [None]:
# Load the preprocessed meta features; the first CSV column is used as the row index.
X_meta = pd.read_csv("/content/drive/MyDrive/datasocio/twibot_ukraine_meta.csv", index_col = 0)

## Let's look inside: 👀

In [None]:
X_meta.head()

Unnamed: 0,ID,profile_followers_count,profile_friends_count,is_mt,is_reply,text_sat,has_ellipsis,unk_chars_ratio,cashtags_count,hashtags_count,links_count,mentions_count,emojis_count
0,17461978,15349596,692,0,0,0.583333,True,1.0,0,0,0,0,2
1,17461978,15349596,692,0,0,0.658333,False,1.0,0,0,2,1,0
2,17461978,15349596,692,0,0,1.266667,False,1.0,0,1,2,1,0
3,17461978,15349596,692,0,0,0.695833,False,1.0,0,2,2,1,0
4,17461978,15349596,692,0,0,0.075,False,1.0,0,1,0,0,0


## 🚧 We've found that the models can get quite biased and overfit on the followers and following data. 

We've found several solutions for this.
1. we added random noise and randomly added some followers here and there
2. we just drop it, since the predictions on pure text data and data with followers were very similar there might be little useful information here but a lot of risk of overfitting.


In [None]:
# Drop the follower / friend counts, which the model tended to overfit on.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: running it a second time no longer raises a KeyError.
X_meta = X_meta.drop(columns=['profile_followers_count', 'profile_friends_count'], errors='ignore')

In [None]:
# Remove the "ID" column from the feature frame and keep its values in `ids`.
ids = X_meta.pop("ID")  # not wanted as a feature: it's a unique identifier => overfitting

## ⚙️ We set our configuration data here
Most of these are self explanatory, start_lr and min_lr are the learning rates to be used in the scheduler. 


In [None]:
batch_size = 64   # mini-batch size of the labelled training loader
start_lr = 5e-5   # initial learning rate handed to the optimizer
min_lr = 1e-8     # lower bound used by the ReduceLROnPlateau schedule
epochs = 10       # number of epochs for the final supervised fit
seed = 42         # seed for the train/test split and the global RNGs below

# Actually apply the seed: without this the dataset shuffling, dropout and the
# np.random.choice call in the pseudo-labelling loop differ on every run.
np.random.seed(seed)
tf.random.set_seed(seed)

## 🤖 Here we specify our transformer to be used, distilbert is a smaller version of Bert but keeps most of the performance [insert link]

In [None]:
# Hugging Face checkpoint name used for both the tokenizer and the backbone model.
model_ckpt = "distilbert-base-uncased"
# NOTE(review): the token ids are loaded pre-computed from .npy files further
# down, so this tokenizer does not appear to be used again in this notebook.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_ckpt)

In [None]:
# Load the checkpoint's configuration, then the pretrained TensorFlow DistilBERT
# model that build_model() uses as the text encoder.
config = AutoConfig.from_pretrained(model_ckpt) 
backbone = TFDistilBertModel.from_pretrained(model_ckpt, config = config)

In [None]:
backbone.summary()

## 📜 These are the preprocessed and tokenized texts and targets that we load in.
The function converts these into a tf.data dataset to be consumed by our model.


In [None]:
# Pre-tokenized inputs, attention masks and targets stored as .npy files.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code — only load trusted files. The analogous arrays in the
# pseudo-labelling loop are loaded without it, so it may be unnecessary; confirm.
input_ids = np.load('/content/drive/MyDrive/datasocio/input_ids_twi_uk.npy', allow_pickle = True)
attention_masks= np.load('/content/drive/MyDrive/datasocio/attention_masks_twi_uk.npy', allow_pickle = True)
target = np.load('/content/drive/MyDrive/datasocio/target_twi_uk.npy',allow_pickle = True)

In [None]:
AUTOTUNE = tf.data.AUTOTUNE


# Note that some tokenizers also return 'token_type_ids'. Modify this function accordingly.
@tf.function
def parse_data(inputs, target):
    """Re-pack one dataset element into a ``(features, target)`` pair.

    Args:
        inputs: mapping with the keys 'input_ids', 'attention' and 'meta'.
        target: label tensor; it is cast to int32.

    Returns:
        A tuple ``(features, target)`` where ``features`` is a dict with the
        keys 'input_ids', 'attention_mask' and 'meta'.
    """
    # Bug fix: the original stored this in `inputs_ids` but returned
    # `input_ids`, i.e. the module-level array of *all* token ids instead of
    # the token ids of the element being parsed.
    token_ids = inputs['input_ids']
    attention_mask = inputs['attention']
    meta = inputs['meta']
    target = tf.cast(target, tf.int32)

    # NOTE(review): the mask is emitted under 'attention_mask' while
    # build_model() names that input 'attention' — reconcile the two before
    # mapping this function over a loader that feeds the model.
    features = {'input_ids': token_ids,
                'attention_mask': attention_mask,
                'meta': meta}
    return features, target

## 🔥 Here we define our model. It is a very simple head on top of the transformer.
We feed the text data (tokens) into our backbone model and separately process the meta data to be concatenated.

In [None]:
def build_model():
    """Assemble the text + meta-feature classifier around the DistilBERT backbone.

    Reads two module-level names: ``backbone`` (the pretrained encoder) and
    ``X_meta`` (only its column count, to size the meta input).

    Returns:
        An uncompiled ``tf.keras`` model with the inputs 'input_ids',
        'attention' and 'meta' and a 2-way softmax output named 'head'.
    """
    input_ids = tf.keras.Input(shape=(128,), dtype='int32', name='input_ids')
    attention_masks = tf.keras.Input(shape=(128,), dtype='int32', name='attention')
    # Fix: the meta features are float32 and standardised before being fed to
    # the model, so the input must be float32. Declaring it int32 (as before)
    # would have the fractional values truncated or rejected.
    meta = tf.keras.Input(shape=(X_meta.shape[1],), dtype='float32', name='meta')

    # First element of the backbone output: the per-token hidden states.
    output = backbone(input_ids, attention_mask=attention_masks)[0]

    # Text branch: BiLSTM over the token states, max-pooled over the sequence.
    output = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(20, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
    )(output)
    output = tf.keras.layers.GlobalMaxPool1D()(output)
    output = tf.keras.layers.Dense(16, activation='relu')(output)
    output = tf.keras.layers.Dropout(0.2)(output)

    # Meta branch: one dense layer, then concatenated with the text branch.
    meta_output = tf.keras.layers.Dense(16, activation='relu')(meta)
    concat = tf.keras.layers.concatenate([meta_output, output], name="concat")

    output = tf.keras.layers.Dense(2, activation='softmax', name="head")(concat)
    model = tf.keras.models.Model(inputs=[input_ids, attention_masks, meta], outputs=output)

    # Freeze the first three layers of the assembled model; each one is printed
    # so the selection can be checked.
    # NOTE(review): presumably these are the two token inputs plus the
    # backbone — this relies on Keras' layer ordering, confirm from the output.
    for layer in model.layers[:3]:
        print(layer)
        layer.trainable = False

    return model

In [None]:
model = build_model()

## Below we do a standard split for the validation data and we normalize the meta data.

In [None]:
# Despite its name, `followers` holds every column still present in X_meta,
# converted to a float32 matrix.
followers = X_meta.to_numpy().astype('float32')

# A single call keeps token ids, targets, masks and meta features aligned:
# all four arrays are split with the same shuffled 70/30 partition.
(X_train, X_test,
 y_train, y_test,
 train_mask, test_mask,
 meta_train, meta_test) = train_test_split(
    input_ids, target, attention_masks, followers,
    test_size=0.3,
    random_state=seed,
)

# Estimate the scaling statistics on the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(meta_train)
meta_train = scaler.transform(meta_train)
meta_test = scaler.transform(meta_test)

# Release the full-size arrays that are no longer needed to save memory.
del input_ids
del attention_masks
del X_meta
gc.collect()

13320

## 🥄 This creates dataloaders for our train and validation data to be more efficiently fed into the model.

In [None]:
# Training pipeline: slice the arrays per sample, cache them, shuffle with a
# 4096-element buffer, batch and prefetch.
trainloader = (
    tf.data.Dataset
    .from_tensor_slices((dict(input_ids=X_train, attention=train_mask, meta=meta_train), y_train))
    .cache()
    .shuffle(4096)
    .batch(batch_size)
    .prefetch(AUTOTUNE)
)

# Validation pipeline: no shuffling or caching, larger fixed batches of 512.
testloader = (
    tf.data.Dataset
    .from_tensor_slices((dict(input_ids=X_test, attention=test_mask, meta=meta_test), y_test))
    .batch(512)
    .prefetch(AUTOTUNE)
)

## Below we set up the simplest (but often effective) schedule Reduce Learning Rate on Plateau. 
When the model stagnates for "patience" epochs, the learning rate is multiplied by "factor", but it is never reduced below min_lr.
Additionally we set up early stopping, our free lunch and checkpointing for safety.

This was used in the pretraining, for pseudo labelling we do it manually.

In [None]:
# Halve the learning rate after 2 epochs without val_loss improvement, never below min_lr.
rlr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=min_lr)
# Stop after 3 epochs without val_loss improvement. This previously watched the
# training 'loss', which is inconsistent with the other two callbacks and
# does not guard against overfitting.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
# `lr` is a deprecated alias (it triggers a warning); `learning_rate` is the
# supported keyword.
opt = tf.keras.optimizers.Adam(learning_rate=start_lr)
# Keep only the weights of the epoch with the lowest val_loss.
checkpoint_filepath = '/content/drive/MyDrive/datasocio/no_followers_checkpoint_ukraine/'
ckpt = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    save_freq='epoch',
    monitor='val_loss',
    mode='min',
    save_best_only=True)

  super(Adam, self).__init__(name, **kwargs)


# We compile and maybe load in weights from previous checkpoints (if available)

In [None]:
# Compile with categorical cross-entropy and track categorical accuracy and AUC.
# NOTE(review): this loss expects one-hot targets — presumably `target` is
# stored one-hot with two columns; confirm against the saved array.
model.compile(loss='categorical_crossentropy',optimizer= opt,metrics=['categorical_accuracy', tf.keras.metrics.AUC()])

In [None]:
# Resume from the weights of an earlier run when they are available. The same
# file is written by the last cell of this notebook, so on a first run it does
# not exist yet; in that case report it instead of crashing.
pseudo_weights_path = '/content/drive/MyDrive/datasocio/no_followerspseudo.h5'
if os.path.exists(pseudo_weights_path):
    model.load_weights(pseudo_weights_path)
else:
    print(f"No weights found at {pseudo_weights_path}; continuing without them.")

Below we set an array of dates; each date corresponds to a subset of the Ukraine data where the month is March and the day equals the date. Note that 2627 actually covers the 26th–28th.

In the pseudo labelling below, we will randomly select one of these subsets from the ukraine tweets and run a prediction, then a backwards pass on those predictions.

You can think of it as bootstrapping; more formally, this kind of pseudo-labelling (self-training) is a form of semi-supervised learning.

This is one of the crudest ways to do it, but for our data absolutely enough.
Most commonly one does this every step (batch), and slowly increases the effect of the loss from the pseudo-predicted dataset.

We could do this here too, but the subsets are small enough that our reasoning was this suffices.

The model was also already pre-trained a few epochs on only the twibot data.

In [None]:
# Suffixes inserted into the file names of the unlabelled subsets
# (".../ukraine_meta_mar{date}.pq", ".../input_ids_mar{date}.npy", ...).
dates = ["_420_", "_may_"]

In [None]:
# Pseudo-labelling: alternate between fitting on the model's own confident
# predictions for an unlabelled subset and fitting on labelled training data.
#
# `trainloader` is already batched, so Dataset.take() counts batches, not
# samples. Convert the intended 4096 labelled samples into a batch count
# (the original take(4096) asked for 4096 whole batches).
labelled_batches = max(1, 4096 // batch_size)

for epoch in range(10):
  print(f"Epoch: {epoch}")
  for _ in range(10):
    # take roughly 4096 samples from the labelled training data
    subset = trainloader.take(labelled_batches)
    # pick a subset of unlabelled data (one of the file suffixes in `dates`)
    date = np.random.choice(dates)
    # prepare the unlabelled subset for prediction
    X_meta = pd.read_parquet(f"/content/drive/MyDrive/datasocio/data/ukraine_meta_mar{date}.pq")
    X_meta = X_meta.drop(['followers','following'], axis=1)
    input_ids = np.load(f"/content/drive/MyDrive/datasocio/data/input_ids_mar{date}.npy")
    attention_masks = np.load(f"/content/drive/MyDrive/datasocio/data/attention_masks_mar{date}.npy")
    meta = X_meta.to_numpy().astype(np.float32)
    # reuse the scaler that was fitted on the labelled training split
    meta = scaler.transform(meta)
    # predict class probabilities for the unlabelled subset
    pseudo = model.predict([input_ids, attention_masks, meta], batch_size = batch_size)
    # keep only the predictions the model is rather sure about => more likely to be correct
    mask = (pseudo[:, 0] > 0.9) | (pseudo[:, 1] > 0.9)
    # fit on the filtered pseudo labels; skip the step when no prediction is
    # confident enough, because fitting on an empty array raises an error
    if mask.any():
      model.fit([input_ids[mask], attention_masks[mask], meta[mask]], y = np.around(pseudo[mask]), batch_size = 32)
    # fit on our labelled training data
    model.fit(subset)
  # evaluate on the labelled hold-out data once per epoch
  model.evaluate(testloader)

In [None]:
#leakage in metadata, to be fixed
# `trainloader` and `testloader` are already-batched tf.data datasets, so
# `batch_size` is not passed: Keras requires it to be left unset for dataset
# inputs, since the dataset itself produces the batches.
history = model.fit(trainloader,
                    validation_data=testloader,
                    callbacks=[rlr, es, ckpt],
                    epochs=epochs)

In [None]:
# Persist the final weights. This is the same file path that the load_weights
# cell earlier in the notebook reads, so the next run resumes from this state.
model.save_weights('/content/drive/MyDrive/datasocio/no_followerspseudo.h5')