# Preprocessing Twibot-20 Dataset
This notebook follows the pre-processing steps described in the paper "Deep Neural Networks for Bot Detection" (Kudugunta & Ferrara, 2018).

## Import libraries

In [10]:
import os

# Torch 
import torch
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms

# Reading Data
import json
import pandas as pd

# NLP
from nltk.tokenize import word_tokenize
import numpy as np
import nltk
from gensim.models.keyedvectors import KeyedVectors
import emoji

## Download Tokenizer

In [2]:
# Fetch the NLTK data packages 'punkt' and 'punkt_tab' (tokenizer data).
# NOTE(review): presumably these back the word_tokenize call used for tweets;
# newer NLTK releases appear to look for 'punkt_tab' rather than 'punkt' —
# confirm which one the installed version actually needs.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\djsal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\djsal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

## Pre-processing each split (test, train, validate) into a metadata tensor, a tweet GloVe-embedding tensor, and a label tensor

In [3]:
def load_glove_embeddings(glove_file_path):
    """Read a GloVe text file into a ``{token: vector}`` dictionary.

    Every non-blank line is split on whitespace: the first field is the token
    and the remaining fields are parsed as its float32 vector.

    Parameters
    ----------
    glove_file_path : str
        Path of the GloVe text file; it is read as UTF-8.

    Returns
    -------
    dict
        Maps each token to a 1-D ``numpy`` array of dtype float32.
    """
    embeddings = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank (or whitespace-only) line has no token; indexing
            # values[0] on it would raise IndexError, so skip it.
            if not values:
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

def is_float(s):
    """Tell whether ``s`` is the text of a finite floating-point number.

    ``float()`` also parses the words "nan", "inf" and "infinity" (any case,
    optionally signed). Those are ordinary words in free text rather than
    numbers, so they are reported as ``False`` here.

    Parameters
    ----------
    s : str
        Candidate token.

    Returns
    -------
    bool
        ``True`` if ``float(s)`` succeeds and the result is finite,
        ``False`` otherwise (including when ``s`` is not a string or number).
    """
    try:
        value = float(s)
    except (TypeError, ValueError):
        return False
    # NaN is the only float that differs from itself; infinities have an
    # absolute value equal to +inf.
    return value == value and abs(value) != float("inf")
    
def _embedding_dim(glove_embeddings, default=200):
    """Return the vector length stored in ``glove_embeddings`` (``default`` if unknown)."""
    try:
        sample = next(iter(glove_embeddings.values()))
    except (AttributeError, StopIteration):
        # Not a dict-like object with .values(), or an empty mapping.
        return default
    return len(sample)


def embed_text(text, glove_embeddings, embedding_dim=None):
    """Embed a piece of text as the mean of its tokens' GloVe vectors.

    The text is tokenized with ``word_tokenize`` and each token is normalized
    before lookup:

    * ``#`` -> ``<hashtag>`` and ``@`` -> ``<user>``
    * ``https`` / ``HTTPS`` and tokens starting with ``//t`` -> ``<url>``
    * digit strings and float literals -> ``<number>``
    * a single emoji -> ``<name>``, built from ``emoji.demojize`` with the
      surrounding colons replaced by angle brackets
    * an all-uppercase token is looked up lowercased and is followed by an
      extra ``<allcaps>`` vector; every other token is looked up lowercased

    Tokens missing from ``glove_embeddings`` contribute a zero vector, so they
    still count towards the mean.

    Parameters
    ----------
    text : str
        Text to embed.
    glove_embeddings : dict
        Mapping from token to 1-D vector (as built by ``load_glove_embeddings``).
    embedding_dim : int, optional
        Length of the zero vector used for unknown tokens and for empty text.
        When omitted it is taken from ``glove_embeddings`` and falls back to
        200 if the mapping is empty.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``embedding_dim``; all zeros when the text yields
        no tokens.
    """
    if embedding_dim is None:
        # A hard-coded size would make np.mean fail on ragged input as soon
        # as the loaded vectors are not 200-dimensional.
        embedding_dim = _embedding_dim(glove_embeddings)

    def lookup(token):
        # Known token -> its vector; unknown token -> zero vector.
        if token in glove_embeddings:
            return glove_embeddings[token]
        return np.zeros(embedding_dim)

    vectors = []

    for word in word_tokenize(text):

        # Applying Kudugunta's Rules
        if word == '#':
            word = "<hashtag>"
        elif word == '@':
            word = "<user>"
        elif word == "https" or word == "HTTPS":
            word = "<url>"
        elif word[0:3] == "//t":
            word = "<url>"
        elif word.isdigit() or is_float(word):
            word = "<number>"

        # Replacing emojis: ":name:" becomes "<name>"
        if emoji.is_emoji(word):
            word = emoji.demojize(word)
            word = "<" + word[1:-1] + ">"

        # The case check must run before lowercasing the token.
        shouting = word.isupper()
        vectors.append(lookup(word.lower()))
        if shouting:
            # All-caps words get an additional marker vector.
            vectors.append(lookup("<allcaps>"))

    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros(embedding_dim)
        
def encode_verified(row):
    """Encode a textual "verified" flag as an integer.

    Only a value that reads as true (ignoring surrounding whitespace and
    letter case) is encoded as 1. Everything else is encoded as 0, including
    "False" and the "nan" / "None" strings that a missing value turns into
    after ``astype(str)``. Treating every non-"False" value as verified would
    mark accounts with a missing flag as verified.

    Parameters
    ----------
    row : str
        Flag value, e.g. ``"True "`` or ``"False"``.

    Returns
    -------
    int
        1 for a true flag, 0 otherwise.
    """
    flag = str(row).strip().lower()
    return 1 if flag == "true" else 0

In [4]:
def process_data(data_file_path, glove_file_path='../Data/glove.6B.200d.txt', glove_embeddings=None):
    """Convert one dataset split (a JSON file) into row-aligned per-tweet tensors.

    The JSON is flattened with ``pd.json_normalize`` and the ``tweet`` list of
    every user is exploded, so each output row is one tweet carrying that
    user's profile counts, verified flag and label.

    Parameters
    ----------
    data_file_path : str
        Path of the JSON file for the split.
    glove_file_path : str, optional
        GloVe text file to load when ``glove_embeddings`` is not supplied.
    glove_embeddings : dict, optional
        Already-loaded ``{token: vector}`` mapping. Pass it to avoid reading
        the GloVe file again for every split.

    Returns
    -------
    tuple of torch.Tensor
        ``(tweet_glove_embeddings, metadata_tensor, label_tensor)``:
        one embedding row per tweet, the five metadata columns (followers,
        friends, favourites, listed counts and the 0/1 verified flag, in that
        order), and the label of each row.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding (same choice as the GloVe loader).
    with open(data_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # json_normalize already returns a DataFrame with dotted column names.
    df = pd.json_normalize(data)

    # Getting relevant columns
    numerical_cols = ['profile.followers_count', 'profile.friends_count',
                      'profile.favourites_count', 'profile.listed_count']
    categorical_cols = ['profile.verified']
    text_cols = ['tweet']
    labels = ['label']

    relevant_cols = numerical_cols + categorical_cols + text_cols + labels

    # One row per tweet; copy so the column assignments below act on an
    # independent frame.
    df_relevant = df[relevant_cols].explode('tweet').copy()

    # The 'domain' column is deliberately not one-hot encoded here:
    # not using domain since it's not in elections.

    # Numerical data types
    for col in numerical_cols:
        df_relevant[col] = df_relevant[col].astype(float)

    # Verified flag: stringify, then encode as 0/1
    df_relevant['profile.verified'] = (
        df_relevant['profile.verified'].astype(str).apply(encode_verified)
    )

    # Tweets: a missing tweet (None/NaN after explode) would otherwise be
    # stringified to "None"/"nan" and embedded as if it were a word. Empty
    # text keeps the row but yields the all-zero embedding.
    df_relevant['tweet'] = df_relevant['tweet'].fillna('').astype(str)

    # Labels
    df_relevant['label'] = df_relevant['label'].astype(int)

    # Loading GloVe embeddings (skipped when the caller supplied them).
    # NOTE(review): embed_text looks up tokens such as <hashtag>, <user>, <url>,
    # <number> and <allcaps>; these appear to belong to the GloVe *Twitter*
    # vocabulary rather than glove.6B — confirm the intended embeddings file.
    if glove_embeddings is None:
        glove_embeddings = load_glove_embeddings(glove_file_path)

    # Tokenize and embed every tweet, then stack into an (n_rows, dim) array
    tweet_vectors = df_relevant['tweet'].apply(lambda x: embed_text(x, glove_embeddings))
    embedding_list = np.vstack(tweet_vectors.values)

    # EMBEDDINGS TENSOR
    tweet_glove_embeddings = torch.tensor(embedding_list)

    # METADATA TENSOR (everything except the text and the label)
    df_num_cat = df_relevant.drop(columns=['tweet', 'label'])
    metadata_tensor = torch.tensor(df_num_cat.values)

    # LABEL TENSOR
    label_tensor = torch.tensor(df_relevant['label'].values)

    return tweet_glove_embeddings, metadata_tensor, label_tensor

## Getting all tensors (test, train, validate)

In [5]:
# Test split: build the tweet-embedding, metadata and label tensors.
test_tweet_emb, test_metadata_tensor, test_label = process_data(
    '../Data/test.json'
)
# Show the three shapes on one line as a quick row-alignment check.
print(*(split_tensor.shape for split_tensor in (test_tweet_emb, test_metadata_tensor, test_label)))
print("Finished Processing Test Data")

torch.Size([199863, 200]) torch.Size([199863, 5]) torch.Size([199863])
Finished Processing Test Data


In [6]:
# Train split: build the tweet-embedding, metadata and label tensors.
train_tweet_emb, train_metadata_tensor, train_label = process_data(
    '../Data/train.json'
)
# Show the three shapes on one line as a quick row-alignment check.
print(*(split_tensor.shape for split_tensor in (train_tweet_emb, train_metadata_tensor, train_label)))
print("Finished Processing Train Data")

torch.Size([1398465, 200]) torch.Size([1398465, 5]) torch.Size([1398465])
Finished Processing Train Data


In [7]:
# Validation split (dev.json): build the tweet-embedding, metadata and label tensors.
validate_tweet_emb, validate_metadata_tensor, validate_label = process_data(
    '../Data/dev.json'
)
# Show the three shapes on one line as a quick row-alignment check.
print(*(split_tensor.shape for split_tensor in (validate_tweet_emb, validate_metadata_tensor, validate_label)))
print("Finished Processing Validate Data")

torch.Size([401540, 200]) torch.Size([401540, 5]) torch.Size([401540])
Finished Processing Validate Data


In [28]:
# Folder that will receive the serialized tensors.
newpath = r"../Data/Processed_Data"

# Report whether the folder is already there; create it (with parents) if not.
if os.path.exists(newpath):
    print(f"Directory already exists: {newpath}")
else:
    os.makedirs(newpath)
    print(f"Directory created: {newpath}")

Directory already exists: ../Data/Processed_Data


In [27]:
# Write every split's tensors to its own .pth file (same order as listed).
for _split_tensor, _pth_path in (
    # Test
    (test_tweet_emb, '../Data/Processed_Data/test_tweet_emb_tensor.pth'),
    (test_metadata_tensor, '../Data/Processed_Data/test_metadata_tensor.pth'),
    (test_label, '../Data/Processed_Data/test_label_tensor.pth'),
    # Train
    (train_tweet_emb, '../Data/Processed_Data/train_tweet_emb_tensor.pth'),
    (train_metadata_tensor, '../Data/Processed_Data/train_metadata_tensor.pth'),
    (train_label, '../Data/Processed_Data/train_label_tensor.pth'),
    # Validate
    (validate_tweet_emb, '../Data/Processed_Data/validate_tweet_emb_tensor.pth'),
    (validate_metadata_tensor, '../Data/Processed_Data/validate_metadata_tensor.pth'),
    (validate_label, '../Data/Processed_Data/validate_label_tensor.pth'),
):
    torch.save(_split_tensor, _pth_path)

# Drop the loop helpers so the notebook namespace is left as before.
del _split_tensor, _pth_path