## Preprocess Tweets
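This notebook prepares tweet data for the sentiment classification task: it tokenises a subset of the sentiment160 tweets with the LaBSE tokenizer, and defines the cleaning and feature-extraction pipeline for the SemEval-2016 Task A dataset.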

In [18]:
# Imports
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd
import transformers
from transformers import AutoTokenizer, AutoModel

In [19]:
# Load the raw sentiment160 CSV. Column names are supplied explicitly via
# `names`, so the first line of the file is read as data rather than a header.
DATASET_ENCODING = "ISO-8859-1"
DATASET_COLUMNS = ["label", "ids", "date", "flag", "user", "tweet"]
sentiment160_data = pd.read_csv(
    "/home/ec2-user/SageMaker/data/sentiment160.csv",
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)

In [22]:
# Peek at rows 200-299 of the tweet column (explicit positional slice).
sentiment160_data['tweet'].iloc[200:300]

200    Job Interview in Cardiff today, wish me luck! ...
201    @stustone Your show is whack. Way worse than w...
202    @DjAlizay I really don't think people choose t...
203    @eloquentembrace You're going to kill me, but ...
204    @cheechbud i think ur right!! hahaha!! 4.5 hrs...
                             ...                        
295    @aaronrva is in the bathroom and i have to pee...
296           @ashleyskyy but I wanted a margarita too! 
297    @KourtneyKardash yup night workouts r the wors...
298    Just called Hillsong again - they said they co...
299    has 0g of milky bar left, and around 200ml of ...
Name: tweet, Length: 100, dtype: object

In [15]:
# The first 100,000 tweets form the training split; the next 20,000 form the dev split.
# Only the raw tweet text is kept, as plain Python lists.
train_data = sentiment160_data['tweet'].iloc[:100000].tolist()
dev_data = sentiment160_data['tweet'].iloc[100000:120000].tolist()

In [16]:
# Tokenise both splits with the LaBSE tokenizer.
from transformers import AutoTokenizer

MAX_SEQ = 32  # sequences longer than this many tokens are truncated
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")

# Identical tokenizer settings are applied to the train split first, then the dev split.
encoded_sentiment160_train, encoded_sentiment160_dev = (
    tokenizer(
        split_texts,
        padding=True,
        truncation=True,
        max_length=MAX_SEQ,
        return_tensors='pt',
    )
    for split_texts in (train_data, dev_data)
)

In [17]:
# Persist the tokenised splits to disk as pickles.
import pickle

train_file = '/home/ec2-user/SageMaker/data/train_sentiment160.pkl'
dev_file = '/home/ec2-user/SageMaker/data/dev_sentiment160.pkl'

# Write train first, then dev; each file is closed before the next is opened.
for out_path, encoded_split in (
    (train_file, encoded_sentiment160_train),
    (dev_file, encoded_sentiment160_dev),
):
    with open(out_path, 'wb') as handle:
        pickle.dump(encoded_split, handle)

In [None]:
# Load the SemEval files: tab-separated, no header row (columns are numbered 0, 1, 2, ...).
# NOTE(review): the variable names and file names are crossed -- `sentiment_test_data`
# reads the *train* file and `sentiment_train_data` reads the *test* file. Confirm
# whether this swap is deliberate before relying on either name.
sentiment_test_data = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Masters Thesis/DistilLaBSE/data/twitter-2016train-A.txt',
    sep='\t',
    header=None,
)
sentiment_train_data = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Masters Thesis/DistilLaBSE/data/twitter-2016test-A.txt',
    sep='\t',
    header=None,
)


In [3]:
# filter removes twitter handles, websites, pictures, links. removes hashtag symbol but keeps the word.
def filter(text):
    """Strip Twitter-specific noise from a tweet.

    The tweet is split on whitespace and each token is handled as follows:

    * tokens starting with ``@`` (user handles) are dropped;
    * tokens ending in ``.com`` or ``.org`` (bare website names) are dropped;
    * tokens starting with ``pic.twitter.com``, ``http`` or ``www`` (picture
      and web links) are dropped;
    * tokens starting with ``#`` keep their word but lose the ``#`` characters
      at both ends; a token made only of ``#`` is dropped;
    * every other token is kept unchanged.

    Ordinary words that merely end in "com"/"org" (e.g. "sitcom") or start
    with "pic" (e.g. "picture") are kept.

    NOTE: the name shadows the built-in ``filter``; it is left as is because
    callers refer to this function by that name.

    Args:
        text: The raw tweet as a string.

    Returns:
        The kept tokens, each followed by a single space (so a non-empty
        result ends with a trailing space); ``''`` when nothing is kept.
    """
    kept_words = []
    for word in text.split():
        if word.startswith('@'):
            continue
        # Require the dot so that only domain-like tokens are removed.
        if word.endswith(('.com', '.org')):
            continue
        if word.startswith(('pic.twitter.com', 'http', 'www')):
            continue
        if word.startswith('#'):
            word = word.strip('#')
            if not word:
                # A bare '#' carries no text; skip it instead of emitting a stray space.
                continue
        kept_words.append(word)
    # Build the result once instead of repeatedly concatenating strings.
    return ''.join(word + ' ' for word in kept_words)

In [4]:
# creates features for sentiment classifier using labse model
def feature_extraction(text, model):
    """Embed a single tweet with `model` and return its normalised pooled output.

    The tweet is first cleaned with `filter`, then tokenised (truncated to at
    most 64 tokens) with the module-level `tokenizer`, moved to the
    module-level `device`, and run through `model` without gradient tracking.

    Args:
        text: Raw tweet text.
        model: Callable model whose output exposes `pooler_output`
            (presumably a Hugging Face LaBSE model -- confirm against the caller).

    Returns:
        A list of NumPy arrays, one per row of the L2-normalised pooled output.
    """
    cleaned_text = filter(text)
    encoded = tokenizer(
        cleaned_text,
        padding=True,
        truncation=True,
        max_length=64,
        return_tensors='pt',
    )

    # Inference only, so no gradients are needed for the forward pass.
    with torch.no_grad():
        outputs = model(**encoded.to(device))

    pooled = outputs.pooler_output
    # p=2.0 / dim=1 are the defaults of normalize, spelled out for clarity.
    unit_pooled = torch.nn.functional.normalize(pooled, p=2.0, dim=1)
    return list(unit_pooled.cpu().numpy())

In [5]:
# data prep pipeline
def data_prep(dataset, model):
    """Turn a tweet DataFrame into feature and label arrays.

    For every row, the value under column label 2 is passed to
    `feature_extraction` as the tweet text and the value under column label 1
    is collected as the target.

    Args:
        dataset: DataFrame whose column 1 holds the label and column 2 the
            tweet text (presumably a SemEval file read with `header=None` --
            confirm against the caller).
        model: Model forwarded unchanged to `feature_extraction`.

    Returns:
        A tuple `(X, y)` of NumPy arrays built from the per-row features and
        the per-row labels, in row order.
    """
    features = []
    labels = []
    # iterrows() is a generator without a length, so pass `total` explicitly;
    # otherwise the progress bar cannot show a percentage or an ETA.
    for _, row in tqdm(dataset.iterrows(), total=len(dataset)):
        features.append(feature_extraction(text=row[2], model=model))
        labels.append(row[1])
    return np.array(features), np.array(labels)