In [16]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import pickle
import PIL
import string
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [17]:
# Raw-data directory (not referenced by the cells visible in this notebook).
path = '../Data'
# Directory the preprocessed CSVs are read from and all outputs are written to.
save_path = '../Preprocessed_Data'

In [18]:
# Load the three splits from the preprocessed-data directory, in the order
# train / test / dev (the test and dev files are the "seen" variants).
train_df, test_df, dev_df = (
    pd.read_csv(save_path + csv_name)
    for csv_name in ('/df_train.csv', '/df_test_seen.csv', '/df_dev_seen.csv')
)

In [19]:
# Pull the raw `text` column out of each split as an array of values.
train_text, test_text, dev_text = (
    split_frame['text'].values for split_frame in (train_df, test_df, dev_df)
)

In [20]:
def remove_extra_whitespace(text):
    """Collapse every run of whitespace in `text` to a single space.

    Leading and trailing whitespace is dropped as a consequence of
    splitting on whitespace and re-joining the pieces.
    """
    words = text.split()
    return ' '.join(words)

def lower_case(text):
    """Return `text` converted to lower case via its `.lower()` method."""
    lowered = text.lower()
    return lowered

In [21]:
# Clean each split in a single pass: normalise whitespace first, then
# lower-case the result (same order of operations as doing two passes).
train_cleaned = [lower_case(remove_extra_whitespace(raw)) for raw in train_text]
test_cleaned = [lower_case(remove_extra_whitespace(raw)) for raw in test_text]
dev_cleaned = [lower_case(remove_extra_whitespace(raw)) for raw in dev_text]

In [22]:
# Attach the cleaned strings to their originating frame as a new column.
for split_frame, cleaned_texts in (
    (train_df, train_cleaned),
    (test_df, test_cleaned),
    (dev_df, dev_cleaned),
):
    split_frame['Cleaned_Text'] = cleaned_texts

In [23]:
# Display the training frame to check that the `Cleaned_Text` column was added.
train_df

Unnamed: 0,id,img,label,text,img_exists,Cleaned_Text
0,42953,img/42953.png,0,its their character not their color that matters,True,its their character not their color that matters
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...,True,don't be afraid to love again everyone is not ...
2,13894,img/13894.png,0,putting bows on your pet,True,putting bows on your pet
3,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h...",True,"everybody loves chocolate chip cookies, even h..."
4,76932,img/76932.png,0,fine you're right. now can we fucking drop it?,True,fine you're right. now can we fucking drop it?
...,...,...,...,...,...,...
4187,50739,img/50739.png,1,whats the difference between a scout and a jew...,True,whats the difference between a scout and a jew...
4188,19372,img/19372.png,1,if you say these muslim protesters are violent...,True,if you say these muslim protesters are violent...
4189,10423,img/10423.png,1,nobody wants to hang auschwitz me,True,nobody wants to hang auschwitz me
4190,16492,img/16492.png,1,having a bad day? you could be a siamese twin ...,True,having a bad day? you could be a siamese twin ...


In [34]:
from transformers import AutoTokenizer, AutoModel
import torch
# Load the BERT tokenizer and the bare encoder from one checkpoint id, so the
# two can never be loaded from different checkpoints by accident.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

def get_embeddings(sentences, batch_size=32):
    """Embed each sentence as the model's hidden state at sequence position 0.

    The sentences are tokenised and run through the module-level `model` in
    mini-batches rather than in one giant forward pass, so peak memory is
    bounded by `batch_size` instead of by the size of the whole dataset.

    Args:
        sentences: Iterable of strings (list, NumPy array, pandas Series, ...).
        batch_size: Number of sentences per forward pass. Must be >= 1.

    Returns:
        A CPU tensor of shape (len(sentences), hidden_size) holding, for each
        sentence, the last-layer hidden state of its first token (the [CLS]
        token for BERT tokenizers). An empty input yields a (0, hidden_size)
        tensor.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    sentences = list(sentences)
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    if not sentences:
        # Nothing to tokenise; return an empty result of the right width
        # instead of handing the tokenizer an empty batch.
        return torch.empty((0, model.config.hidden_size))

    # Run on whatever device the model currently lives on.
    device = next(model.parameters()).device

    cls_chunks = []
    with torch.no_grad():
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            # Padding is applied per batch (to that batch's longest sentence).
            # The attention mask hides padded positions, so results should
            # match whole-dataset padding up to floating-point rounding.
            tokens = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            tokens = {name: tensor.to(device) for name, tensor in tokens.items()}

            outputs = model(**tokens)
            # Keep only position 0 of each sequence and move it off the
            # accelerator right away so only one batch stays resident there.
            cls_chunks.append(outputs.last_hidden_state[:, 0, :].cpu())

    return torch.cat(cls_chunks, dim=0)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [37]:
# Embed the cleaned text of each split. The calls are kept separate so a
# failure on a later split does not discard an already computed result.
train_embeddings = get_embeddings(train_df['Cleaned_Text'].to_numpy())
test_embeddings = get_embeddings(test_df['Cleaned_Text'].to_numpy())
dev_embeddings = get_embeddings(dev_df['Cleaned_Text'].to_numpy())

In [38]:
# Persist each split's embedding tensor as a pickle file under `save_path`.
for pickle_suffix, embedding_tensor in (
    ('/train_bert_embeddings.pkl', train_embeddings),
    ('/test_bert_embeddings.pkl', test_embeddings),
    ('/dev_bert_embeddings.pkl', dev_embeddings),
):
    with open(save_path + pickle_suffix, 'wb') as out_file:
        pickle.dump(embedding_tensor, out_file)


In [40]:
# Write each frame (now including `Cleaned_Text`) back out as CSV, omitting
# the pandas row index from the file.
for split_frame, csv_suffix in (
    (train_df, '/df_train_cleaned.csv'),
    (test_df, '/df_test_cleaned.csv'),
    (dev_df, '/df_dev_cleaned.csv'),
):
    split_frame.to_csv(save_path + csv_suffix, index=False)