In [33]:
import pandas as pd
import string
import re
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
import numpy as np
import nltk
import csv
from nltk.corpus import stopwords
# nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer 
ps=PorterStemmer()

import tensorflow as tf
import torch
from transformers import BertTokenizer, BertModel
from collections import defaultdict
import json

In [34]:
def remove_stop_words(text: string):
    return ' '.join([item for item in text.split() if item not in stopwords.words('english')])

def remove_punctuations(text: string):
    return "".join([char for char in text if char not in string.punctuation])

def stem_text(text: string):
    return [ps.stem(word) for word in text]

def delete_usernames(text: str) -> str:
    pattern = r'@\w+'
    return re.sub(pattern, '@user', text)

In [35]:
before_corpus_list = []
after_corpus_list = []

tweets = {}

with open("Pre-processing.csv", 'r', newline='') as csvFile:
    reader = csv.reader(csvFile)
    next(reader) #skip header
    for line in reader:
        person = line[0].strip()
        tweets[person] = {}

        b_tweet = delete_usernames(remove_punctuations(remove_stop_words(line[3].strip())))
        a_tweet = delete_usernames(remove_punctuations(remove_stop_words(line[4].strip())))

        before_corpus_list.append(b_tweet)
        after_corpus_list.append(a_tweet)
        tweets[person]["Before"] = b_tweet.lower()
        tweets[person]["After"] = a_tweet.lower()


In [36]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_word_embeddings(sentence):
    # Tokenize the sentence
    encoded_input = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
    
    # Get the BERT embeddings
    with torch.no_grad():
        output = model(**encoded_input)
    
    # Get word-level embeddings
    token_embeddings = output.last_hidden_state[0].numpy()
    
    # Map tokens back to original words
    word_vectors = {}
    words = sentence.split()
    token_ids = encoded_input['input_ids'][0]
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    
    word_index = 0
    for i, token in enumerate(tokens):
        if token.startswith('##'):
            if word_index > 0:
                word_vectors[words[word_index-1]] = \
                    (word_vectors[words[word_index-1]] * (i-start) + token_embeddings[i]) / (i-start+1)
        elif token in ['[CLS]', '[SEP]', '[PAD]']:
            continue
        else:
            if word_index < len(words):
                word_vectors[words[word_index]] = token_embeddings[i]
                start = i
                word_index += 1
    
    return word_vectors

# Process tweets and generate word vectors
result = defaultdict(dict)
for person, person_tweets in tweets.items():
    for tweet_type in ["Before", "After"]:
        result[person][tweet_type] = get_word_embeddings(person_tweets[tweet_type])

# Print a sample of the result
for person in list(result.keys())[:2]:  # Print for first 2 persons
    print(f"\n{person}:")
    for tweet_type in ["Before", "After"]:
        print(f"  {tweet_type}:")
        for word, vector in list(result[person][tweet_type].items())[:2]:  # Print first 2 words
            print(f"    {word}: {vector[:5]}...")  # Print first 5 values of each vector
        if len(result[person][tweet_type]) > 2:
            print("    ...")


RealSpikeCohen:
  Before:
    just: [-0.6662592   0.01738216  0.49830064  0.04646187  0.696845  ]...
    month: [-0.64293134 -0.33376592  0.37095174 -0.09896091  0.30240285]...
    ...
  After:
    hamas: [-0.58247954  0.08837597  0.62763894 -0.31676415  0.17383316]...
    great: [ 0.28736863  0.07461813  0.68630224 -0.4492811  -0.11793958]...
    ...

ComicDaveSmith:
  Before:
    if: [-0.00075689  0.15637913  0.17640428 -0.15965573  0.06332569]...
    mission: [ 0.5582286  -0.18407351  0.16965123  0.10473238  0.34409454]...
    ...
  After:
    as: [-0.5240849   0.34253213  1.6846156   0.37956646  0.01245819]...
    israeli: [-0.27700266  0.26969013  0.09555568 -0.62538004  0.31419805]...
    ...


In [41]:
class NumpyEncoder(json.JSONEncoder):
    """ Special json encoder for numpy types """
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

dumped = json.dumps(result, cls=NumpyEncoder)

with open('Tokens.json', 'w') as fp:
    json.dump(dumped, fp)