In [None]:
# install all packages needed
!pip install httpx==0.27.2
!pip install transformers sentencepiece

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# No `cd` into the shared drive is done here: a `!cd ...` line runs in a throwaway
# subshell and never changes the kernel's working directory (use `%cd` if that is
# really needed). Files on the drive are addressed by absolute path instead.

Mounted at /content/drive


In [None]:
# download the model used for emotion labeling
!git clone https://huggingface.co/hplisiecki/word2affect_english

"""
@article{Plisiecki_Sobieszek_2023,
  title={Extrapolation of affective norms using transformer-based neural networks and its application to experimental stimuli selection},
  author={Plisiecki, Hubert and Sobieszek, Adam},
  journal={Behavior Research Methods},
  year={2023},
  pages={1-16}
  doi={https://doi.org/10.3758/s13428-023-02212-3}
}

"""

Cloning into 'word2affect_english'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 41 (delta 17), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (41/41), 324.63 KiB | 6.76 MiB/s, done.
Filtering content: 100% (2/2), 846.68 MiB | 67.90 MiB/s, done.


'\n@article{Plisiecki_Sobieszek_2023,\n  title={Extrapolation of affective norms using transformer-based neural networks and its application to experimental stimuli selection},\n  author={Plisiecki, Hubert and Sobieszek, Adam},\n  journal={Behavior Research Methods},\n  year={2023},\n  pages={1-16}\n  doi={https://doi.org/10.3758/s13428-023-02212-3}\n}\n\n'

In [None]:
import pandas as pd
from openai import OpenAI
import httpx
import torch
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import scipy.stats as stats
from word2affect_english.model_script import CustomModel # importing the custom model class
from transformers import BertModel, BertTokenizer, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration
import math
import pickle
import os

# Fetch the NLTK tokenizer and POS-tagger resources quietly (no progress output).
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_resource, quiet=True)

# Shared-drive folder that save_obj prefixes to every file name it writes.
dir_path = "/content/drive/Shareddrives/cs224w/"


In [None]:
# what does our starting data look like?

# load the raw posts and cast the columns used later to explicit dtypes in one step
# NOTE(review): casting to str presumably turns missing text into the literal string
# 'nan' - confirm whether the CSV contains empty posts.
vents_path = "/content/drive/Shareddrives/cs224w/vent_data_community.csv"
column_dtypes = {
    'text_cleaned': str,
    'reactions': int,
    'emotion_category': str,
    'emotion': str,
}
df = pd.read_csv(vents_path).astype(column_dtypes)

df.head()

Unnamed: 0,user_id,created_at,reactions,text_cleaned,emotion,emotion_category
0,f281696f-5be8-4b4c-bc44-056ebd6f4157,2018-05-11 07:20:36.284,1,"DYING is 5th on trending, it almost has 300 ...",Good,Dog Day
1,f281696f-5be8-4b4c-bc44-056ebd6f4157,2018-04-20 20:26:44.275,999,DYING [ ] [ YIKES ] [ FUCK ] [ HELP ] _The ...,Melting,Earth Day '18
2,c4792251-3bc7-4885-a3d0-41dd239225ca,2017-11-29 11:51:05.374,0,Wheres the suicidal option on here,Sad,Sadness
3,c4792251-3bc7-4885-a3d0-41dd239225ca,2017-10-07 22:29:01.744,0,I got stood up today... that makes this the th...,Disappointed,Sadness
4,c4792251-3bc7-4885-a3d0-41dd239225ca,2017-10-05 23:02:48.134,2,I hope I dodnt lose her doing something so fuc...,Ashamed,Sadness


In [None]:
# shape the data so we have one row for each user
# every per-post column is collected into a list per user_id
list_columns = ['text_cleaned', 'reactions', 'emotion_category', 'emotion']
grouped_df = (
    df.groupby('user_id')
      .agg({column: list for column in list_columns})
      .reset_index()
)

In [None]:
# Sanity check: (rows, columns) of the grouped frame, which has one row per user_id.
grouped_df.shape

(17107, 5)

In [None]:
def model_initializer(embedding_model = "bert-base-uncased"):
    """
    Load every pretrained model once so later cells can reuse them.

    Input: embedding_model - name or path of the BERT checkpoint used for embeddings
    Output: tuple of (T5 tokenizer, T5 model, embedding tokenizer, embedding model,
            VAD model, VAD tokenizer)
    """
    # Summarization model; it is the only model moved to the GPU when one is available.
    summarizer_name = "t5-small"
    sum_tokenizer = T5Tokenizer.from_pretrained(summarizer_name)
    sum_model = T5ForConditionalGeneration.from_pretrained(summarizer_name)
    sum_model.to("cuda" if torch.cuda.is_available() else "cpu")

    # The embedding tokenizer and model are loaded from the same checkpoint name.
    # The loaded model gets its own local name so the `embedding_model` argument
    # keeps meaning "checkpoint name" throughout the function.
    tokenizer = BertTokenizer.from_pretrained(embedding_model)
    bert_model = BertModel.from_pretrained(embedding_model)

    # Relative path of the cloned word2affect_english repository.
    # NOTE(review): whether CustomModel.from_pretrained leaves the model in eval
    # mode is not visible here - confirm.
    vad_model_directory = "word2affect_english"
    vad_model = CustomModel.from_pretrained(vad_model_directory)
    vad_tokenizer = AutoTokenizer.from_pretrained(vad_model_directory)
    return sum_tokenizer, sum_model, tokenizer, bert_model, vad_model, vad_tokenizer

def summarize_with_T5(posts, tokenizer, model, max_length=100, num_beams=6):
    """
    Summarizes a list of posts using T5.
    Input: posts - list of post strings (a single string is treated as one post)
           tokenizer - T5 tokenizer used to encode the prompt and decode the summary
           model - T5 generation model; the inputs are moved to model.device
           max_length - max_length passed to model.generate
           num_beams - beam width passed to model.generate
    Output: Summary Text
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(posts, str):
        posts = [posts]

    combined_text = " ".join(posts)
    input_text = "summarize: " + combined_text
    # truncation=True lets the tokenizer cut over-long prompts, so for users with
    # many posts only the leading part of their text is presumably summarized.
    input_ids = tokenizer.encode(input_text, return_tensors='pt', truncation=True)
    input_ids = input_ids.to(model.device)

    summary_ids = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        length_penalty=2.0,
        early_stopping=True
    )

    overall_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return overall_summary

def get_embedding(text, tokenizer, embedding_model):
    """
    Generate embedding for text given tokenizer and embedding model.
    Input: text - string to embed
           tokenizer - tokenizer matching embedding_model
           embedding_model - torch module whose output exposes last_hidden_state
    Output: Embedding (numpy array: mean of the last hidden state over the token axis)
    """
    # Run on whatever device the model lives on, mirroring summarize_with_T5,
    # so the function keeps working if the model is moved to the GPU.
    first_param = next(embedding_model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    with torch.no_grad():
        outputs = embedding_model(**inputs)

    # .cpu() is required before .numpy() whenever the model ran on the GPU.
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

In [None]:
# initialize all the LLMs we will be using for summarization, embedding, and emotion labeling.
# model_initializer is called with its default argument, so the embedding checkpoint is "bert-base-uncased".
t5_tokenizer, t5_model, bert_tokenizer, bert_embedding_model, vad_model, vad_tokenizer = model_initializer()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  state_dict = torch.load(f'{model_dir}/pytorch_model.bin', map_location=torch.device('cpu'))


In [None]:
import numpy as np

def test_get_embedding(token, embed):
    """
    Smoke-test get_embedding on three sentences: two paraphrases and one unrelated.
    Input: token - tokenizer, embed - embedding model (both forwarded to get_embedding)
    Output: None; raises AssertionError when a check fails, prints the similarities otherwise
    """
    sentences = (
        "The cat sat on the mat.",
        "The feline rested on the rug.",
        "An unrelated sentence about cars.",
    )

    # Embed everything first, then validate
    raw_embeddings = [
        get_embedding(sentence, tokenizer=token, embedding_model=embed)
        for sentence in sentences
    ]

    # Type check on the raw return values
    for position, raw in enumerate(raw_embeddings, start=1):
        assert isinstance(raw, (list, np.ndarray)), f"Embedding{position} is not a list or numpy array."

    vectors = [np.array(raw) for raw in raw_embeddings]

    # Each embedding must be a flat vector
    for position, vector in enumerate(vectors, start=1):
        assert vector.ndim == 1, f"Embedding{position} should be a 1D vector."

    # All vectors share one dimensionality
    first, second, third = vectors
    assert len(first) == len(second) == len(third), "All embeddings should have the same dimension."

    # Numeric dtype check
    for position, vector in enumerate(vectors, start=1):
        assert np.issubdtype(vector.dtype, np.number), f"Embedding{position} contains non-numeric values."

    def _cosine(u, v):
        return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

    sim_12 = _cosine(first, second)
    sim_13 = _cosine(first, third)

    print(f"Similarity(text1, text2): {sim_12}")
    print(f"Similarity(text1, text3): {sim_13}")

    # The paraphrase must have positive similarity and must not rank below the
    # unrelated sentence. How strict this is depends on the embedding model.
    assert sim_12 > 0.0, "Similarity between text1 and text2 should be greater than 0."
    assert sim_12 >= sim_13, "Text2 should be as or more similar to Text1 than Text3 is."

    print("All tests passed successfully!")

test_get_embedding(bert_tokenizer, bert_embedding_model)

Similarity(text1, text2): 0.8408461809158325
Similarity(text1, text3): 0.5120489597320557
All tests passed successfully!


In [None]:
def _is_expected_capital(word):
    """True when a capitalized word is the pronoun "I" or is tagged as a proper noun on its own."""
    return word == "I" or nltk.pos_tag([word])[0][1] in ("NNP", "NNPS")

def get_adj_char(text):
    """
    inspired: https://sourojitghosh.github.io/files/Emotions_Friendships_HCII_final.pdf
    Get ACC given text: len(text) plus the number of capital letters left after the
    adjustments below, minus the number of spaces.
    Input: text
    Output: Adjusted Char Count
    """
    # Tokenize sentences and words
    sentences = sent_tokenize(text)
    words = word_tokenize(text)

    # Get char, caps counts
    char_count_unadj = len(text)
    cap_count = sum(1 for char in text if char.isupper())

    # Discount one capital for each capitalized proper noun or "I"
    for word in words:
        if word[0].isupper() and _is_expected_capital(word):
            cap_count -= 1

    # Discount one capital for each capitalized sentence start not already discounted above
    for sentence in sentences:
        sentence_words = word_tokenize(sentence)
        if not sentence_words:
            # nothing to inspect; avoids an IndexError on a token-less sentence
            continue
        first_word = sentence_words[0]
        if first_word[0].isupper() and not _is_expected_capital(first_word):
            cap_count -= 1

    return char_count_unadj + cap_count - text.count(" ")

def get_unique_words(text):
    """
    Count the unique words in text, ignoring case and surrounding punctuation.
    Input: text
    Output: number of distinct words (0 for empty or punctuation-only text)
    """
    punctuation = '.,!?;:"\'()[]'
    # Strip punctuation from every word, not just from the two ends of the whole
    # text, so that "hello," and "hello" count as the same word.
    stripped = (token.strip(punctuation) for token in text.lower().split())
    return len({word for word in stripped if word})

def get_syntax_ftrs(posts):
    """
    Given the user's posts, pull syntactical features.
    Input: list of posts
    Output: dict with the per-post averages 'Len' (characters), 'ACC' (adjusted
            char count) and 'Unique' (unique words), each rounded to 4 decimals;
            all three are 0.0 when there are no posts
    """
    posts = list(posts)
    if not posts:
        # No posts: return neutral values instead of dividing by zero.
        return {'Len': 0.0, 'ACC': 0.0, "Unique": 0.0}

    n_posts = len(posts)
    # Call helper functions to compute
    length_dist = [len(post) for post in posts]
    ACC_dist = [get_adj_char(post) for post in posts]
    uniq_dist = [get_unique_words(post) for post in posts]
    # Aggregate features
    return {'Len': round(sum(length_dist)/n_posts, 4),
            'ACC': round(sum(ACC_dist)/n_posts, 4),
            "Unique": round(sum(uniq_dist)/n_posts, 4)}

In [None]:
def get_react_dist(reactions):
    """
    Given the user reaction counts, pull distribution features.
    Input: list of reaction counts (at least one value)
    Output: dict with Mean, Median, Standard Deviation (sample, ddof=1), Max, Min,
            Skew and IQR
    Raises: ValueError when reactions is empty
    """
    values = list(reactions)
    if not values:
        raise ValueError("get_react_dist needs at least one reaction count")

    minimum = np.min(values)
    maximum = np.max(values)

    # The sample standard deviation is undefined for a single value; report 0.0
    # rather than NaN plus a RuntimeWarning.
    if len(values) > 1:
        std_dev = np.std(values, ddof=1)
    else:
        std_dev = np.float64(0.0)

    # Skewness of a constant series is 0/0; report 0.0 so users with one post or
    # identical reaction counts do not end up with NaN features.
    if minimum == maximum:
        skewness = np.float64(0.0)
    else:
        skewness = stats.skew(values)

    return {
        "Mean": np.mean(values),
        "Median": np.median(values),
        "Standard Deviation": std_dev,
        "Max": maximum,
        "Min": minimum,
        "Skew": skewness,
        "IQR": stats.iqr(values)
    }

In [None]:
def get_vads(tags, vad_map):
    """
    Average the per-tag score vectors of a user's emotion tags.
    Input: tags - list of string tags, each a key of vad_map
           vad_map - dict mapping a tag to its score vector (first four entries are used)
    Output: Dict with the element-wise averages under the keys
            'valence', 'arousal', 'dominance' and 'concreteness'
    """
    # one score vector per tag, averaged column by column
    per_tag_scores = [vad_map[tag] for tag in tags]
    avgs = np.mean(per_tag_scores, axis=0)

    labels = ('valence', 'arousal', 'dominance', 'concreteness')
    return {label: avgs[position] for position, label in enumerate(labels)}


def get_vad(tag, vad_tokenizer, vad_model):
    """
    Score a single emotion tag with the VAD model.
    Input: tag - string, vad_tokenizer / vad_model - tokenizer and model for scoring
    Output: list of four floats taken from model outputs 0, 1, 2 and 4
            (get_vads labels them valence, arousal, dominance, concreteness)
    """
    inputs = vad_tokenizer(tag, return_tensors="pt")
    # Inference only: skip autograd bookkeeping, as get_embedding does.
    with torch.no_grad():
        outputs = vad_model(inputs['input_ids'], inputs['attention_mask'])
    # Output index 3 is deliberately skipped; what it holds is not visible here.
    return [outputs[index].item() for index in (0, 1, 2, 4)]


In [None]:
# helper function for storing objects
def save_obj(obj, name, is_parq=True):
    """
    Write obj to the file dir_path + name + extension.
    Input: obj - object to store (needs a to_parquet method when is_parq is True)
           name - file name without extension
           is_parq - True writes '<name>.parquet', False pickles to '<name>.pkl'
    Output: None
    """
    if not is_parq:
        with open(dir_path + name + '.pkl', 'wb') as handle:
            pickle.dump(obj, handle)
        return

    with open(dir_path + name + '.parquet', 'wb') as handle:
        obj.to_parquet(handle)

In [None]:
# use our loaded model to score each of the emotion categories in our dataset
# (one get_vad call per distinct category, keyed by the category name)

emotion_categories = df['emotion_category'].unique().tolist()
vad_map = {
    category: get_vad(category, vad_tokenizer, vad_model)
    for category in emotion_categories
}

In [None]:
# embed the emotion and emotion category using BERT
# Both label sets go through bert_tokenizer, the tokenizer that belongs to
# bert_embedding_model; another tokenizer (such as the VAD model's) is not
# guaranteed to share BERT's vocabulary or length limits.

emotion_embs = {emotion: get_embedding(emotion, bert_tokenizer, bert_embedding_model) for emotion in list(df['emotion'].unique())}
emotion_cat_embs = {emotion: get_embedding(emotion, bert_tokenizer, bert_embedding_model) for emotion in list(df['emotion_category'].unique())}

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# save all the embeddings
# each dict becomes a two-column frame (label, embedding) and is written through save_obj

emotion_rows = list(emotion_embs.items())
emotion_embs_df = pd.DataFrame(emotion_rows, columns=['emotion', 'embedding'])

emotion_cat_rows = list(emotion_cat_embs.items())
emotion_cat_embs_df = pd.DataFrame(emotion_cat_rows, columns=['emotion_cat', 'embedding'])

for frame, file_stem in ((emotion_embs_df, "emotion_embs"), (emotion_cat_embs_df, "emotion_cat_embs")):
    save_obj(frame, file_stem)

In [None]:
# we also want to get this data on a user level - calculate the reaction distribution and VAD data for each user and all the user's posts
# (the helpers are handed to apply directly; vad_map travels as the extra positional argument)
grouped_df['vad'] = grouped_df['emotion_category'].apply(get_vads, args=(vad_map,))
grouped_df['react_info'] = grouped_df['reactions'].apply(get_react_dist)

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)
  skewness = stats.skew(values)


In [None]:
# loading in created summaries and syntax files populated by prior runs (in case a run fails in the middle) so we can run the rest of the batches
# on a first run, when no cache file exists yet, both lists simply start empty

batch_size = 32
num_batches = math.ceil(len(grouped_df) / batch_size)

summary_path = "/content/drive/Shareddrives/cs224w/summaries.pkl"
syntax_path = "/content/drive/Shareddrives/cs224w/syntax.pkl"

def _load_cached_list(path):
    """Return the pickled list stored at path, or an empty list when the file does not exist."""
    if not os.path.exists(path):
        return []
    # pickle.load can execute code embedded in the file; only point this at files
    # written by this notebook.
    with open(path, 'rb') as file:
        return pickle.load(file)

summaries = _load_cached_list(summary_path)
syntaxes = _load_cached_list(syntax_path)

print(num_batches)
print(len(syntaxes), len(summaries))

535
16672 16672


In [None]:
# use T5 to summarize all the posts of a given user. this condenses each user into a single text,
# which helps balance out the fact that some users have many posts and others only a few

# Resume bookkeeping: keep only results that both lists agree on, drop a trailing
# partial batch, and treat a complete run as finished so re-running this cell
# never appends duplicates.
total_users = len(grouped_df)
completed = min(len(summaries), len(syntaxes), total_users)
if completed < total_users:
    completed -= completed % batch_size
del summaries[completed:]
del syntaxes[completed:]


def _save_summary_checkpoint():
    """Write the summaries and syntax features gathered so far to their pickle files."""
    with open(summary_path, 'wb') as file:
        pickle.dump(summaries, file)
    with open(syntax_path, 'wb') as file:
        pickle.dump(syntaxes, file)


first_batch = math.ceil(completed / batch_size)
for i in range(first_batch, num_batches):
    # each element of batch is one user's list of posts
    batch = grouped_df['text_cleaned'].iloc[i*batch_size:(i+1)*batch_size].tolist()
    batch_summaries = [summarize_with_T5(posts, t5_tokenizer, t5_model) for posts in batch]
    batch_syntax = [get_syntax_ftrs(posts) for posts in batch]

    summaries.extend(batch_summaries)
    syntaxes.extend(batch_syntax)

    # periodic checkpoint so a crash loses at most 20 batches
    if i % 20 == 0:
        _save_summary_checkpoint()

    print(i)

# final checkpoint: persists the batches processed after the last periodic save
if first_batch < num_batches:
    _save_summary_checkpoint()

521
522
523
524
525
526
527
528
529
530
531
532
533
534


In [None]:
# Attach the per-user summaries and syntax features as new columns.
# The printed row count and list length should match; pandas rejects a list of the wrong length.
print(grouped_df.shape)
print(len(summaries))
for column_name, column_values in (('summary', summaries), ('syntax', syntaxes)):
    grouped_df[column_name] = column_values

(17107, 8)
17107


In [None]:
import os
batch_size = 32
num_batches = math.ceil(len(grouped_df) / batch_size)

# now that we have the summaries, embed them using bert

embed_path = "/content/drive/Shareddrives/cs224w/summary_embed.pkl"
if os.path.exists(embed_path):
    # pickle.load can execute code embedded in the file; only point this at files
    # written by this notebook.
    with open(embed_path, 'rb') as file:
        summary_embed = pickle.load(file)
else:
    summary_embed = []

# Resume bookkeeping: drop a trailing partial batch and treat a complete run as
# finished, so re-running this cell never appends duplicate embeddings.
total_users = len(grouped_df)
completed = min(len(summary_embed), total_users)
if completed < total_users:
    completed -= completed % batch_size
del summary_embed[completed:]
first_batch = math.ceil(completed / batch_size)

print(num_batches)
for i in range(first_batch, num_batches):
    batch = grouped_df['summary'].iloc[i*batch_size:(i+1)*batch_size].tolist()
    # run BERT on each summary text of this batch
    batch_embeds = [get_embedding(text, bert_tokenizer, bert_embedding_model) for text in batch]
    summary_embed.extend(batch_embeds)

    # periodic checkpoint so a crash loses at most 20 batches
    if i % 20 == 0:
        with open(embed_path, 'wb') as file:
            pickle.dump(summary_embed, file)
    print(i)

# final checkpoint: persists the batches processed after the last periodic save
if first_batch < num_batches:
    with open(embed_path, 'wb') as file:
        pickle.dump(summary_embed, file)

In [None]:
# Sanity check before attaching: the number of embeddings printed below should equal the number of rows in grouped_df.
print(grouped_df.shape)
print(len(summary_embed))
# One summary embedding per user row, stored in the new 'sum_emb' column.
grouped_df['sum_emb'] = summary_embed

(17107, 8)
17107


In [None]:
# finally, save the dataframe with all the created features.
# save_obj is called with its default is_parq=True, so this writes features_unravelled.parquet under dir_path.
save_obj(grouped_df, "features_unravelled")