In [1]:
from time import sleep
import csv
import json
import string
import re
import ast
from collections import defaultdict, Counter

import pandas as pd

import numpy as np
import math
import sklearn
from sklearn.model_selection import train_test_split

import torch
import tensorflow as tf
from transformers import BertTokenizer, BertModel, AutoModel, AutoTokenizer
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer 
ps=PorterStemmer()

# import matplotlib.pyplot as plt
# % matplotlib inline

In [2]:
# Load the stereotype dictionary
dictionary = "Create Dictionary/Stereotype_Dictionary.json"
with open(dictionary, 'r') as f:
    stereotype_dict = json.load(f)

stereotype_df = pd.DataFrame(list(stereotype_dict.items()), columns=['category', 'word'])
stereotype_df = stereotype_df.explode("word")
stereotype_df["word"] = stereotype_df["word"].str.lower()

stereotype_list = stereotype_df.word.tolist()

print(stereotype_list)

['unreliable', 'supernatural', 'uneconomical', 'flat', 'unconvinced', 'artless', 'retarded', 'nonprehensile', 'unreasonable', 'foolish', 'uncertain', 'ignorant', 'artificial', 'aft', 'sharp', 'cockamamie', 'unintelligent', 'inexperienced', 'backward', 'decertify', 'unentitled', 'ineligible', 'weak', 'bungling', 'incapable', 'unsusceptible', 'semiskilled', 'uninformed', 'unable', 'unnatural', 'imprudent', 'stale', 'awkward', 'humble', 'worry', 'impolitic', 'derestrict', 'dense', 'naive', 'bad', 'chancy', 'disqualify', 'cool', 'nontechnical', 'indeterminate', 'harebrained', 'nasty', 'brainless', 'clean', 'up_in_the_air', 'cold', 'unsealed', 'unhealthy', 'mild', 'indefinite', 'contingent', 'ineffective', 'asinine', 'doubtful', 'rattlebrained', 'dimmed', 'unskilled', 'inefficient', 'inadvisable', 'rugged', 'styleless', 'helpless', 'outright', 'ambivalent', 'ill-advised', 'unfashionable', 'speechless', 'maladroit', 'hopeless', 'salty', 'manual', 'anserine', 'unmodified', 'imprecise', 'uncon

In [3]:
def remove_stop_words(text: string):
    return ' '.join([item for item in text.split() if item not in stopwords.words('english')])

def remove_punctuations(text: string):
    return "".join([char for char in text if char not in string.punctuation])

def delete_usernames(text: str) -> str:
    pattern = r'@\w+'
    return re.sub(pattern, '@user', text)

def remove_non_words(text: string):
    return re.sub("[^a-zA-Z]",  # Search for all non-letters
                  " ",          # Replace all non-letters with spaces
                  str(text))    # Make sure the text input is in string format


tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Function to split "After Corpus" into individual sentences and create a new DataFrame
def split_sentences(df, status: str):
    abbreviations = {
        "U.S.": "USA",
        "U.K.": "UK",
        "e.g.": "for example,",
        "i.e.": "such as,",
        "U.N.": "UN",
        "Gov.": "Governor",
        "etc.": "..."
    }

    # Initialize an empty list to store the rows of the new DataFrame
    rows = []

    # Iterate through each row in the original DataFrame
    for idx, row in df.iterrows():
        # Extract the relevant data
        subject_id = row['Subject_ID']
        tweet_id = row['Tweet_ID']
        corpus = row[f'{status}_Corpus']

        # Replace abbreviations with placeholders
        for abbr, placeholder in abbreviations.items():
            corpus = corpus.replace(abbr, placeholder)
        
        # Split the corpus into sentences
        sentences = re.split(r'(?<=[.!?])\s+', corpus)

        # Add each sentence as a new row in the list, keeping track of the tweet ID and subject ID
        for sentence in sentences:
            rows.append({'Subject_ID': subject_id, 'Tweet_ID': tweet_id, 'Sentence': sentence})

    # Create a new DataFrame from the list of rows
    new_df = pd.DataFrame(rows)
    return new_df

def get_segments_ids(df, subid):
    subjects = df[subid].unique()
    for sub in subjects:
        segments_ids = []
        for idx, row in df.iterrows():
            # Create a list of repeated IDs for each sentence
            segment_id_list = [idx] * len(row['Tokens'])
            segments_ids.append(segment_id_list)
        df["Segments_IDs"] = segments_ids
    return df["Segments_IDs"]




GETTING EMBEDDINGS AT THE SENTENCE LEVEL

In [4]:
def process_bert_sentences(sentences_df):
    """
    Process sentences through BERT after padding, with CUDA acceleration.
    """
    
    outputs = {} # dictionary to store all the outputs per subject
    
    # Get all unique subjects
    subjects = sentences_df['Subject_ID'].unique()

    # Check and set up CUDA device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    for subj in subjects:
        print(f"Subject: {subj}")
        # Subset df to just that subject
        df_subj = sentences_df[sentences_df["Subject_ID"] == subj]
        
        sentences = df_subj['Sentence'].tolist()

        # Initialize model and move to GPU
        model = BertModel.from_pretrained('bert-base-uncased',
                                          output_hidden_states=True,
                                          return_dict=True)
        model.to(device)
    
        # Evaluation mode
        model.eval()
        
        # Tokenize with padding and move to GPU
        inputs = tokenizer(sentences, 
                           padding='max_length', 
                           truncation=True, 
                           max_length=512, 
                           return_tensors="pt")
        
        # Move input tensors to GPU
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Process through BERT
        with torch.no_grad():
            # When return_dict=True, this returns a BaseModelOutputWithPoolingAndCrossAttentions object
            output = model(**inputs)
            
        # Handle the output based on its type
        if isinstance(output, dict):
            # If it's already a dictionary (with return_dict=True)
            processed_output = {k: v.cpu() if hasattr(v, 'cpu') else v for k, v in output.items()}
        else:
            # If it's a tuple (default BERT output)
            processed_output = {
                'last_hidden_state': output[0].cpu(),
                'pooler_output': output[1].cpu(),
                'hidden_states': tuple(h.cpu() for h in output[2]) if len(output) > 2 else None
            }
        
        outputs[subj] = processed_output
        
    return outputs

def tokenize_function(row, return_type, return_tensor=True):
    """
    Updated tokenization function with CUDA support
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    if return_tensor:
        tokenizer_results = tokenizer(row, truncation=True, return_tensors="pt")
        input_ids = tokenizer_results['input_ids'].to(device)
        token_type_ids = tokenizer_results['token_type_ids'].to(device)
        attention_mask = tokenizer_results['attention_mask'].to(device)
        
        if return_type == 'input_ids':
            return input_ids
        elif return_type == 'token_type_ids':
            return token_type_ids
        elif return_type == 'attention_mask':
            return attention_mask
        else:
            return 'return_type not recognized. Please enter "input_ids", "token_type_ids", or "attention_mask".'
    else:
        tokenizer_results = tokenizer(row, truncation=True)
        input_ids = tokenizer_results['input_ids']
        token_type_ids = tokenizer_results['token_type_ids']
        attention_mask = tokenizer_results['attention_mask']
        
        if return_type == 'input_ids':
            return input_ids
        elif return_type == 'token_type_ids':
            return token_type_ids
        elif return_type == 'attention_mask':
            return attention_mask
        else:
            return 'return_type not recognized. Please enter "input_ids", "token_type_ids", or "attention_mask".'


GETTING EMBEDDINGS AT THE WORD LEVEL

In [5]:
def process_bert_words(sentences_df, status: str, batch_size=32, accumulation_steps=4):
    """
    Process sentences through BERT to get word-level representations, with CUDA acceleration and memory-efficient processing.
    
    Parameters:
    - sentences_df: DataFrame containing sentences
    - status: String to identify the processing status
    - batch_size: Number of sentences to process in each batch
    - accumulation_steps: Number of batches to accumulate gradients (helps with memory management)
    """
    
    word_outputs = {}  # dictionary to store all the word-level outputs per subject
    
    # Get all unique subjects
    subjects = sentences_df['Subject_ID'].unique()

    # Check and set up CUDA device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    for subj in subjects:
        print(f"Processing Subject: {subj}")
        
        # Subset df to just that subject
        df_subj = sentences_df[sentences_df["Subject_ID"] == subj]
        sentences = df_subj['Sentence'].tolist()

        # Initialize model and move to GPU
        model = BertModel.from_pretrained('bert-base-uncased',
                                          output_hidden_states=True,
                                          return_dict=True)
        model.to(device)
        model.eval()  # Set to evaluation mode
        
        # Process sentences in batches
        for start in range(0, len(sentences), batch_size * accumulation_steps):
            # Select batch of sentences
            batch_sentences = sentences[start:start + batch_size * accumulation_steps]
            
            # Tokenize with padding and move to GPU
            inputs = tokenizer(batch_sentences, 
                               padding='max_length', 
                               truncation=True, 
                               max_length=512, 
                               return_tensors="pt")
            
            # Move input tensors to GPU
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Process through BERT
            with torch.no_grad():
                output = model(**inputs)
            
            # Extract word-level representations
            word_level_outputs = output.last_hidden_state
            
            # Iterate over each sentence and get the word vectors
            for i, sentence in enumerate(batch_sentences):
                word_tokens = tokenizer.tokenize(sentence)
                word_vectors = word_level_outputs[i, :len(word_tokens), :]
                
                # Store the word-level outputs per subject
                for j, word in enumerate(word_tokens):
                    if word in stereotype_list:
                        if subj not in word_outputs:
                            word_outputs[subj] = []
                        word_outputs[subj].append({
                            'subject_id': subj,
                            'word': word,
                            'vector': word_vectors[j].cpu().numpy()
                        })
            
            # Free up GPU memory
            torch.cuda.empty_cache()

    # Convert the dictionary to a pandas DataFrame and save it to a CSV file
    df = pd.DataFrame()
    for subj, words in word_outputs.items():
        df = pd.concat([df, pd.DataFrame(words)], ignore_index=True)
    df.to_csv(f"D:/BERT vectors/BERT_{status}_word_vectors.csv", index=False)

    return word_outputs

In [6]:
before = pd.read_csv(f"Cleaned Data/New_Before_NN_Cleaned.csv")
after = pd.read_csv(f"Cleaned Data/New_After_NN_Cleaned.csv")

before.head()


Unnamed: 0,Subject_ID,Affiliation,Followers,Tweet_ID,Affiliation_Contrast,Before_Dates,Before_Corpus,Before_Likes,Before_Retweets,Before_Replies,Before_Views
0,26,Democratic Party,1000000,5438,0.5,"October 13, 2022 02:51 AM",He makes me sick to my stomach. Hadn’t those f...,3521,405,69,42919
1,92,Democratic Party,1500000,3168,0.5,"September 21, 2023 12:54 PM","""There is no deal between the United States, B...",1420,459,90,122221
2,87,Republican Party,2700000,1502,-0.5,"May 19, 2023 05:39 AM",Biden gang walks back claim drone strike kille...,1873,575,75,115012
3,52,Democratic Party,437700,9781,0.5,"November 04, 2022 05:00 PM",I try not to invoke my Jewish identity because...,1442,271,23,27317
4,41,Republican Party,7000000,3261,-0.5,"July 24, 2023 02:18 PM",The current Israeli government had to pass the...,5228,892,827,1489217


In [7]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)




In [8]:
# Apply the function to split sentences
before_sentences = split_sentences(before, "Before")


# Apply all cleaning functions to the Before Corpus column
before_sentences["Sentence"] = (before_sentences["Sentence"]
                                .apply(delete_usernames)
                                .apply(remove_punctuations)
                                .apply(remove_stop_words)
                                .apply(remove_non_words))

# Apply sentence markers
before_sentences["Sentence"] = ("[CLS] "+ before_sentences["Sentence"] + " [SEP]")

# Show the first few rows of the resulting DataFrame
before_sentences.head()

Unnamed: 0,Subject_ID,Tweet_ID,Sentence
0,26,5438,[CLS] He makes sick stomach [SEP]
1,26,5438,[CLS] Hadn t families suffered enough [SEP]
2,26,5438,[CLS] What kind monster needs exacerbate pain ...
3,26,5438,[CLS] What parent planet stand someone like [SEP]
4,26,5438,[CLS] I don t get [SEP]


In [9]:
after_sentences = split_sentences(after, "After")

# Apply all cleaning functions to the After Corpus column
after_sentences["Sentence"] = (after_sentences["Sentence"]
                                .apply(delete_usernames)
                                .apply(remove_punctuations)
                                .apply(remove_stop_words)
                                .apply(remove_non_words))

# Apply sentence markers
after_sentences["Sentence"] = ("[CLS] "+ after_sentences["Sentence"] + " [SEP]")

# Show the first few rows of the resulting DataFrame
after_sentences.head()

Unnamed: 0,Subject_ID,Tweet_ID,Sentence
0,87,2038,[CLS] Today user user sued extremist leftist g...
1,87,2038,[CLS] Media Matters notorious trying suppress ...
2,87,2038,[CLS] After reading complaint I reminded NBC N...
3,87,2038,[CLS] COMPLAINT [SEP]
4,87,2038,[CLS] Defendant Media Matters America Media M...


In [10]:
# Tokenize our sentence with the BERT tokenizer.
before_sentences["Tokens"] = before_sentences["Sentence"].apply(tokenize_function, return_type = "input_ids", return_tensor = False)
before_sentences["Token_type"] = before_sentences["Sentence"].apply(tokenize_function, return_type = "token_type_ids", return_tensor = False)
before_sentences["Attention_mask"] = before_sentences["Sentence"].apply(tokenize_function, return_type = "attention_mask", return_tensor = False)

# Get Segment IDs based on the number of sentences, all tokens in the same sentence get the same ID.
before_sentences["Segments_IDs"] = get_segments_ids(before_sentences, "Subject_ID")

# before_sentences.head()

In [11]:
# Tokenize our sentence with the BERT tokenizer.
after_sentences["Tokens"] = after_sentences["Sentence"].apply(tokenize_function, return_type = "input_ids", return_tensor = False)
after_sentences["Token_type"] = after_sentences["Sentence"].apply(tokenize_function, return_type = "token_type_ids", return_tensor = False)
after_sentences["Attention_mask"] = after_sentences["Sentence"].apply(tokenize_function, return_type = "attention_mask", return_tensor = False)

# Get Segment IDs based on the number of sentences, all tokens in the same sentence get the same ID.
after_sentences["Segments_IDs"] = get_segments_ids(after_sentences, "Subject_ID")

# after_sentences.head()

In [12]:
# before_outputs = process_bert_words(before_sentences, "Before")
after_outputs = process_bert_words(after_sentences, "After")


Using device: cuda
Processing Subject: 87


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 79


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 26


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 75


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 94


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 63


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 41


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 28


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 17


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 52


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 16


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 20


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 39


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 93


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 95


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 83


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 74


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 23


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 78


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 92


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 32


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 71


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 60


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 30


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 48


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 77


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing Subject: 62


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# after_sentences.to_csv("Tokenized Data/After_Tokenized.csv", index=False)
# before_sentences.to_csv("Tokenized Data/Before_Tokenized.csv", index=False)

In [14]:
torch.cuda.empty_cache()

In [15]:
# def serialize_bert_outputs(outputs, max_subjects=1):
#     """
#     Convert BERT outputs to a JSON-serializable format
#     Limits to first few subjects to manage memory
#     """
#     serializable_outputs = {}
    
#     # Limit to first max_subjects
#     subject_keys = list(outputs.keys())[:max_subjects]
    
#     for subject in subject_keys:
#         output = outputs[subject]
#         serializable_subject = {}
        
#         for key, value in output.items():
#             try:
#                 if isinstance(value, torch.Tensor):
#                     # Move to CPU and convert to list
#                     # Add shape and dtype information for debugging
#                     print(f"Processing {key}: Tensor shape {value.shape}, dtype {value.dtype}")
#                     serializable_subject[key] = {
#                         'data': value.cpu().numpy().tolist(),
#                         'shape': list(value.shape),
#                         'dtype': str(value.dtype)
#                     }
#                 elif isinstance(value, tuple):
#                     # Handle tuple of tensors (like hidden_states)
#                     processed_tuple = []
#                     for t in value:
#                         if torch.is_tensor(t):
#                             print(f"Tuple item shape {t.shape}, dtype {t.dtype}")
#                             processed_tuple.append({
#                                 'data': t.cpu().numpy().tolist(),
#                                 'shape': list(t.shape),
#                                 'dtype': str(t.dtype)
#                             })
#                         else:
#                             processed_tuple.append(t)
#                     serializable_subject[key] = processed_tuple
#                 elif isinstance(value, (np.ndarray, torch.Tensor)):
#                     # Convert numpy array or remaining tensors to list
#                     serializable_subject[key] = value.cpu().numpy().tolist()
#                 elif isinstance(value, (np.int64, np.int32, int)):
#                     # Convert integers
#                     serializable_subject[key] = int(value)
#                 else:
#                     serializable_subject[key] = str(value)
#             except Exception as e:
#                 print(f"Error processing key {key}: {e}")
#                 serializable_subject[key] = str(value)
        
#         serializable_outputs[str(subject)] = serializable_subject
    
#     return serializable_outputs

In [16]:
# # Memory-efficient serialization
# try:
#     # Only process first subject
#     before_outputs_serializable = serialize_bert_outputs(before_outputs)
    
#     # Saving to JSON with reduced indentation to save memory
#     with open("Tokenized Data//BERT_Before_Tokenized.json", "w") as f:
#         json.dump(before_outputs_serializable, f, indent=2)
    
#     print("Serialization complete. Check the JSON file.")
    
# except Exception as e:
#     print(f"An error occurred during serialization: {e}")
#     # Print more detailed error information
#     import traceback
#     traceback.print_exc()