In [1]:
# Import the required modules
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the FLAN-T5 XL tokenizer and model (both come from the same checkpoint)
checkpoint = "google/flan-t5-xl"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)

# Load the weights first, then move the model to the GPU
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
model = model.to('cuda')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

1. Load gold data
2. Structure gold data in random order for few shot
3. Few shot sentiment analysis with random order each time
4. Save results as they're processed (save as json)

In [7]:
import json
import random

In [8]:
# Load the real instances (the windows that will be classified).
# The encoding is given explicitly: JSON text is UTF-8, whereas open() would
# otherwise decode with the platform's locale encoding.
with open('../data/all_windows.json', 'r', encoding='utf-8') as f:
    all_instances = json.load(f)

In [9]:
# Load the gold data (labelled examples used as few-shot demonstrations).
# The encoding is given explicitly: JSON text is UTF-8, whereas open() would
# otherwise decode with the platform's locale encoding.
with open('../models/gold_new.json', 'r', encoding='utf-8') as f:
    gold_data = json.load(f)

In [10]:
# Collect the gold examples into a plain list (the dict keys are dropped).
# No shuffling happens here; a fresh random order is drawn from this list
# for every classified instance further down.
training_instances = [*gold_data.values()]

In [11]:
# Few-shot sentiment classification of every window in `all_instances`.
#
# For each window the gold examples are put in a fresh random order and cut
# into groups of SHOTS_PER_PROMPT demonstrations. Each group yields one prompt;
# the probabilities the model assigns to the three label tokens are averaged
# over all prompts. `results` is re-written to OUTPUT_PATH after every window,
# so an interrupted run keeps what was already classified.

INSTRUCTION = 'Classify as positive, negative, or neutral about climate policy: '
SENTIMENT_LABELS = ("positive", "negative", "neutral")
SHOTS_PER_PROMPT = 3
SHUFFLE_SEED = 42          # change to draw a different sequence of few-shot orderings
PROGRESS_EVERY = 10
OUTPUT_PATH = '../data/all_windows_classified.json'

# First token id of each label word. This is loop-invariant, so it is computed
# once here instead of once per prompt.
label_token_ids = {label: tokenizer.encode(label)[0] for label in SENTIMENT_LABELS}


def _build_prompt(training_set, window):
    """Return the few-shot prompt for one test window.

    Args:
        training_set: gold entries used as demonstrations; each entry's
            'window' value is rendered with str().
            NOTE(review): presumably that value already carries the label
            (the query below mimics a "{'window': ..., 'polarity': ...}"
            rendering) -- confirm against gold_new.json.
        window: text of the instance to classify (must be a str).

    Returns:
        The demonstrations, one per paragraph, followed by the unfinished
        query that the model is expected to complete with a label.
    """
    demonstrations = ''.join(
        INSTRUCTION + '"' + str(entry['window']) + '"\n\n'
        for entry in training_set
    )
    return demonstrations + INSTRUCTION + '"{\'window\': \'' + window + '\', \'polarity\': '


def _label_probabilities(prompt):
    """Return {label: probability} for the first token generated after `prompt`.

    `generate(..., output_scores=True)` returns one score tensor per generation
    step, each of shape (batch * num_beams, vocab_size). Only the first step
    describes the position where the label is expected, so a single greedy
    step is generated and its one row is read. Indexing the step tensors with
    [-1] would select the last beam's row at every step, and averaging over
    steps would mix the label position with the positions that follow it.

    Raises:
        IndexError: if a label token id lies outside the score vector (this
            is no longer silently replaced by a probability of 0).
    """
    # NOTE(review): prompts can exceed the tokenizer's 512-token limit (a
    # warning was logged in an earlier run). Nothing is truncated here, since
    # truncation would cut off the query at the end of the prompt.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    outputs = model.generate(input_ids, max_new_tokens=1, num_beams=1, do_sample=False,
                             return_dict_in_generate=True, output_scores=True)

    # scores[0] is the first (and only) generation step; row 0 is the single sequence.
    first_token_probs = torch.softmax(outputs.scores[0][0].float(), dim=-1)
    return {label: first_token_probs[token_id].item()
            for label, token_id in label_token_ids.items()}


if not training_instances:
    # Without this check the averaging below would fail with a ZeroDivisionError.
    raise ValueError("training_instances is empty: at least one gold example is required")

# A dedicated, seeded generator keeps runs reproducible without touching the
# global `random` state.
shuffle_rng = random.Random(SHUFFLE_SEED)

results = {}

for processed_count, (instance_id, window) in enumerate(all_instances.items(), start=1):
    # Shuffle a copy so `training_instances` itself stays untouched and this
    # cell gives the same result when it is re-run.
    shuffled_gold = list(training_instances)
    shuffle_rng.shuffle(shuffled_gold)

    # Non-overlapping groups of demonstrations; the last group may be shorter.
    training_sets = [shuffled_gold[start:start + SHOTS_PER_PROMPT]
                     for start in range(0, len(shuffled_gold), SHOTS_PER_PROMPT)]

    # Sum the label probabilities over all prompts for this window
    total_sentiments = dict.fromkeys(SENTIMENT_LABELS, 0.0)
    for training_set in training_sets:
        prompt_probs = _label_probabilities(_build_prompt(training_set, window))
        for label, probability in prompt_probs.items():
            total_sentiments[label] += probability

    # Average over the number of prompts (one per training set)
    average_sentiments = {label: total / len(training_sets)
                          for label, total in total_sentiments.items()}

    results[instance_id] = {
        'window': window,
        'sentiment': average_sentiments
    }

    # Save the results as they are processed (the whole dict is re-written each time)
    with open(OUTPUT_PATH, 'w') as output_file:
        json.dump(results, output_file, indent=4)

    # Print progress
    if processed_count % PROGRESS_EVERY == 0:
        print(f"Processed {processed_count} instances")

print("Sentiment analysis completed.")

Processed 10 iterations
Processed 20 iterations
Processed 30 iterations
Processed 40 iterations
Processed 50 iterations
Processed 60 iterations
Processed 70 iterations
Processed 80 iterations
Processed 90 iterations
Processed 100 iterations
Processed 110 iterations
Processed 120 iterations
Processed 130 iterations
Processed 140 iterations
Processed 150 iterations
Processed 160 iterations
Processed 170 iterations
Processed 180 iterations
Processed 190 iterations
Processed 200 iterations
Processed 210 iterations
Processed 220 iterations
Processed 230 iterations
Processed 240 iterations
Processed 250 iterations
Processed 260 iterations
Processed 270 iterations
Processed 280 iterations
Processed 290 iterations
Processed 300 iterations
Processed 310 iterations
Processed 320 iterations
Processed 330 iterations
Processed 340 iterations
Processed 350 iterations
Processed 360 iterations
Processed 370 iterations
Processed 380 iterations
Processed 390 iterations
Processed 400 iterations
Processed

Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors


Processed 780 iterations
Processed 790 iterations
Processed 800 iterations
Processed 810 iterations
Processed 820 iterations
Processed 830 iterations
Processed 840 iterations
Processed 850 iterations
Processed 860 iterations
Processed 870 iterations
Processed 880 iterations
Processed 890 iterations
Processed 900 iterations
Processed 910 iterations
Processed 920 iterations
Processed 930 iterations
Processed 940 iterations
Processed 950 iterations
Processed 960 iterations
Processed 970 iterations
Processed 980 iterations
Processed 990 iterations
Processed 1000 iterations
Processed 1010 iterations
Processed 1020 iterations
Processed 1030 iterations
Processed 1040 iterations
Processed 1050 iterations
Processed 1060 iterations
Processed 1070 iterations
Processed 1080 iterations
Processed 1090 iterations
Processed 1100 iterations
Processed 1110 iterations
Processed 1120 iterations
Processed 1130 iterations
Processed 1140 iterations
Processed 1150 iterations
Processed 1160 iterations
Processe