In [None]:
import json
import os
import pickle
import string

from functools import cache
from itertools import chain

import datasets
import guidance
import nltk
import spacy
import tqdm
import wandb

from datasets import load_dataset
from frozendict import frozendict
from guidance import assistant, gen, user
from guidance.models import OpenAI, Transformers
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm_notebook
from wandb import Artifact

### Download lexicon and set up OpenAI and spaCy models.

In [None]:
# Credentials are taken from the environment. setdefault() leaves a key that is
# already exported untouched, so the placeholder never overwrites a real one.
# Do not commit real keys in this cell.
os.environ.setdefault('WANDB_API_KEY', "YOUR_KEY_HERE")
os.environ.setdefault('OPENAI_API_KEY', "YOUR_KEY_HERE")

# Pickle cache for the spaCy-parsed IMDB reviews.
imdb_docs_filename = "imdb_docs.pkl"
# W&B project that the run and the artifact are logged to.
wandb_project_name = "utility_reconstruction"
# When True, generation is restricted to the first 5 examples.
debug=False

In [None]:
# Start a W&B run in the configured project; the run handle is kept in `run`.
run = wandb.init(project=wandb_project_name)

In [None]:
# guidance model object for "gpt-4"; passed to the prompt helpers defined later.
openai_model = OpenAI("gpt-4")

In [None]:
# Download the spaCy "en_core_web_md" pipeline and the NLTK VADER lexicon.
# NOTE(review): `! python` may resolve to a different interpreter than the
# kernel's — confirm the model is installed into the kernel's environment.
! python -m spacy download en_core_web_md
nltk.download('vader_lexicon')

In [None]:
# NOTE(review): `triplets` is not referenced again in the visible cells —
# confirm it is still needed.
triplets = []

# VADER analyzer; its `lexicon` attribute is used later as a word -> score mapping.
intensity_analyzer = SentimentIntensityAnalyzer()
lexicon = intensity_analyzer.lexicon

# spaCy pipeline used to parse the reviews.
nlp = spacy.load("en_core_web_md")

### Extract topmost positive and negative words from lexicon.

In [None]:
def get_percentiles(dictionary, desired_percentile = 25):
    """Split a scored mapping into its highest- and lowest-valued tails.

    Entries are ordered by value, ascending. The last ``desired_percentile``
    percent of entries form the "positive" tail and the first
    ``desired_percentile`` percent the "negative" tail. Cut indices are
    truncated with ``int``.

    Args:
        dictionary: Mapping of key -> sortable value.
        desired_percentile: Size of each tail, as a percentage of all entries.

    Returns:
        Tuple ``(positive_lexicon, negative_lexicon)`` of frozendicts.
    """
    ranked = sorted(dictionary.items(), key=lambda pair: pair[1])
    count = len(ranked)

    fraction = desired_percentile / 100
    high_cut = int((1 - fraction) * count)
    low_cut = int(fraction * count)

    return frozendict(ranked[high_cut:]), frozendict(ranked[:low_cut])

In [None]:
# Top and bottom tails of the lexicon, using the default percentile.
topmost_positive_lexicon, topmost_negative_lexicon = get_percentiles(lexicon)

In [None]:
# Words in the extreme tails of the lexicon.
topmost_positive_words = frozenset(topmost_positive_lexicon)
topmost_negative_words = frozenset(topmost_negative_lexicon)

# Every lexicon word with a strictly positive / strictly negative score.
all_positive_words = frozenset(word for word, valence in lexicon.items() if valence > 0)
all_negative_words = frozenset(word for word, valence in lexicon.items() if valence < 0)

### Compute spacy documents.

In [None]:
# Built once instead of on every call: maps each character of
# string.punctuation to None, which str.translate treats as "delete".
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def strip_punctuation(s):
    """Return ``s`` with every character in ``string.punctuation`` removed.

    Args:
        s: Input string.

    Returns:
        A new string; all other characters, including whitespace, are kept.
    """
    return s.translate(_PUNCTUATION_TABLE)

In [None]:
def extract_top_sentences_and_tokens(spacy_doc, num_sentences=1):
    """Return the leading sentence text and its normalised tokens.

    Args:
        spacy_doc: Object exposing an iterable ``sents`` whose items are
            themselves iterable over tokens.
        num_sentences: How many sentences to take from the start.

    Returns:
        Tuple ``(text, tokens)``: ``text`` is the selected sentences joined by
        a single space; ``tokens`` holds each token with punctuation removed,
        surrounding whitespace stripped and lowercased. A token made only of
        punctuation becomes an empty string.
    """
    leading_sents = list(spacy_doc.sents)[:num_sentences]

    tokens = [
        strip_punctuation(str(token)).strip().lower()
        for token in chain.from_iterable(leading_sents)
    ]
    text = " ".join(str(sent) for sent in leading_sents)
    return text, tokens

In [None]:
def compute_spacy_docs():
    """Parse every review of the IMDB train split with the spaCy pipeline.

    Returns:
        list: One parsed document per review, in dataset order.
    """
    imdb_dataset = load_dataset('imdb', split='train')
    imdb_texts = [item['text'] for item in imdb_dataset]
    # nlp.pipe streams the texts through the pipeline in batches, which is
    # much cheaper than calling nlp(text) once per review. `total` is passed
    # because the stream is a generator with no length of its own.
    doc_stream = nlp.pipe(imdb_texts)
    return list(tqdm_notebook(doc_stream, total=len(imdb_texts)))

def load_spacy_docs(filename: str):
    """Unpickle and return the object stored at ``filename``.

    Prints the path being loaded. Any error from opening or unpickling the
    file propagates to the caller.
    """
    print(f'Loading spacy docs from {filename}')
    # NOTE: unpickling can execute arbitrary code; only load files you created.
    with open(filename, 'rb') as pickled:
        docs = pickle.load(pickled)
    return docs

In [None]:
def load_or_compute_spacy_docs(filename=imdb_docs_filename):
    """Load the parsed IMDB documents from ``filename``, computing them if needed.

    Args:
        filename: Path of the pickle cache to read and, on a miss, to write.

    Returns:
        The list of parsed documents, either loaded or freshly computed.

    Raises:
        Whatever ``compute_spacy_docs`` or writing the cache raises; only a
        failed load is handled here.
    """
    try:
        return load_spacy_docs(filename)
    except Exception as e:
        # Deliberately broad: any failure to read the cache (missing file,
        # truncated or incompatible pickle) falls back to recomputation.
        print(f'Caught exception {e}, recomputing spacy docs.')

    all_imdb_docs = compute_spacy_docs()
    with open(filename, 'wb') as file:
        pickle.dump(all_imdb_docs, file)
    return all_imdb_docs

In [None]:
# Parsed reviews: read from the pickle cache if possible, otherwise recomputed.
all_imdb_docs = load_or_compute_spacy_docs(filename=imdb_docs_filename)

In [None]:
# Sanity check: number of parsed documents.
len(all_imdb_docs)

### Construct sentiment-laden examples, keeping only those whose first sentence contains positive words and no negative words.

In [None]:
def get_sentiment_laden_examples(spacy_doc):
    """
    Find topmost positive and negative lexicon words in the leading sentence of a document.

    Returns a tuple (score, hit_tokens, sentences):
      - score: frozendict with keys 'negative' and 'positive' -> number of distinct matching tokens
      - hit_tokens: frozendict with the same keys -> frozenset of the matching tokens
      - sentences: text of the sentence(s) that were examined
    """
    sentences, tokens = extract_top_sentences_and_tokens(spacy_doc)
    token_set = frozenset(tokens)

    # Distinct tokens that also appear in each extreme word set.
    hit_tokens = {
        'negative': token_set.intersection(topmost_negative_words),
        'positive': token_set.intersection(topmost_positive_words),
    }
    score = {label: len(hits) for label, hits in hit_tokens.items()}

    return frozendict(score), frozendict(hit_tokens), sentences

In [None]:
# Spot-check the scoring on a single document.
spacy_doc = all_imdb_docs[78]
get_sentiment_laden_examples(spacy_doc)

In [None]:
# One (score, hit_tokens, sentences) tuple per document.
all_score_tokens_and_sentences = [get_sentiment_laden_examples(spacy_doc) for spacy_doc in all_imdb_docs]

In [None]:
# Keep only examples with at least one positive hit and no negative hits.
positive_examples = [
    candidate
    for candidate in all_score_tokens_and_sentences
    if candidate[0]['positive'] > 0 and candidate[0]['negative'] == 0
]

In [None]:
# Default selection is every positive example; a later cell reassigns this
# depending on `debug`.
selected_positive_examples = positive_examples

In [None]:
# Preview the first two selected examples.
selected_positive_examples[:2]

In [None]:
# Single (score, hit_tokens, sentences) example used to try out the prompt helpers.
triplet = selected_positive_examples[0]
triplet

In [None]:
def get_result_from_guidance(prompt, openai_model):
    """Run `prompt` through `openai_model` and return the generated answer.

    The prompt is appended to `openai_model` inside a `user()` block, then
    `gen('answer')` is appended inside an `assistant()` block. The value stored
    under the key 'answer' is returned.
    """
    with user():
        lm = openai_model + prompt

    with assistant():
        # The generation is captured under the name 'answer'.
        lm += gen('answer')

    return lm['answer']

In [None]:
def select_prompt(triplet, openai_model, debug=False):
    """Build the rewrite prompt for one example.

    Args:
        triplet: Indexable where ``triplet[2]`` is the text and
            ``triplet[1]['positive']`` is an iterable of positive words.
        openai_model: Accepted but not used in this function's body.
        debug: When True, the prompt is printed before being returned.

    Returns:
        The prompt string. It asks for a JSON reply with the keys
        "input_sentiment", "output_sentiment", "modified_text" and "new_words".
    """
    text = triplet[2]
    positive_words = list(triplet[1]['positive'])

    # Doubled braces are literal braces in the f-string; {text} and
    # {str(positive_words)} are interpolated as-is, without JSON quoting.
    prompt = f"""You are a helpful agent. Given a text and a collection C of positive words in the text.
    first I want you to find the sentiment of this text. Then replace each positive word with
    a negative sentiment word, such that the final sentiment becomes negative. Only alter the positive
    words, no other text should be changed. Give your response as parseable json strictly in the output format, 
    including the input sentiment, output sentiment , modified text and new words (as a dictionary of old to replaced words) :
    {{"input_sentiment": sentiment_value, "output_sentiment": new_sentiment_value, "modified_text": new_text, "new_words": new_words_dict}}.
    Sentiment values may be "positive" or "negative"
    Your input is: 
    {{"text": {text}, "positive_words": {str(positive_words)}}}
    """
    if debug:
        print(prompt)
    return prompt

In [None]:
@cache
def filter_and_modify_text(triplet):
    """
    Ask the model to rewrite one positive example as a negative one.

    The text is read from ``triplet[2]`` and the positive words from
    ``triplet[1]['positive']``.

    Returns a dict with keys "input_text", "output_text", "positive_words" and "new_words"
    when the parsed JSON reports input sentiment "positive" and output sentiment "negative"
    and all four values are non-empty. Returns None otherwise, including when any exception
    is raised (the error is printed, not propagated).

    Results are memoized by @cache, so a None produced by a failed call is also
    remembered for that triplet.
    """
    try:
        prompt = select_prompt(triplet, openai_model)
        input_text = triplet[2]
        positive_words = list(triplet[1]['positive'])

        raw_answer = get_result_from_guidance(prompt, openai_model)
        result = json.loads(raw_answer)

        # Missing sentiment keys default to values that fail the check.
        sentiment_flipped = (
            result.get('input_sentiment', 'negative') == 'positive'
            and result.get('output_sentiment', 'positive') == 'negative'
        )
        if not sentiment_flipped:
            return None

        output_text = result.get('modified_text', None)
        new_words = result.get('new_words', None)
        if not (input_text and output_text and positive_words and new_words):
            return None

        return {
            "input_text": input_text,
            "output_text": output_text,
            "positive_words": positive_words,
            "new_words": new_words
        }

    except Exception as e:
        print(f'Error {e} processing triplet {triplet}. Returning none')
        return None

In [None]:
# Trial run on the single sample triplet.
result = filter_and_modify_text(triplet)

In [None]:
# Report how many positive examples are available.
print(f'Computing for full dataset of {len(positive_examples)}')

In [None]:
final_pairs = []

# In debug mode only the first five positive examples are processed.
if not debug:
    print(f'Computing for full dataset of {len(positive_examples)}')
    selected_positive_examples = positive_examples
else:
    print('Restricting to 5 examples for debugging')
    selected_positive_examples = positive_examples[:5]

for index, example in enumerate(tqdm_notebook(selected_positive_examples)):
    result = filter_and_modify_text(example)
    # Log every fifth index to W&B as a coarse progress tracker.
    if index % 5 == 0:
        wandb.log({"Index": index})
    # None results (filtered out or failed) are skipped.
    if result:
        final_pairs.append(result)

print(f'Generated {len(final_pairs)} pairs from {len(selected_positive_examples)} inputs')

In [None]:
# Preview the first three generated pairs.
final_pairs[:3]

In [None]:
# Write the generated pairs to disk as JSON.
pairs_savefile = "final_triplets.json"
with open(pairs_savefile, "w") as f_out:
    json.dump(final_pairs, f_out)

In [None]:
# Read the pairs back from the JSON file that was just written.
with open(pairs_savefile, "r") as f_in:
    loaded_pairs = json.load(f_in)

In [None]:
def save_pairs_to_wandb(loaded_pairs):
    """Log the saved pairs file to W&B as a "data" artifact.

    The file at the module-level ``pairs_savefile`` is attached under the name
    "contrastive_sentiment_pairs". ``loaded_pairs`` is used only for the
    "num_pairs" metadata count; "sources" is taken from the module-level
    ``selected_positive_examples``.

    Args:
        loaded_pairs: Sized collection of the generated pairs.
    """
    artifact_metadata = {
        "description": "Contrastive pairs from IMDB",
        "source": "Generated by my script",
        "num_pairs": len(loaded_pairs),
        "sources": len(selected_positive_examples),
        "split": "train"
    }

    pairs_artifact = wandb.Artifact("contrastive_sentiment_pairs", type="data")
    # Attach the on-disk JSON file to the artifact.
    pairs_artifact.add_file(local_path=pairs_savefile, name="contrastive_sentiment_pairs")
    pairs_artifact.metadata.update(artifact_metadata)

    # Log the artifact to the run
    wandb.log_artifact(pairs_artifact)

# Upload the pairs that were read back from disk.
save_pairs_to_wandb(loaded_pairs)