# Experiments for the paper: "Herding Llamas: Towards Automated Review Title Generation"

- Please ensure you have all dependencies properly configured before attempting to run this notebook.

In [1]:
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
import gc
import os

from rouge import Rouge
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from IPython.display import clear_output
import language_tool_python

import torch
from torch import bfloat16
import transformers
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer
from datasets import load_dataset
import tensorflow_hub as hub

# LanguageTool instance (US English); `tool.correct` is applied to every title in the title-fixing cell.
tool = language_tool_python.LanguageTool('en-US')
# TF-Hub handle of the Universal Sentence Encoder v4; `embed` is called on one-element
# lists of strings in the MMD / R-Div cell.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(module_url)
# ROUGE scorer and VADER sentiment analyzer. Neither is referenced directly by the cells of
# this notebook; presumably they are consumed by utils.metrics — TODO confirm.
rouge = Rouge()
analyzer = SentimentIntensityAnalyzer()

from utils import process, filtering, filter_by_ratings, duplicate, load_prompts, metrics, MMD, rdiv

# Character sequences passed to `process` (together with `punkt`) before titles are embedded
# in the MMD / R-Div cell.
to_remove = ["&#34", "&quot", "<br />", "*", "/", "@", '\\', "#", "%", "^", "&", "~", "'", '"', '-', '—', '(', ')']
# Punctuation marks that are passed to `process` alongside `to_remove`.
punkt = ['.', '?', ';', ':', '!', ',']
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

##################################
# `cat` selects the Amazon category whose data is processed (uncomment exactly one).
# cat = 'Automotive'
# cat = 'Health_and_Household'
# cat = 'Office_Products'
cat = 'Arts_Crafts_and_Sewing'
# `oos_cat` selects the category whose `test_even.csv` is read by the metric cells.
oos_cat = 'Arts_Crafts_and_Sewing'
##################################

# Working directory for this category. `exist_ok=True` makes re-running the cell a no-op,
# yet still raises if the path exists but is a regular file (the previous try/except
# silently ignored that case and deferred the failure to the first file write).
os.makedirs(cat, exist_ok=True)

# Import all needed Amazon reviews data:

In [None]:
### WHEN FIRST OPENING THE DATASET ###
# One-off preparation: aligns the product metadata with the review dataset and caches the
# result as <cat>/<cat>_meta.csv.gz so later sessions can skip the metadata download.
# NOTE(review): `fivecore` is loaded here but not used by any cell in this notebook.
fivecore = pd.read_csv(os.path.join(cat, cat+'.csv.gz'))
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_"+cat, split="full", trust_remote_code=True)
metadata = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_"+cat, split="full", trust_remote_code=True)
# Reviews are ordered by product id so that reviews of the same product are adjacent.
dataset = dataset.sort("parent_asin")

dataset_asins = dataset["parent_asin"]
meta_asins = metadata["parent_asin"]

# With return_indices=True the third return value holds, for every ASIN present in both
# inputs, the position of its first occurrence in `meta_asins`; only those metadata rows are kept.
_, _, idx = np.intersect1d(dataset_asins, meta_asins, return_indices=True)
metadata = metadata.select(idx)
meta_asins = metadata["parent_asin"]
names = metadata["title"]
# Flatten each product's category list into one comma-separated string.
categories = np.array([', '.join(x) for x in metadata["categories"]])
# Products without any category fall back to the top-level category name.
categories[categories == ''] = cat

# `duplicate` comes from utils.py; presumably it repeats each product's name/categories once
# per review so the result lines up row-by-row with `dataset` — TODO confirm.
names, categories = duplicate(dataset_asins, meta_asins, names, categories)
meta = pd.DataFrame({'title': names, 'categories': categories})
# NOTE(review): no index=False here, so the frame index is presumably written as an extra
# unnamed first column — confirm this is intended.
meta.to_csv(os.path.join(cat, cat+'_meta.csv.gz'), compression='gzip')

In [2]:
### AFTER PROCESSED DATA SAVED ###
# Reload the raw reviews plus the cached per-review metadata written by the first-time cell.
print("Loading main data.")
fivecore = pd.read_csv(os.path.join(cat, '{}.csv.gz'.format(cat)))
dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_{}".format(cat),
    split="full",
    trust_remote_code=True,
)
metadata = pd.read_csv(os.path.join(cat, '{}_meta.csv.gz'.format(cat)))
# Same ordering as when the metadata cache was built.
dataset = dataset.sort("parent_asin")

print("Getting needed columns.")
titles, reviews, ratings, times = (
    dataset[column] for column in ('title', 'text', 'rating', 'timestamp')
)
# Products without a name get the placeholder "<cat> Product".
names = np.where(pd.isna(metadata['title']), '{} Product'.format(cat), metadata['title'])
categories = metadata['categories']

Loading main data.
Getting needed columns.


# Filter the dataset based on proposed heuristics (see utils.py):

In [3]:
# Special case: two products whose reviews are mismatched in the dataset are excluded by name.
exclude = [
    "Meguiar's G191501 Ultimate Snow Foam Wash, Pink Foaming Car Wash Soap for Foam Cannons & Foam Guns, Ideal Foam Wash for Cars, Trucks, Motorcycles, RVs & More - 1 Gallon Container",
    "Meguiar's Hybrid Wash Mitt, Dual Sided for Washing and Waxing, Clear Coat Safe and Reusable - 1 Mitt",
]

# Range of data points to filter; (0, None) means "everything".
start = 0
end = None
window = slice(start, end)

final = filtering(
    *(column[window] for column in (titles, reviews, ratings, times, names, categories)),
    exclude=exclude,
)

Converted data point 8966758/8966758. Number of valid review-title pairs: 105244


In [4]:
# Persist the filtered review/title pairs so the later cells can restart from disk.
final_columns = ['title', 'text', 'rating', 'timestamp', 'name', 'category']
final_path = os.path.join(cat, 'final.csv')
final_df = pd.DataFrame(final, columns=final_columns)
final_df.to_csv(final_path, index=False)

# Process the filtered data in various ways:

In [None]:
# Remove duplicates: first rows sharing a title, then rows sharing a review text,
# keeping the last occurrence in both passes.
final_new = (
    pd.read_csv(os.path.join(cat, 'final.csv'))
    .drop_duplicates(subset=['title'], keep='last')
    .drop_duplicates(subset=['text'], keep='last')
)

titles_new, reviews_new, ratings_new, times_new = (
    final_new[column] for column in ('title', 'text', 'rating', 'timestamp')
)

final_new

In [3]:
# Fix grammar, spelling, capitalization, and remove extra punctuation from titles.
# The result `ts` has exactly one entry per element of `titles_new`, in the same order.

# Runs of repeated punctuation and what each one is collapsed to (loop-invariant).
extras = ['....', '??', '!!', '((', '))']
replace = ['...', '?', '!', '(', ')']

ts = []
L = len(titles_new)
for idx, t in enumerate(titles_new):
    # Refresh the progress line only every 100 items (and for the last few), matching the
    # review clean-up cell; clearing the output on every item slows the loop for no benefit.
    if idx%100 == 0 or L-idx < 10:
        clear_output(wait=True)
        print('Fixing item {}/{}'.format(idx+1, L))
    # All-caps titles are lower-cased before correction.
    if t.isupper():
        t = t.lower()
    t = tool.correct(t)
    # Make sure a non-empty title ends with terminal punctuation. `endswith` is safe on an
    # empty string, whereas indexing `t[-1]` raised IndexError when the title was empty.
    if t and not t.endswith(('.', '?', '!')):
        t += '.'
    # Each pass shortens a run by one character, so several passes collapse longer runs.
    for _pass in range(10):
        for old, new_text in zip(extras, replace):
            t = t.replace(old, new_text)
    ts.append(t)

Fixing item 103221/103221


In [None]:
# Overwrite the title column with the corrected titles. `ts` is a plain list, so the values
# are assigned by position; it must have one entry per row of `final_new`.
final_new['title']=ts
final_new

In [5]:
# Clean up the reviews a bit: `process` (utils.py) is given the HTML residue below and
# lower=False for every review text.
html_residue = ("&#34", "&quot", "<br />")
total = len(titles_new)
rs = []
for position, (_title, review) in enumerate(zip(titles_new, reviews_new)):
    is_last_few = total - position < 10
    if is_last_few or position % 100 == 0:
        # Progress line: refreshed every 100 items and for the final items.
        clear_output(wait=True)
        print('Fixing item {}/{}'.format(position + 1, total))
    rs.append(process(review, list(html_residue), lower=False))

Fixing item 103221/103221


In [None]:
# Overwrite the review column with the cleaned reviews. `rs` is a plain list, so the values
# are assigned by position; it must have one entry per row of `final_new`.
final_new['text']=rs
final_new

In [None]:
# Remove product reviews if the same product shows up over 5 times in the data.
# Only *consecutive* rows with the same product name are counted, so this relies on the
# reviews of one product being adjacent.
final_new_idx = final_new.reset_index(drop=True)

max_per_product = 5
previous_name = ''
run_length = 1
kept_positions = []

for position, product_name in enumerate(final_new_idx['name']):
    if product_name == previous_name:
        run_length += 1
    else:
        previous_name = product_name
        run_length = 1
    if run_length <= max_per_product:
        kept_positions.append(position)

final_new_idx = final_new_idx.iloc[kept_positions].reset_index(drop=True)
final_new_idx

In [8]:
# Create train/validation/test splits (80% / 10% / 10%)

# Round-trip through CSV so the splits are built from exactly what is stored on disk.
final_new_idx.to_csv(os.path.join(cat, 'final_proc.csv'), index=False)
final_proc = pd.read_csv(os.path.join(cat, 'final_proc.csv'))

# Fixed seed: without `random_state` every run produced different splits, so reported
# numbers could not be reproduced after a kernel restart.
SPLIT_SEED = 42
final_train, test_init = train_test_split(final_proc, test_size=0.2, random_state=SPLIT_SEED)
# The held-out 20% is halved into validation and test.
final_valid, final_test = train_test_split(test_init, test_size=0.5, random_state=SPLIT_SEED)
final_train.to_csv(os.path.join(cat, 'final_train.csv'), index=False)
final_valid.to_csv(os.path.join(cat, 'final_valid.csv'), index=False)
final_test.to_csv(os.path.join(cat, 'final_test.csv'), index=False)

In [9]:
# Reload the three splits from disk and report their sizes.
train, valid, test = (
    pd.read_csv(os.path.join(cat, 'final_{}.csv'.format(split)))
    for split in ('train', 'valid', 'test')
)

print(*(len(frame) for frame in (train, valid, test)))
# Automotive: 20202 2525 2526
# Health_and_Household: 21328 2666 2666
# Office_Products: 22759 2845 2845
# Arts_Crafts_and_Sewing: 71103 8888 8888

71103 8888 8888


In [10]:
# Filter the data one last time to get even numbers of user ratings (1-5)

train_even, valid_even, test_even = (
    filter_by_ratings(split_frame) for split_frame in (train, valid, test)
)

# Store each balanced split as <cat>/<split>_even.csv.
for file_stem, balanced in (
    ('train_even', train_even),
    ('valid_even', valid_even),
    ('test_even', test_even),
):
    balanced.to_csv(os.path.join(cat, file_stem + '.csv'), index=False)

print(*map(len, (train_even, valid_even, test_even)))
# Automotive: 6645 720 745 (2M)
# Health_and_Household: 7060 815 860 (3M)
# Office_Products: 9590 1095 1195 (3M)
# Arts_Crafts_and_Sewing: 25145 3035 3125 (all)

25145 3035 3125


In [11]:
# Combine the individual categories into a single, shuffled dataset with even distribution per rating and per category.

cats = ['Automotive', 'Health_and_Household', 'Office_Products']
# Rows taken per rating and per category for each split:
# 5000 train per category, 700 valid and test per category --> 15000/2100/2100 total
amounts = [1000, 140, 140]
# Deliberately not called `names`: that global holds the product names consumed by the
# filtering cell, and overwriting it broke re-running that cell.
split_names = ['train', 'valid', 'test']
N_RATINGS = 5       # star ratings 1-5
COMBINE_SEED = 42   # fixed seed so the shuffled files are reproducible

for a, n in zip(amounts, split_names):
    parts = []
    for c in cats:
        # The *_even.csv files hold the same number of rows per rating, so after sorting
        # by rating, bucket b starts at row L*b.
        current = pd.read_csv(os.path.join(c, n+'_even.csv')).sort_values(by=['rating'])
        L = len(current)//N_RATINGS
        if a > L:
            # Otherwise the slice would silently spill into the next rating's rows
            # (or raise an opaque IndexError for the last bucket).
            raise ValueError(
                '{}/{}_even.csv has only {} rows per rating, {} requested'.format(c, n, L, a)
            )
        idxs = np.concatenate([np.arange(L*b, L*b+a) for b in range(N_RATINGS)])
        parts.append(current.iloc[idxs, :])
    # pd.concat keeps the column dtypes and works for any number of categories, unlike the
    # former np.array(...).reshape(a*15, -1) round-trip.
    new = (
        pd.concat(parts, ignore_index=True)
        .sample(frac=1, random_state=COMBINE_SEED)
        .reset_index(drop=True)
    )
    new.to_csv(n+'_combined.csv', index=False)

new

Unnamed: 0,title,text,rating,timestamp,name,category
0,"Not what I expected, but awesome smell.",Was not the product I ordered. Smells divine. ...,4.0,1572612599974,"The Original CJ's BuTTer (Monkey Farts, 12 oz....","Health & Household, Health Care, Over-the-Coun..."
1,Bought as a replacement battery for APS backup.,Read another reviewer's suggestion as to what ...,4.0,1355167261000,Power-Sonic PS-1290 12 Volt 9 Amp Hour Recharg...,"Automotive, Replacement Parts, Batteries & Acc..."
2,Does not fit a 2003 1500 at all.,I have a 2003 Dodge Ram 1500 with stock power+...,1.0,1646467706564,Fit System K Source 80700 Towing Mirror Ram 15...,"Automotive, Exterior Accessories, Towing Produ..."
3,"Three months--it's ok, but look at the price!",My father purchased this and I'm his tech gal....,3.0,1506447020000,Canon Office Products IP7220 Wireless Color Ph...,"Office Products, Office Electronics, Printers ..."
4,Great polish for various metals. Effective an...,I have used this on a variety of metals with g...,5.0,1655410569628,"Chemical Guys SPI_404_16 Light Metal Polish, 1...","Automotive, Car Care, Exterior Care, Car Polis..."
...,...,...,...,...,...,...
2095,Not a Good Label Maker at All!,I always loved making labels like the ones you...,1.0,1551814680809,K&CompanySMASH Label Maker,"Office Products, Office Electronics, Other Off..."
2096,Missing rubber mat & cracked in 1 day?,01/14/2022 The product looks great but it’s mi...,2.0,1642197595801,MECHCOS Compatible with 2021 2020 Hyundai Pali...,"Automotive, Replacement Parts, Body & Trim, Tr..."
2097,Does NOT FIT Briggs and Stratton 5 gallon plas...,Extremely disappointed when I opened the spout...,1.0,1675712510188,CM Concepts U.S Gas/Water Can Long Angled Spou...,"Automotive, Motorcycle & Powersports, Parts, F..."
2098,Does not work with 2020 4RUNNER.,"according to AMAZON, this fir my 2020 $RUNNER....",2.0,1660446535093,"Rain-X 850012 R16B Expert Fit Rear Blade, (Pac...","Automotive, Replacement Parts, Windshield Wipe..."


# Load in the model, pipeline, and dataset for training OR evaluation:

In [4]:
##############################################################################################################
######################################### USER INPUT ARGS ####################################################
##############################################################################################################
# model_name = 'meta-llama/Llama-2-7b-chat-hf'
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

base = True # Set to True when fine-tuning OR evaluating a base model
evaluate = False # Set to False when fine-tuning
use_combined_data = True # Set to True when fine-tuning
greedy_sampling = False # True: the evaluation cell decodes with top_k=1; False: temperature=1.0, top_p=0.9
##############################################################################################################
##############################################################################################################
##############################################################################################################

# Outputs go to ./saves/llama2 or ./saves/llama3; the '-base' / '-combined' suffixes are
# only appended when evaluating.
output_dir = './saves/llama'+('2' if '2' in model_name else '3')
if base and evaluate: output_dir += '-base'
if use_combined_data and evaluate: output_dir += '-combined'
try: os.makedirs(output_dir)
except FileExistsError: pass
# Fine-tuned weights always live in the un-suffixed directory (text before the first '-'
# of output_dir) under 'final'.
model_dir = os.path.join(output_dir.split('-')[0], 'final')

# Drop a pipeline left over from a previous run of this cell before building a new one.
if 'pipeline' in globals(): del pipeline
# NOTE(review): device is hard-coded to "cuda" here even though `device` (defined in the
# first cell) falls back to the CPU when no GPU is available.
pipeline = transformers.pipeline(
    "text-generation",
    model=(model_name if base else model_dir),
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cuda",
    return_full_text=False
)

# `load_prompts` comes from utils.py; it receives the pipeline and either None (combined
# data) or the category name. Presumably it formats the CSV splits into chat prompts with
# the model's template — TODO confirm.
prompts_train, prompts_valid, prompts_test = load_prompts(pipeline, (None if use_combined_data else cat))

if not evaluate:
    # Max sequence length in the dataset (1 token ~ 4 chars)
    max_seq = np.max([len(r)//4 for r in prompts_train['text']])
    print("Max sequence length:", max_seq)

    # Show one training prompt and one test prompt as a sanity check.
    print('\n', prompts_train['text'][0], '\n')
    print(prompts_test['text'][0])

# The pipeline is kept only when evaluating a base model; that is the only case in which
# the evaluation cell calls it.
if (not evaluate) or (not base):
    del pipeline

# Clear up GPU memory for optimal training and evaluation
if 'model' in globals(): del model, tokenizer, bnb, config
gc.collect()
torch.cuda.empty_cache()

if base:
    # Base checkpoint from the hub, loaded 4-bit quantized (NF4, double quantization,
    # bfloat16 compute dtype).
    config = AutoConfig.from_pretrained(model_name, token=True)
    bnb = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type = 'nf4',
        bnb_4bit_use_double_quant = True,
        bnb_4bit_compute_dtype = bfloat16
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code = True,
        config = config,
        quantization_config = bnb,
        device_map = 'auto',
        token = True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
else:
    # Fine-tuned checkpoint from model_dir, loaded in float16 without quantization.
    # config/bnb are set to None so the `del` above also works on the next run.
    config = None
    bnb = None
    model = AutoModelForCausalLM.from_pretrained(
        model_dir,
        torch_dtype=torch.float16
    ).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

model.eval()
# NOTE(review): pad_token is set to "[PAD]" and then pad_token_id is set to 0; whether id 0
# corresponds to "[PAD]" in these vocabularies is not visible here — confirm.
tokenizer.pad_token = "[PAD]"
tokenizer.pad_token_id = 0

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Max sequence length: 515

 <|begin_of_text|><|start_header_id|>system<|end_header_id|>

Generate the best succinct title for the following product review. Your only output should be the title itself. Do not mention the user rating in the title. Product rating: 1/5 stars. Product categories: 'Automotive, Interior Accessories, Floor Mats & Cargo Liners, Floor Mats'.<|eot_id|><|start_header_id|>user<|end_header_id|>

These are super flimsy and the mats slip and roll around on the floor, can be pretty dangerous when the slip and fold by the pedals. Avoid buying these. Waste of money. You're better off without any mats than having these.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"These mats slip, fold, bunch, and roll around your car floor. AVOID."<|eot_id|><|start_header_id|>assistant<|end_header_id|>

 

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Generate the best succinct title for the following product review. Your only output should be the title itself. 

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Train the selected model:

In [None]:
# TRAIN THE MODEL
# Requires `model`, `tokenizer`, `prompts_train`, `prompts_valid`, `output_dir` and
# `model_dir` from the loading cell.

# Use SFT: Supervised Fine-Tuning
# https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama2/7B_qlora_single_device.yaml
# https://github.com/pytorch/torchtune/blob/main/recipes/configs/llama3/8B_qlora_single_device.yaml

# LoRA adapter configuration.
peft_config = LoraConfig(
    lora_alpha = 16, # 16: The alpha parameter for Lora scaling.
    lora_dropout = 0.05, # 0.05:  The dropout probability for Lora layers.
    r = 8, # 8: Lora attention dimension (the “rank”).
    task_type = "CAUSAL_LM"
)

# 2 samples per device x 16 accumulation steps = 32 samples per optimizer step per device.
# Checkpoints are written to output_dir every 100 steps; the loss is logged every step.
args = TrainingArguments(
    output_dir = output_dir,
    per_device_train_batch_size = 2, # 2
    gradient_accumulation_steps = 16, # 16,
    optim = 'adamw_torch',
    weight_decay = 0.01,
    save_steps = 100,
    logging_steps = 1,
    # evaluation_strategy = 'steps',
    # eval_steps = 50
    learning_rate = 3e-4,
    num_train_epochs = 1,
    warmup_steps = 100
)

# NOTE(review): eval_dataset is passed, but the evaluation strategy above is commented
# out, so presumably no evaluation runs during training — confirm.
trainer = SFTTrainer(
    model = model,
    train_dataset = prompts_train,
    eval_dataset = prompts_valid,
    peft_config = peft_config,
    dataset_text_field = 'text',
    max_seq_length = 1024,
    tokenizer = tokenizer,
    args = args,
)

# The loading cell leaves the model in eval mode; switch back before training.
model.train()
trainer.train()
# The final weights are stored in model_dir, which the loading cell reads when base is False.
trainer.save_model(model_dir)

# Evaluate the selected model:

In [3]:
# EVALUATE THE MODEL
# Generates one title per test prompt and stores all of them in
# <output_dir>/outputs_greedy.npy or <output_dir>/outputs_sampled.npy.
# When `base` is True this needs `pipeline`, which the loading cell keeps only if
# `evaluate` and `base` are both True.

# Decoding settings, shared by the pipeline and model.generate paths.
# "Greedy" is realised as sampling restricted to the single most likely token (top_k=1).
if greedy_sampling:
    gen_kwargs = dict(max_new_tokens=30, do_sample=True, top_k=1)
else:
    gen_kwargs = dict(max_new_tokens=30, do_sample=True, temperature=1.0, top_p=0.9)

out = []
L = len(prompts_test['text'])

for idx, prompt in enumerate(prompts_test['text']):
    clear_output(wait=True)
    print("Evaluating sample {}/{}".format(idx+1, L))
    if base:
        # The pipeline was built with return_full_text=False, so only the completion comes back.
        final_outputs = pipeline(prompt, **gen_kwargs)
        final_outputs = final_outputs[0]['generated_text'].strip()
    else:
        inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
        outputs = model.generate(inputs, **gen_kwargs)
        final_outputs = tokenizer.decode(outputs[0])
        # The decoded text still contains the prompt: keep what follows the assistant
        # marker and cut at the next special-token opener.
        if '2' in model_dir:
            final_outputs = final_outputs.split('[/INST] ')[1].split(' <')[0]
        elif '3' in model_dir:
            final_outputs = final_outputs.split('t<|end_header_id|>\n\n')[1].split('<')[0]
    # Strip one surrounding double quote on each side. startswith/endswith are safe on an
    # empty generation (or a lone '"'), where indexing [0] / [-1] raised IndexError.
    if final_outputs.startswith('"'): final_outputs = final_outputs[1:]
    if final_outputs.endswith('"'): final_outputs = final_outputs[:-1]
    out.append(final_outputs)

outputs_file = 'outputs_greedy.npy' if greedy_sampling else 'outputs_sampled.npy'
np.save(os.path.join(output_dir, outputs_file), out)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Evaluating sample 3125/3125


# Observe generated titles:

In [8]:
# Print every title generated by the fine-tuned Llama 2 run (greedy decoding), with its index.
titles = np.load('./saves/llama2/outputs_greedy.npy')
for position, generated in enumerate(titles):
    print('{} {}'.format(position, generated))

0 Works well, but not for heavy use.
1 Great colors, but hard to get the right lettering thickness.
2 Great for vinyl crafts and small details.
3 Great for 1:18 scale diorama.
4 Great replica of the Viper from the original BSG series.
5 Does nothing to reduce creaking in wood floors.
6 Not what I expected. The tape measure doesn't use black ink.
7 Great for speedweve and quick work.
8 Great for nail art and face gems.
9 Great for sketching and practice.
10 Sharpening is required, but it works well.
11 Good price for a lot of rivets.
12 Great product for sewing fabric bags.
13 Perfect for organizing jewelry making tools!
14 Great product for floral arrangements.
15 Great idea, but the paper is weird.
16 Great for stencils and hand cutting designs.
17 Arrived in a single plastic bag.
18 Not a lot of variety and too small.
19 Cute and functional for kids.
20 Great binder, but the envelopes are a little hard to use.
21 If you use metric, this is a great mat.
22 Cute, but a few changes coul

In [4]:
# Print every title generated by the fine-tuned Llama 3 run (greedy decoding), with its index.
titles = np.load('./saves/llama3/outputs_greedy.npy')
for position, generated in enumerate(titles):
    print('{} {}'.format(position, generated))

0 Works well for toner transfer.
1 Great colors, but hard to get the right lettering thickness.
2 Great for making small details on vinyl crafts.
3 Great for 1:18 scale garage door.
4 Great Vipers, but wish they had a figure and cockpit that opened.
5 Did nothing to reduce creaks in wood floor.
6 Can't see the numbers on the blue ones.
7 Great for speedweve and carpet.
8 Great value for the money, very pretty and easy to use.
9 Great for the price, but not as black as other brands.
10 Good tool, but needs sharpening.
11 Good price for a lot of rivets.
12 Great product for sewing bags.
13 Perfect for jewelry making tools.
14 Great product for floral arrangements.
15 Okay book, but the paper is weird.
16 Awesome for stencils and cutting designs.
17 Good quality needles in a bag.
18 Too small and not deep enough for epoxy resin.
19 Great little case for a 4-year-old.
20 Great Binder, but envelopes should have a zipper or slot closure.
21 Great mat, but only for metric quilters.
22 Great p

In [16]:
# Print every title generated by the fine-tuned Llama 2 run (sampled decoding), with its index.
titles = np.load('./saves/llama2/outputs_sampled.npy')
for position, generated in enumerate(titles):
    print('{} {}'.format(position, generated))

0 Works great for what I need it for, I just wish it was a bit sturdier.
1 The letters are bold and colorful.
2 Great for Vinyl Crafting or Art!
3 Great scale for 1/18th diorama roll up door.
4 This is the BSG Viper you want if you are a fan of the series.
5 This is powdered graphite, not a solution for creaking floors.
6 I love the way these look, the problem with the product is that the numbers are not in black.
7 Love this product, makes the weaving so much quicker.
8 Great value for the price. I've already worn them several times.
9 A good set of fineliner pens for sketching.
10 This worked for me, however I had to do the work.
11 Great for rivet guns.
12 Strong thread, great price.
13 A perfect organizational tool for jewelry makers.
14 These are wonderful to work with.
15 Decent book, just not my style.
16 Awesome for stencils or hand cut designs!
17 Good quality, single pack in plastic bag.
18 Not good for resin - spills out of the small bezel holes.
19 Great for 4-year-old knic

In [7]:
# Print every title generated by the base Llama 3 run (sampled decoding), with its index.
titles = np.load('./saves/llama3-base/outputs_sampled.npy')
for position, generated in enumerate(titles):
    print('{} {}'.format(position, generated))

0 Reliable Brayer for Toner Transfer and PCB Making, Despite Occasional Centering Issues
1 Artistic Balance: Pros and Cons of a Double-Tipped Marker
2 Perfect for Vinyl Crafts: Highly Versatile Acrylic Paint Markers
3 Perfectly Scaled Decorative Paper for 1:18 Diorama
4 Authentic BSG Vipers that Hit the Mark
5 Disappointing Graphite Powder Performance: More Damage Than Relief
6 Disappointing Ink Color on Some Tape Measures
7 Effortless Knitting: Speedweve Needles Revolutionize Craftsmanship
8 Glamorous Gems for a Show-Stopping Look
9 Pens That Deliver on Promise: A Great Value for Sketchers and Practitioners
10 Sharpening Required: A Low-Cost Gamble for Leathercraft Enthusiasts
11 Rivet Kit with Storage Case: A Convenient and Reasonable Solution
12 Surprisingly Strong and Versatile Beading Cord
13 Organized Bliss: Jewelry Making Tool Storage Case
14 Floral Foam Blocks That Deliver: Easy to Work With and Long-Lasting
15 Fold-Out Fun, But Patterns and Paper Leave Room for Improvement
16 

# Calculate Metrics:

In [2]:
# Calculate the first set of metrics (see utils.py)
# For every model / decoding / base-or-fine-tuned run, score the stored titles against the
# matching test reviews and ratings and save the result next to the outputs.

for L in ('llama2', 'llama3'):
    for S in ('greedy', 'sampled'):
        for B in ('', '-base'):
            for C in ('',):
            # for C in ['', '-combined']:
                d = ''.join((L, B, C))
                print("Computing metrics for", d, S)
                # Combined runs are scored on the combined test set, all others on oos_cat's.
                if C != '':
                    test = pd.read_csv('test_combined.csv')
                else:
                    test = pd.read_csv(os.path.join(oos_cat, 'test_even.csv'))
                run_prefix = './saves/' + d
                gen_titles = np.load(run_prefix + '/outputs_{}.npy'.format(S))
                ms = metrics(gen_titles, test['text'], test['rating'])
                np.save(run_prefix + '/metrics_{}.npy'.format(S), ms)

Computing metrics for llama2 greedy
Computing metrics for llama2-base greedy
Computing metrics for llama2 sampled
Computing metrics for llama2-base sampled
Computing metrics for llama3 greedy
Computing metrics for llama3-base greedy
Computing metrics for llama3 sampled
Computing metrics for llama3-base sampled


In [4]:
# Display the first set of calculated metrics
# Columns of the saved metrics array as read below: 0 = length, 2 = uniqueness flag,
# 3-5 = Rouge-1/2/L recall, 6 = similarity, 7/8 = predicted/real sentiment,
# 9 = profanity, 10 = special-character ratio, 11 = emojis, 12 = all-uppercase flag.

for S in ('greedy', 'sampled'):
    for L in ('llama2', 'llama3'):
        for B in ('-base', ''):
            for C in ('',):
            # for C in ['', '-combined']:
                d = ''.join((L, B, C))
                if C != '':
                    test = pd.read_csv('test_combined.csv')
                else:
                    test = pd.read_csv(os.path.join(oos_cat, 'test_even.csv'))
                reviews = test['text']
                ratings = test['rating']
                run_prefix = './saves/' + d
                gen_titles = np.load(run_prefix + '/outputs_{}.npy'.format(S))
                ms = np.load(run_prefix + '/metrics_{}.npy'.format(S))
                length = len(ratings)

                print('\n' + d + ' ' + S)

                # Titles shorter than 15 or longer than 80 count against brevity.
                l = ms[:, 0]
                outside = int(np.count_nonzero(l < 15)) + int(np.count_nonzero(l > 80))
                print("% Brevity:", 100*(1-outside/length))

                l = ms[:, 2]
                print("% Uniqueness:", 100*int(np.count_nonzero(l == 0))/length)

                # A title misses the sentiment when it is neutral while the review is fully
                # polar, or when the two sentiments have opposite signs (0.01 tolerance).
                pred_sentiment = ms[:, 7]
                real_sentiment = ms[:, 8]
                neutral_vs_polar = (pred_sentiment == 0.0) & (np.abs(real_sentiment) == 1.0)
                neg_vs_pos = (pred_sentiment < -0.01) & (real_sentiment > 0.01)
                pos_vs_neg = (pred_sentiment > 0.01) & (real_sentiment < -0.01)
                count = int(np.count_nonzero(neutral_vs_polar | neg_vs_pos | pos_vs_neg))
                print("% Sentiment:", 100*(1-count/length))

                for label, column in (
                    ("Average maximum Rouge-1 recall:", 3),
                    ("Average maximum Rouge-2 recall:", 4),
                    ("Average maximum Rouge-L recall:", 5),
                    ("Average maximum similarity:", 6),
                ):
                    print(label, np.mean(ms[:, column]))

                l = ms[:, 9]
                print("% titles with profanity:", 100*int(np.count_nonzero(l == 1))/length)
                l = ms[:, 10]
                print("% titles with > 10% special characters:", 100*int(np.count_nonzero(l > 0.1))/length)
                l = ms[:, 11]
                print("% titles with emojis (unwanted):", 100*int(np.count_nonzero(l > 0))/length)
                l = ms[:, 12]
                print("% titles with all uppercase:", 100*int(np.count_nonzero(l == 1))/length)


llama2-base greedy
% Brevity: 93.184
% Uniqueness: 90.56
% Sentiment: 89.664
Average maximum Rouge-1 recall: 0.2625664252767306
Average maximum Rouge-2 recall: 0.0864425082398149
Average maximum Rouge-L recall: 0.22694859647095786
Average maximum similarity: 0.4601032645234466
% titles with profanity: 0.576
% titles with > 10% special characters: 0.0
% titles with emojis (unwanted): 0.0
% titles with all uppercase: 0.0

llama2 greedy
% Brevity: 99.776
% Uniqueness: 62.304
% Sentiment: 86.048
Average maximum Rouge-1 recall: 0.3498470593433228
Average maximum Rouge-2 recall: 0.1895903822959325
Average maximum Rouge-L recall: 0.3224055083916437
Average maximum similarity: 0.536839875767231
% titles with profanity: 0.512
% titles with > 10% special characters: 0.64
% titles with emojis (unwanted): 0.16
% titles with all uppercase: 0.064

llama3-base greedy
% Brevity: 98.592
% Uniqueness: 92.512
% Sentiment: 87.64800000000001
Average maximum Rouge-1 recall: 0.21924827258476812
Average maxi

In [5]:
# Calculate and display MMD and R-Div
# Every generated title and its original title are cleaned with `process`, embedded one at
# a time with the sentence encoder, and the two sets of embeddings are compared.

def _embed_cleaned(text):
    """Return the sentence embedding of `text` after `process` was given to_remove+punkt."""
    return embed([process(text, to_remove+punkt)])

for S in ('greedy', 'sampled'):
    for L in ('llama2', 'llama3'):
        for B in ('-base', ''):
            for C in ('',):
            # for C in ['', '-combined']:
                d = ''.join((L, B, C))
                if C != '':
                    test = pd.read_csv('test_combined.csv')
                else:
                    test = pd.read_csv(os.path.join(oos_cat, 'test_even.csv'))
                orig_titles = test['title']
                ratings = test['rating']
                titles = np.load('./saves/' + d + '/outputs_{}.npy'.format(S))
                print('\n' + d + ' ' + S)

                # One (generated, original) embedding pair per test item.
                pairs = [
                    (_embed_cleaned(generated), _embed_cleaned(original))
                    for generated, original, _rating in zip(titles, orig_titles, ratings)
                ]

                # Each embedding has a leading batch axis of size 1; squeeze removes it.
                embs = np.squeeze(np.array([generated_emb for generated_emb, _ in pairs]))
                orig_embs = np.squeeze(np.array([original_emb for _, original_emb in pairs]))
                # tsne = TSNE(n_components=2, perplexity=10).fit_transform(embs)
                # plt.scatter(tsne[:,0], tsne[:,1], c=ratings)

                print("R-Div:", rdiv(orig_embs, embs))
                print("MMD:", MMD(embs, orig_embs))


llama2-base greedy
R-Div: 0.9124214603265018
MMD: 0.08716675104704857

llama2 greedy
R-Div: 0.9713736733054811
MMD: 0.024539472817600858

llama3-base greedy
R-Div: 0.9196652674468246
MMD: 0.08809214906296692

llama3 greedy
R-Div: 0.9757335273801896
MMD: 0.024333362718677632

llama2-base sampled
R-Div: 0.9172515675171973
MMD: 0.08599608765064282

llama2 sampled
R-Div: 0.9876249034603333
MMD: 0.013706707859002147

llama3-base sampled
R-Div: 0.9263676932415199
MMD: 0.0864085379602854

llama3 sampled
R-Div: 0.9911099231822099
MMD: 0.013244488271502777
