In [None]:
# Upgrade the packaging tool chain (wheel, setuptools, pip) before installing anything else.
!pip install wheel setuptools pip --upgrade
# Install/upgrade the OpenAI client (imported below as `from openai import OpenAI`).
!pip install --upgrade openai
# NOTE(review): queries ipinfo.io from the runtime -- presumably a check of the machine's
# network location. The response may include the public IP, so clear this cell's
# output before sharing the notebook.
!curl ipinfo.io
# Quietly install the Google Generative AI SDK (imported below as `google.generativeai`).
!pip install -q google-generativeai

## Data Processing

In [4]:
import gzip
import os
import random
import re
import time

import google.generativeai as genai
import numpy as np
import pandas as pd
from openai import OpenAI


def parse(path):
  """Lazily yield one record per line of a gzip-compressed file.

  Each line of the file is evaluated as a Python expression and the resulting
  object is yielded.

  Args:
    path: Path of the gzip file to read.

  Yields:
    The object obtained by evaluating each line, in file order.
  """
  # The context manager closes the gzip handle as soon as the generator is
  # exhausted or closed, instead of leaking it until garbage collection.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # SECURITY: eval() executes arbitrary code contained in the file. Only use
      # this on trusted dumps; if every line is a plain literal,
      # ast.literal_eval is the safe drop-in replacement.
      yield eval(l)

def getDF(path):
  """Load every record produced by parse(path) into a DataFrame.

  Records are keyed by their 0-based position in the file, and that position
  becomes the row index of the returned frame.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Function to truncate the string to 20 words or less
def truncate_to_20_words(s):
    """Return `s` cut down to its first 20 whitespace-separated words.

    Words are re-joined with single spaces. Non-string inputs are returned
    unchanged.
    """
    # Anything that is not a string is passed straight through.
    if not isinstance(s, str):
        return s
    return ' '.join(s.split()[:20])

# Load the reviews and the product metadata.
Beautydf = getDF('reviews_Beauty_5.json.gz')
Beautymetadf = getDF('meta_Beauty.json.gz')

# Shorten every product title to at most 20 words.
Beautymetadf['title'] = Beautymetadf['title'].apply(truncate_to_20_words)

# Attach metadata to each review and drop rows lacking any required field.
merged_df = Beautydf.merge(Beautymetadf, how='left', on='asin')
merged_df = merged_df.dropna(subset=['title','description','categories','brand'])

# Keep only titles that identify exactly one product (asin).
asins_per_title = merged_df.groupby('title')['asin'].transform('nunique')
merged_df = merged_df[asins_per_title == 1]

# Keep only users with >= 6 purchase history
reviews_per_user = merged_df.groupby('reviewerID')['reviewerID'].transform('size')
merged_df6 = merged_df[reviews_per_user >= 6]

beauty_df = merged_df6.reset_index(drop = True)

# Distinct titles in order of first appearance; the candidate pool is a separate list
# with the same content.
all_items = beauty_df['title'].unique().tolist()
all_cand_items = list(all_items)

# # Or, you may try with a smaller subset
# unique_users = beauty_df['reviewerID'].dropna().unique()[:10]
# beauty_df = beauty_df[beauty_df['reviewerID'].isin(unique_users)]
# all_items = list( beauty_df['title'].unique() )


## Reusable Encryption (on product titles)

In [None]:
# Read the Google API key from the GOOGLE_API_KEY environment variable rather than
# pasting it into the notebook, so the secret is never saved with the file.
# Falls back to an empty string when the variable is not set.
genai.configure(api_key=os.environ.get('GOOGLE_API_KEY', ''))

# Compressed representations, kept index-aligned with `all_items`.
all_compressed_items = []
# Number of product titles sent to the model per request.
chunk_size = 8
# The model handle, sampling configuration and safety settings are the same for
# every chunk, so they are built once instead of on every iteration.
model = genai.GenerativeModel('gemini-pro')

generation_config = genai.GenerationConfig(
    stop_sequences = None,
    temperature = 1.0,
)

safety_settings = [
    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_DANGEROUS", "threshold": "BLOCK_NONE"},
]

# Iteration starts at the number of items already compressed and advances one
# chunk at a time; each pass appends exactly len(current_chunk) entries, which
# keeps `all_compressed_items` index-aligned with `all_items`.
for i in range(len(all_compressed_items), len(all_items), chunk_size):
    # Slice the list from the current index i to i + chunk_size
    current_chunk = all_items[i:i + chunk_size]

    # some additional prompts; feel free to play around with them :)

    # compress_prompt = (
    #     "Compress the list of Amazon Beauty products provided below into extremely condensed sequences. In particular, "
    #     "Each item should be represented by a NON-natural language sequence using a mixture of emojis, abbreviated characters, emoticons (e.g., '^-^', '-_-', etc.), as well as logical and mathematical operators (e.g., '->', '+', '<=', '|', etc.). "
    #     "These sequences MUST be diverse and rich in information, encoding ALL key details of each product. "
    #     "Ensure that the representations remains interpretable for advanced Large Language Models. "
    #     "Use the fewest possible tokens for each sequence. Return a numbered list of compressed items ONLY! \n\n "
    #     f"Products to compress: {current_chunk}"
    # )

    # compress_prompt = (
    #     "Task: Convert the list of Amazon Beauty products into highly abstract and cryptic representations. "
    #     "For each product, develop a sequence composed by a mixture of emojis, abbreviated characters, emoticons (e.g., '^-^', '-_-', etc.), as well as logical and mathematical operators (e.g., '->', '+', '<=', '|', etc.). "
    #     "These sequences MUST be diverse in symbols used and rich in information, encoding ALL key details of each product "
    #     "in a manner that is densely packed with information and completely devoid of any recognizable natural language elements. "
    #     "It is crucial that each product's representation is entirely NOT interpretable to human readers, yet remains interpretable for advanced Large Language Models. "
    #     "Assign an index to each transformed item and list them separately. Avoid any straightforward or predictable patterns. "
    #     "Below are provided examples to learn from, yet try to create even more abstract and less human-understandable representations! \n\n"
    #     "For example: ['Cococare Coconut Oil 100% Pure 4 Oz', 'Vakind Pack of 2 Black Fiber Leopard Long Curling Eye Lashes Mascara Eyelash Mascara Set', 'Freeman Facial Charcoal & Black Sugar Polish Mask 6 oz.', 'Vitamin C Serum for Face 20% - With Vegan Hyaluronic Acid & Vitamin E - Best Natural & Organic Anti', 'My Beauty Diary Facial Mask - Caviar Mask (10 Pcs)', 'WAWO 15 Color Professionl Makeup Eyeshadow Camouflage Facial Concealer Neutral Palette']\n"
    #     "1. 🥥🌰🐚💧💦💯, 2. 🐆⚫️🐼👀👁️👁️✨🌟, 3. 🌿🖤⬛️💎🎭💎, 4. 🍋🍊🧴💆‍♀️🌿🌟, 5. 🦪🌕💆🌸✨🎈🎈, 6. 🎭👄🌈🎨💄👁️\n"
    #     "For example: ['Skin Cleansing System Facial Brush & Body Care Kit for Women & Men. Includes 4 different heads - Large Body', 'Waxelene 2oz jar', 'Konsyl Pharmaceuticals Psyllium Fiber 15.9 oz', 'Creative Bioscience 1234 Diet Drops, 2 Ounce', 'Nail Polish Table Rack Display 60 Bottles', 'White Pearl Nail Art Stone Different Size Wheel Rhinestones Beads', 'BrightTherapy Trident SR11A Light Therapy System Red Blue Green LED Light for Acne Wrinkles and Hyperpigmentation', 'Sebastian Penetraitt Strengthening and Repair Shampoo & Conditioner Liter Set...', 'Coppertone Water Babies Sunscreen Lotion, Pure & Simple, SPF 50, 8 oz.']\n"
    #     "1. 🔄🧼👤🛁🖌️4️⃣🔝, 2. 🐝🍯🥫2️⃣oz, 3. 💊🌾🍶15.9oz, 4. 🧬💧🍽️1234🔻2️⃣oz, 5. 💅🎨📚🔄60️⃣🍾, 6. 💎🔳🎨🔘📏🔮, 7. 🌈💡🔱SR11A🆚🚦🔴🔵💚🤕🧴, 8. 💪🍃🧴🚿🧴🔗📏, 9. 🌞👶🧴💦🔵SPF50📏8oz\n\n"
    #     f"Products to transform; try to replace words with descriptive emojis if possible (Do NOT explain the output): {current_chunk}"
    # )

    # compress_prompt = (
    #     "Task: Compress the list of Amazon Beauty products into highly condensed and abstract sequences. "
    #     "Objective: Each item should be represented by a NON-natural language sequence. Use a mixture of emojis, extremely abbreviated characters, emoticons (e.g., '^-^', '-_-'), and logical/mathematical operators (e.g., '->', '+', '<=', '|'). "
    #     "Requirements: "
    #     "- Sequences must be diverse in symbols used and rich in information, encoding ALL key details of each product. "
    #     "- Ensure that the representations remain interpretable for advanced Large Language Models. "
    #     "- Use the fewest possible tokens for each sequence. "
    #     "- Return a numbered list of compressed items ONLY.\n\n"
    #     "Guidance: Here are examples to guide the compression process:\n"
    #     "'Original Likas Papaya Skin Whitening Herbal Soap by Trinidad Cosmetics Laboratory - 135 grams',"
    #     " 'Neutrogena Triple Moisture Daily Deep Conditioner, 8.5 Ounce',"
    #     " 'Silicon MIX Intensive Hair Deep Treatment 16oz By Avanti[health and Beauty]',"
    #     " 'Xtreme Brite Brightening Gel 1oz.',"
    #     " 'Maybelline New York Dream Matte Mousse Foundation, Light Beige, 0.64 Ounce',"
    #     " 'Renpure Organics Amazing Miracle, 8-Ounce',"
    #     " 'L'Oreal Paris Telescopic Explosion Mascara, Black, 0.27-Fluid Ounce',"
    #     " '22pcs Professional Cosmetic Makeup Brush Set with Pink Bag Pink'"
    #     "]\n"
    #     "Compressed representations: ["
    #     " 'Likas🌱🧼🍈🍊🤍⚖️🏭(135g)',"
    #     " 'Neutrogena💧💧💧🌿🧴🔁🌞(8.5oz)',"
    #     " 'MIX🦱🔬⚙️💪🌿🧪(16oz)',"
    #     " '⚡Xtreme🌟💡🧴🌈(1oz)',"
    #     " 'Maybelline💭🎭🧴🌈🍶(0.64oz)',"
    #     " 'Renpure🌿🌟🔮🧴🏺(8oz)',"
    #     " 'LOreal🗼🔭💣👁️⚫🧴(0.27oz)',"
    #     " '2️⃣2️⃣🖌️👩‍🎨👜🌸🎀'"
    #     "]\n\n"
    #     f"Products to compress: {current_chunk}"
    # )

    # A stricter prompt was needed; otherwise the model frequently falls back to natural language terms.
    compress_prompt = (
        "Task: Convert the list of Amazon Beauty products into highly abstract and cryptic representations. "
        "For each product, develop a sequence composed by a mixture of emojis, abbreviated characters, emoticons (e.g., '^-^', '-_-', etc.), as well as logical and mathematical operators (e.g., '->', '+', '<=', '|', etc.). "
        "These sequences MUST be diverse in symbols used and rich in information, encoding ALL key details of each product "
        "in a manner that is densely packed with information and completely devoid of any recognizable natural language elements. "
        "It is crucial that each product's representation is entirely NOT interpretable to human readers, yet remains interpretable for advanced Large Language Models. "
        "Assign an index to each transformed item and list them separately. Avoid any straightforward or predictable patterns. "
        "Below are provided examples to learn from, yet try to create even more abstract and less human-understandable representations! \n\n"
        "For example: ['Cococare Coconut Oil 100% Pure 4 Oz', 'Vakind Pack of 2 Black Fiber Leopard Long Curling Eye Lashes Mascara Eyelash Mascara Set', 'Freeman Facial Charcoal & Black Sugar Polish Mask 6 oz.', 'Vitamin C Serum for Face 20% - With Vegan Hyaluronic Acid & Vitamin E - Best Natural & Organic Anti', 'My Beauty Diary Facial Mask - Caviar Mask (10 Pcs)', 'WAWO 15 Color Professionl Makeup Eyeshadow Camouflage Facial Concealer Neutral Palette']\n"
        "1. 🥥🌰🐚💧💦💯, 2. 🐆⚫️🐼👀👁️👁️✨🌟, 3. 🌿🖤⬛️💎🎭💎, 4. 🍋🍊🧴💆‍♀️🌿🌟, 5. 🦪🌕💆🌸✨🎈🎈, 6. 🎭👄🌈🎨💄👁️\n"
        "For example: ['Skin Cleansing System Facial Brush & Body Care Kit for Women & Men. Includes 4 different heads - Large Body', 'Waxelene 2oz jar', 'Konsyl Pharmaceuticals Psyllium Fiber 15.9 oz', 'Creative Bioscience 1234 Diet Drops, 2 Ounce', 'Nail Polish Table Rack Display 60 Bottles', 'White Pearl Nail Art Stone Different Size Wheel Rhinestones Beads', 'BrightTherapy Trident SR11A Light Therapy System Red Blue Green LED Light for Acne Wrinkles and Hyperpigmentation', 'Sebastian Penetraitt Strengthening and Repair Shampoo & Conditioner Liter Set...', 'Coppertone Water Babies Sunscreen Lotion, Pure & Simple, SPF 50, 8 oz.']\n"
        "1. 🔄🧼👤🛁🖌️4️⃣🔝, 2. 🐝🍯🥫2️⃣oz, 3. 💊🌾🍶15.9oz, 4. 🧬💧🍽️1234🔻2️⃣oz, 5. 💅🎨📚🔄60️⃣🍾, 6. 💎🔳🎨🔘📏🔮, 7. 🌈💡🔱SR11A🆚🚦🔴🔵💚🤕🧴, 8. 💪🍃🧴🚿🧴🔗📏, 9. 🌞👶🧴💦🔵SPF50📏8oz\n\n"
        f"Products to transform; try to replace words with descriptive emojis if possible (Do NOT explain the output): {current_chunk}"
    )

    # Keep asking until the reply contains exactly one entry per product in the
    # chunk; anything else would break the index alignment with `all_items`.
    filtered_list = []
    while len(filtered_list) != len(current_chunk):
        try:
            response = model.generate_content(contents=compress_prompt, generation_config=generation_config,
                                              safety_settings=safety_settings)
            # Reading response.text stays inside the try block, as in the original
            # flow -- presumably it can raise when the reply carries no usable text.
            raw_output = response.text
        except Exception as e:
            print(f"An error occurred: {e}. Retrying...")
            time.sleep(1)
            continue

        # One entry per non-blank line; whitespace-only lines are ignored and the
        # leading "N. " numbering is removed.
        stripped_lines = [line.strip() for line in raw_output.split('\n')]
        filtered_list = [re.sub(r'^\d+\.\s*', '', line) for line in stripped_lines if line]

        if len(filtered_list) != len(current_chunk):
            # Report the mismatch and pause briefly instead of silently hammering the API.
            print(f"Expected {len(current_chunk)} compressed items but got {len(filtered_list)}. Retrying...")
            time.sleep(1)

    all_compressed_items.extend(filtered_list)

    # Progress report: chunk start index, then each title next to its compressed form.
    print(i)
    for title, compressed_title in zip(current_chunk, filtered_list):
        print(title, ':   ', compressed_title)
    print()


# Map every original title to its compressed form; the two lists are index-aligned.
all_compressed_item_dict = {
    all_items[position]: compressed_title
    for position, compressed_title in enumerate(all_compressed_items)
}


## Performance Evaluation

In [None]:
# Read the OpenAI API key from the OPENAI_API_KEY environment variable rather than
# pasting it into the notebook, so the secret is never saved with the file.
# Falls back to an empty string when the variable is not set.
API_KEY = os.environ.get('OPENAI_API_KEY', '')
client = OpenAI(api_key = API_KEY)
model_id = 'gpt-4-1106-preview'

# Define the system message
system_msg = "Please serve as a Recommender System on Beauty Products, based on user's prior purchase information provided."

# Running tallies: hits with plain-text history, hits with compressed history,
# and the number of users evaluated so far.
right_count = 0
compressed_right_count = 0
total = 0
def _ask_recommender(user_prompt):
    """Send `user_prompt` together with the shared system message to the chat model and return the reply text."""
    completion = client.chat.completions.create(
            model = model_id, temperature = 0,
            messages=[{"role": "system", "content": system_msg},
                        {"role": "user", "content": user_prompt}],
            timeout = 1200)
    return completion.choices[0].message.content


# Computed once; used to force a final progress report after the last user.
num_users = beauty_df['reviewerID'].nunique()

# NOTE(review): `random` is not seeded here, so the sampled candidates differ between runs.
for reviewer_id in beauty_df['reviewerID'].unique():

    # This user's reviews, oldest first.
    user_df = beauty_df[ beauty_df['reviewerID'] == reviewer_id ]
    user_df = user_df.sort_values(by='unixReviewTime', ascending = True)

    user_items = list( user_df['title'].unique() )

    # keep last 15 items; the most recent one is the prediction target
    user_items_applied = user_items[-15:]
    target = user_items_applied[-1]

    # 99 random negatives drawn from items the user never purchased (so the target
    # cannot be among them) + the 1 positive item, in random order.
    purchased = set(user_items)
    negative_pool = [x for x in all_cand_items if x not in purchased]
    sampled_items = list( random.sample(negative_pool, 99) )
    sampled_items.append(target)
    random.shuffle(sampled_items)

    # --- Baseline: purchase history given as plain-text titles ---
    augmented_prompt = (
            f"Given the user has purchased the following items in chronological order: "
            f"{user_items_applied[:-1]}; output a list of 10 items to recommend out of the following candidate items ONLY; do NOT explain anything, just output the items:"
            f"\n{sampled_items}"
        )

    pred = _ask_recommender(augmented_prompt)

    total += 1
    # A hit is counted when the target title appears verbatim anywhere in the reply.
    if target in pred:
        right_count += 1

    # --- Compressed: purchase history given as numbered compressed sequences ---
    # Only the history is looked up in the dict; the candidates stay in plain text.
    compressed_user_history = ''.join(
        f"{position}. {all_compressed_item_dict[item]}, "
        for position, item in enumerate(user_items_applied[:-1], start=1)
    )

    compressed_prompt = (
            f"Given the user has purchased the following items (each represented as a non-natural language sequence) in chronological order: "
            f"{compressed_user_history}; output a list of 10 items to recommend out of the following 100 candidate items ONLY; do NOT explain anything, just output the items:"
            f"\n{sampled_items}"
        )

    compressed_pred = _ask_recommender(compressed_prompt)

    if target in compressed_pred:
        compressed_right_count += 1

    # Report running accuracy every 10 users and once more after the final user.
    if total % 10 == 0 or total == num_users:
        print(f"Accuracy: {right_count/total}")
        print(f"Compressed Accuracy: {compressed_right_count/total}")
        print()



## Decryption Robustness Test

In [None]:
import numpy as np
import random
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI

def _embed(text):
    """Return the embedding of `text` as a single-row 2-D array (the layout cosine_similarity expects)."""
    # You can reduce the dimensions of the embedding by passing in the dimensions parameter without
    # the embedding losing its concept-representing properties: set to 100 to mitigate curse of dimensionality
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-small",
        dimensions = 100,
    )
    return np.array(response.data[0].embedding).reshape(1, -1)


# Running sum of the per-item cosine similarities and the number of items scored.
overall_similarity_score = 0.0
count = 0
for original_item, encrypted_item in all_compressed_item_dict.items():

    decryption_prompt = (
        "Given the following compressed item representation for an Amazon beauty product in non-natural language form: "
        f"{encrypted_item} \n\n"
        "Try to decode it into a natural language item title (or name); return the decoded item title only, with NO explaination!"
    )

    # Ask the chat model to recover a natural-language title from the compressed form.
    decryption_completion = client.chat.completions.create(
            model = model_id, temperature = 0,
            messages=[{"role": "system", "content": "You are to serve as a decrypter for beauty products represented in emojis, emoticons, abbreviated characters, as well as math & logical operators (Ex. '->', '+', '<=', etc.)."},
                        {"role": "user", "content": decryption_prompt}],
            timeout = 1200)

    decryption_item = decryption_completion.choices[0].message.content

    original_item_embedding = _embed(original_item)
    decryption_item_embedding = _embed(decryption_item)

    # cosine_similarity returns a 1x1 matrix for two single-row inputs; take the
    # scalar so the running mean prints as a plain number rather than [[...]].
    similarity_score = float(cosine_similarity(original_item_embedding, decryption_item_embedding)[0, 0])
    overall_similarity_score += similarity_score
    count += 1

    # Running mean over the items processed so far.
    print(overall_similarity_score / count)

print()
if count:
    print('Mean cosine sim: ', overall_similarity_score / count)
else:
    # Nothing was scored; avoid dividing by zero.
    print('Mean cosine sim: ', 'n/a (no compressed items to evaluate)')