# 1. Data and Dependencies

## 1.0. Dependencies

In [2]:
import os 
import gzip
import json
import networkx
import numpy as np
from scipy import sparse
import pandas as pd
import urllib.request 
import zipfile 

## 1.1. Paths

In [3]:
# Every project folder hangs off the parent of the current working directory
# (the notebook is expected to run from a sub-folder of the project).
project_dir = os.path.dirname(os.getcwd())
data_dir, lookups_dir, graphs_dir = (
    os.path.join(project_dir, folder) for folder in ("data", "lookups", "graph")
)

## 1.2. Fetch data

In [4]:
def fetch_and_extract_zip(zip_path, download_url, extract_to=None, extract=False, rm_zip=False, gdrive=True):
    """
    Download a ZIP archive if it is not on disk yet, then optionally extract and delete it.

    If a file already exists at 'zip_path', nothing is downloaded, extracted or
    deleted: the function prints a message and returns straight away.

    Parameters:
        zip_path (str): Path to the local ZIP file.
        download_url (str): Where to download the ZIP from if it is not present.
            With gdrive=True this is either a drive.google.com URL or a bare
            Google Drive file ID.
        extract_to (str): Directory to extract the contents to (optional).
            Defaults to 'zip_path' without its extension
            (e.g. "data/raw.zip" -> "data/raw").
        extract (bool): If False (default), keep the raw zip file.
            If True, extract contents.
            Raw data should not be extracted but kept as zip file to be parsed directly on the fly.
            Other data structures should generally be extracted.
        rm_zip (bool): If True, delete the ZIP after the download (and extraction,
            if requested). A failed deletion is reported but not raised.
        gdrive (bool): True (default) to download with gdown (Google Drive);
            False to download with urllib.request.urlretrieve.

    Raises:
        ImportError: gdrive is True and gdown is not installed.
        FileNotFoundError: the download finished without producing 'zip_path'.
    """
    # Default extraction target: the ZIP path minus its extension
    if extract_to is None:
        extract_to = os.path.splitext(zip_path)[0]

    # Nothing to do when the archive is already on disk
    if os.path.exists(zip_path):
        print(f"File already exists at {zip_path}")
        return

    print(f"File not found at {zip_path}. Downloading from {download_url} ...")
    # dirname is '' for a bare filename, and os.makedirs('') raises
    parent_dir = os.path.dirname(zip_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if gdrive:
        try:
            import gdown
        except ImportError as err:
            raise ImportError("Please install gdown with pip install gdown.") from err
        if "drive.google.com" in download_url:
            gdown.download(download_url, zip_path, quiet=False)
        else:
            # anything else is treated as a bare Drive file ID
            gdown.download(f"https://drive.google.com/uc?id={download_url}", zip_path, quiet=False)
    else:
        urllib.request.urlretrieve(download_url, zip_path)

    # Do not claim success if the downloader produced no file
    if not os.path.exists(zip_path):
        raise FileNotFoundError(f"Download from {download_url} did not produce {zip_path}")
    print(f"Downloaded to {zip_path}")

    # Extract
    if extract:
        print(f"Extracting ZIP to {extract_to}...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
        print("Extraction complete.")

    if rm_zip:
        try:
            os.remove(zip_path)
            print(f"Deleted ZIP file at {zip_path}")
        except OSError as e:
            print(f"Error deleting ZIP file: {e}")


### 1.2.1 Graph

In [5]:
# Show which graph artefacts are already on disk.
os.listdir(graphs_dir) 

['graph.pkl', 'graphs.zip', 'graph_2.pkl', 'graph_3.pkl', 'graph_3.zip']

In [6]:
# NOTE(review): this is a Google Drive *folder* link, but fetch_and_extract_zip
# hands drive.google.com URLs to gdown.download (a single-file download).
# Confirm a fresh download actually works, or point this at the file itself.
graphs_url = r"https://drive.google.com/drive/folders/18SRrV1yxC9yBGbAGXCVBlBTIWkxA9rG4"
graph_zip_path = os.path.join(graphs_dir, "graph_3.zip") 
# If graph_3.zip is already present the call returns without downloading,
# extracting or deleting anything.
fetch_and_extract_zip(graph_zip_path, graphs_url, extract=True, extract_to=graphs_dir, rm_zip=True, gdrive=True)

File already exists at c:\Users\bened\DataScience\Autumn 2025\SINA\assignments\3\graph\graph_3.zip


Reload graph

In [7]:
import pickle  

# Load the pickled graph into G.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load graph files that come from a trusted source.
graph_path = os.path.join(graphs_dir, "graph_3.pkl") 
with open(graph_path, 'rb') as f: 
    G = pickle.load(f) 

Sanity Check

In [8]:
# User nodes are those tagged bipartite == 'user'; their out-degree is used
# as the number of reviews written.
user_nodes = [
    node for node, attrs in G.nodes(data=True) if attrs.get('bipartite') == 'user'
]
review_counts = dict(zip(user_nodes, map(G.out_degree, user_nodes)))
# (user, count) pairs, most prolific reviewer first
sorted_reviewers = sorted(
    review_counts.items(), key=lambda pair: pair[1], reverse=True
)
for user, count in sorted_reviewers[:10]:
    print(f"User {user} wrote {count} reviews")

User A2NYK9KWFMJV4Y wrote 229 reviews
User A396ELNTQDFYPO wrote 165 reviews
User A3YVD62FSVIUJ wrote 160 reviews
User ADH0O8UVJOT10 wrote 133 reviews
User A2SUG35F6A6S3C wrote 105 reviews
User AZV2U6GU5QA6C wrote 95 reviews
User A1SD1C8XK3Z3V1 wrote 94 reviews
User A15TYOEWBQYF0X wrote 94 reviews
User A1FOXJ8TMYVKRK wrote 92 reviews
User ALHC64J88LVWO wrote 88 reviews


In [9]:
# IDs of the ten most prolific reviewers, in descending order of review count.
top10_degree_users = [user_id for user_id, _count in sorted_reviewers[:10]]
print(top10_degree_users)

['A2NYK9KWFMJV4Y', 'A396ELNTQDFYPO', 'A3YVD62FSVIUJ', 'ADH0O8UVJOT10', 'A2SUG35F6A6S3C', 'AZV2U6GU5QA6C', 'A1SD1C8XK3Z3V1', 'A15TYOEWBQYF0X', 'A1FOXJ8TMYVKRK', 'ALHC64J88LVWO']


In [10]:
# Product nodes are those tagged bipartite == 'product'; their in-degree is
# used as the number of reviews received.
product_nodes = [
    node for node, attrs in G.nodes(data=True) if attrs.get('bipartite') == 'product'
]
# NOTE: this rebinds review_counts, which the user sanity check also assigns.
review_counts = dict(zip(product_nodes, map(G.in_degree, product_nodes)))
# (product, count) pairs, most reviewed product first
product_reviews_sorted = sorted(
    review_counts.items(), key=lambda pair: pair[1], reverse=True
)
for product, count in product_reviews_sorted[:10]:
    print(f"Product {product} was reviewed {count} times")

Product B0002H03YY was reviewed 1771 times
Product B0006LOBA8 was reviewed 1768 times
Product B0002H05BA was reviewed 1766 times
Product B0002E3CK4 was reviewed 1760 times
Product B0002E1G5C was reviewed 1387 times
Product B0002E1NNC was reviewed 1382 times
Product B0002E1NWI was reviewed 1379 times
Product B0002H0A3S was reviewed 1196 times
Product B0007Y09VO was reviewed 1178 times
Product B004XNK7AI was reviewed 1092 times


### 1.2.2. Lookup Tables

In [11]:
# Show which lookup tables are already on disk.
os.listdir(lookups_dir)

['item_similarity_matrix.npz',
 'lookups.zip',
 'prediction_matrix.npz',
 'product_index.npy',
 'top5_recs.json',
 'user_index.npy',
 'user_similarity_matrix.npz',
 'utility_matrix.npz']

In [12]:
# NOTE(review): this is a ".../file/d/<id>/view" sharing link; confirm that
# gdown.download resolves it as called by fetch_and_extract_zip (it may need
# the bare file ID or gdown's fuzzy option).
lookups_url = r"https://drive.google.com/file/d/13JynWjuvIOQYLXJgJamR5OatkN0dL-h8/view?usp=drive_link" 
lookups_zip_path = os.path.join(lookups_dir, "lookups.zip") 
# If lookups.zip is already present the call returns without doing anything.
fetch_and_extract_zip(lookups_zip_path, lookups_url, extract_to=lookups_dir, extract=True, rm_zip=True, gdrive=True) 

File already exists at c:\Users\bened\DataScience\Autumn 2025\SINA\assignments\3\lookups\lookups.zip


#### 1.2.2.1. Indexes 

In [13]:
# Row/column labels for the lookup matrices loaded below.
# NOTE(review): allow_pickle=True unpickles object arrays, which can execute
# arbitrary code; only load index files from a trusted source.
product_index = np.load(os.path.join(lookups_dir, "product_index.npy"), allow_pickle=True) 
user_index = np.load(os.path.join(lookups_dir, "user_index.npy"), allow_pickle=True) 

#### 1.2.2.2. Utility Matrix

In [14]:
# Utility matrix: one row per product, one column per user.
U_sparse = sparse.load_npz(os.path.join(lookups_dir, "utility_matrix.npz"))

# Wrap the sparse matrix in a labelled (sparse-backed) DataFrame so it can be
# indexed by product ID and user ID.
U = pd.DataFrame.sparse.from_spmatrix(
    U_sparse, 
    index=product_index, 
    columns=user_index 
)
U.head() 

Unnamed: 0,A0072193KFP6LUHKEXLT,A0096681Y127OL1H8W3U,A0103849GBVWICKXD4T6,A0279100VZXR9A2495P4,A0600727NK5MAF66IOY5,A0727497OR0PPNFLFPDV,A07936821FOVJO6NP4Q8,A0833006NJW9KRF77ZFY,A0955928C2RRWOWZN7UC,A10044ECXDUVKS,...,AZYCGMFCK9AIM,AZYJTD9J82V5I,AZYP4FQ2L2C4O,AZZ3WYDJ0XNZW,AZZCLFV6V8693,AZZM5MUOG0LRK,AZZT9G4MJFCHD,AZZX23UGJGKTT,AZZZ3LGTCGUZF,AZZZG8PGB1FS0
739079891,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
786615206,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1480360295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1928571018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9792372326,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Sanity Check

In [15]:
# U_dense = U.sparse.to_dense() 

# ratings_counts = (U_dense > 0).sum(axis=0)
# ratings_counts_summary = ratings_counts.sort_values(ascending=False).reset_index() 
# ratings_counts_summary.columns = ['User', 'Number of actual ratings']
# ratings_counts_summary

In [16]:
# del U_dense

#### 1.2.2.3. User-Similarity Matrix 

In [17]:
# User-user similarity matrix: rows and columns are both labelled by user ID.
S_sparse = sparse.load_npz(os.path.join(lookups_dir, "user_similarity_matrix.npz")) 

S = pd.DataFrame.sparse.from_spmatrix(
    S_sparse, 
    index=user_index,
    columns=user_index
)
S.head()

Unnamed: 0,A0072193KFP6LUHKEXLT,A0096681Y127OL1H8W3U,A0103849GBVWICKXD4T6,A0279100VZXR9A2495P4,A0600727NK5MAF66IOY5,A0727497OR0PPNFLFPDV,A07936821FOVJO6NP4Q8,A0833006NJW9KRF77ZFY,A0955928C2RRWOWZN7UC,A10044ECXDUVKS,...,AZYCGMFCK9AIM,AZYJTD9J82V5I,AZYP4FQ2L2C4O,AZZ3WYDJ0XNZW,AZZCLFV6V8693,AZZM5MUOG0LRK,AZZT9G4MJFCHD,AZZX23UGJGKTT,AZZZ3LGTCGUZF,AZZZG8PGB1FS0
A0072193KFP6LUHKEXLT,1.0,0.0,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0.0,0,0,0,0.14602
A0096681Y127OL1H8W3U,0.0,1.0,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0.0,0,0,0,0.0
A0103849GBVWICKXD4T6,0.0,0.0,1.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0.0,0,0,0,0.0
A0279100VZXR9A2495P4,0.0,0.0,0.0,1.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0.161796,0,0,0,0.0
A0600727NK5MAF66IOY5,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0.0,0,0,0,0.0


#### 1.2.2.4. Item-Similarity Matrix

In [18]:
# Item-item similarity matrix: rows and columns are both labelled by product ID.
I_sparse = sparse.load_npz(os.path.join(lookups_dir, "item_similarity_matrix.npz")) 

I = pd.DataFrame.sparse.from_spmatrix(
    I_sparse, 
    index=product_index, 
    columns=product_index 
)
I.head()

Unnamed: 0,0739079891,0786615206,1480360295,1928571018,9792372326,B00000J50W,B00001W0DH,B00001W0DT,B00004TT3S,B00004UE29,...,B01HECB4AG,B01HED0HO4,B01HG0FXAI,B01HG293XO,B01HGLUP14,B01HHJXKRG,B01HHZAU3W,B01HI1VM28,B01HI5M4GW,B01HIDOPP2
739079891,1.0,0.0,0.037687,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
786615206,0.0,1.0,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1480360295,0.037687,0.0,1.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1928571018,0.0,0.0,0.0,1.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9792372326,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### 1.2.2.5. Prediction Matrix

In [19]:
# Prediction matrix: one row per product, one column per user
# (same labelling as the utility matrix).
P_sparse = sparse.load_npz(os.path.join(lookups_dir, "prediction_matrix.npz")) 

P = pd.DataFrame.sparse.from_spmatrix(
    P_sparse, 
    index=product_index,
    columns=user_index
)
P.head()

Unnamed: 0,A0072193KFP6LUHKEXLT,A0096681Y127OL1H8W3U,A0103849GBVWICKXD4T6,A0279100VZXR9A2495P4,A0600727NK5MAF66IOY5,A0727497OR0PPNFLFPDV,A07936821FOVJO6NP4Q8,A0833006NJW9KRF77ZFY,A0955928C2RRWOWZN7UC,A10044ECXDUVKS,...,AZYCGMFCK9AIM,AZYJTD9J82V5I,AZYP4FQ2L2C4O,AZZ3WYDJ0XNZW,AZZCLFV6V8693,AZZM5MUOG0LRK,AZZT9G4MJFCHD,AZZX23UGJGKTT,AZZZ3LGTCGUZF,AZZZG8PGB1FS0
739079891,0.001417,0.002249,0,0,0,0,0.005686,0.0,0.0,0.002617,...,0.012907,0.0,0,0.00569,0.005153,0,0.0,0.0,0.0,0.0
786615206,0.0,0.006382,0,0,0,0,0.0,0.0,0.0,0.001126,...,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0
1480360295,0.007739,0.002601,0,0,0,0,0.003764,0.027382,0.00357,0.003489,...,0.020344,0.001097,0,0.001751,0.0,0,0.000882,0.0,0.0,0.019288
1928571018,0.002,0.000598,0,0,0,0,0.000778,0.0,0.0,0.000233,...,0.0,0.0,0,0.000692,0.007247,0,0.0,0.0,0.0,0.001216
9792372326,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.001631,...,0.0,0.022063,0,0.000986,0.0,0,0.002676,0.030343,0.015025,0.008067


Sanity Check

In [20]:
# (number of products, number of users)
P.shape

(10620, 27530)

In [21]:
# P_dense = P.sparse.to_dense() 

# ratings_counts = (P_dense == -1).sum(axis=0)
# ratings_counts_summary = ratings_counts.sort_values(ascending=False).reset_index() 
# ratings_counts_summary.columns = ['User', 'Number of actual ratings']
# ratings_counts_summary

In [22]:
# del P_dense

#### 1.2.2.6. Recommendations lookup

In [23]:
# Confirm top5_recs.json is present before loading it.
os.listdir(lookups_dir) 

['item_similarity_matrix.npz',
 'lookups.zip',
 'prediction_matrix.npz',
 'product_index.npy',
 'top5_recs.json',
 'user_index.npy',
 'user_similarity_matrix.npz',
 'utility_matrix.npz']

In [24]:
import json 

# Load the recommendations lookup. It is indexed later as recs[user_id][0]
# to get the first recommended product for a user.
recs_path = os.path.join(lookups_dir, "top5_recs.json") 
with open(recs_path, 'r') as f: 
    recs = json.load(f) 

In [25]:
# Number of entries in the recommendations lookup.
print(len(recs))

27530


# 2. Info Retrieval

Fields relevant to prompting

In [26]:
# Edge (review) attributes that are passed on to the LLM.
relevant_review_keys = [
    'overall',
    'reviewText',
    'summary',
    'vote',
    'verified'  # presumably Amazon's "verified purchase" flag -- TODO confirm against the dataset documentation
] 
# Node (product) attributes that are passed on to the LLM.
relevant_product_keys = [
    'asin', # not useful to the LLM but useful to keep track of
    'title', 
    'brand', 
    'price', 
    'category', 
    'description', 
    'feature'
    ]

In [27]:
# helper to rank reviews by similarity score
def rank_reviews(reviews, similarity_vector):
    """
    Re-key a mapping of reviews by similarity rank.

    Parameters:
        reviews (dict): maps an ID to its review data.
        similarity_vector: anything indexable by those IDs that yields a
            sortable similarity score (e.g. a dict or a pandas Series).
    Returns:
        dict: {1: review of the most similar ID, 2: next most similar, ...}.
    """
    # most similar first; ties keep their original order
    ids_by_similarity = sorted(
        reviews, key=lambda node_id: similarity_vector[node_id], reverse=True
    )
    return {
        rank: reviews[node_id]
        for rank, node_id in enumerate(ids_by_similarity, start=1)
    }

In [28]:
# Helper Function to collect reviews by similarity measure
def collect_reviews(user_id, product_id, similarity='user', utility_matrix=U, item_matrix=I, user_matrix=S, graph=G, num_reviews=1): 
    """
    Collect up to 'num_reviews' reviews that help justify recommending
    'product_id' to 'user_id'.

    similarity='user': reviews OF product_id written by the users most similar
        to user_id (among users who rated product_id).
    similarity='item': reviews BY user_id of the products most similar to
        product_id (among products user_id rated).

    Candidates are visited from most to least similar. While fewer than
    'num_reviews' reviews are held, any review with a rating above 0 is kept.
    Once full, a candidate only gets in by beating the lowest rating currently
    held, and it replaces that review. The search stops as soon as every held
    review has a 5-star rating, so that less similar candidates cannot displace
    more similar ones.

    Parameters:
        user_id: ID of the user receiving the recommendation.
        product_id: ID of the recommended product.
        similarity (str): 'user' (default) or 'item'.
        utility_matrix: ratings table indexed [product, user]; entries > 0 are
            treated as "rated".
        item_matrix: item-item similarity table.
        user_matrix: user-user similarity table.
        graph: directed graph whose user->product edges carry the review data.
        num_reviews (int): maximum number of reviews to return.

    Returns:
        dict: {rank: review} with rank 1 being the most similar source; each
        review holds only the keys listed in relevant_review_keys.

    Raises:
        ValueError: 'similarity' is neither 'user' nor 'item'.
    """
    if similarity == 'user':  # looking for similar users to user_id who have rated product_id
        product_ratings = utility_matrix.loc[product_id]
        relevant_nodes = product_ratings[product_ratings > 0].index.tolist()
        similarity_vector = user_matrix.loc[relevant_nodes, user_id]
        target_node = product_id  # we want the reviews of product_id written by similar users
    elif similarity == 'item':  # looking for similar products to product_id rated by user_id
        user_ratings = utility_matrix.loc[:, user_id]
        # mask on the ratings themselves: `.gt(0).index` alone would return every product
        relevant_nodes = user_ratings[user_ratings > 0].index.tolist()
        similarity_vector = item_matrix.loc[relevant_nodes, product_id]
        target_node = user_id  # we want user_id's reviews of similar products
    else:
        raise ValueError("similarity parameter must be 'user' (default) or 'item'")

    similar_nodes = similarity_vector.sort_values(ascending=False).index.tolist()

    rating_to_beat = 0
    reviews = {}

    for node in similar_nodes:
        # Once enough reviews are held: stop if they are all 5-star, otherwise
        # only accept candidates that beat the weakest one.
        if len(reviews) >= num_reviews:
            held_ratings = [review['overall'] for review in reviews.values()]
            if all(rating == 5 for rating in held_ratings):
                break
            rating_to_beat = min(held_ratings)

        if not graph.has_node(node):
            print(f"Node {node} not in graph")
            continue

        # Reviews are user -> product edges: follow out-edges from a similar
        # user, or in-edges into a similar product.
        if similarity == 'user':
            edges = graph.out_edges(node, data=True)
        else:
            edges = graph.in_edges(node, data=True)

        for source, target, data in edges:
            # the far end of the edge must be the product (user mode) or the user (item mode)
            other_end = target if similarity == 'user' else source
            if other_end != target_node:
                continue
            if 'reviewText' not in data:
                continue
            rating = data.get('overall')
            # skip unrated reviews and those that do not beat the current bar
            if rating is None or not (rating > rating_to_beat):
                continue
            # key reviews by the similar node's ID -- used later to rank them
            reviews[node] = {k: val for k, val in data.items() if k in relevant_review_keys}

            if len(reviews) > num_reviews:
                # over capacity: drop the lowest-rated review
                weakest = min(reviews, key=lambda k: reviews[k]['overall'])
                del reviews[weakest]

    # sort reviews by similarity and rank them
    return rank_reviews(reviews, similarity_vector)

Helper function to find most helpful review of product (regardless of user similarity) 

In [46]:
def find_most_helpful_review(product_id, num_reviews): 
    """
    Find the most helpful reviews of a product, preferring higher star ratings.

    All reviews on the in-edges of 'product_id' in the global graph G that have
    a rating from 1 to 5 are ordered by rating (5 first) and, within a rating,
    by helpful-vote count (most votes first). The first 'num_reviews' of them
    are returned.

    Parameters:
        product_id: ID of the product node in G.
        num_reviews (int): maximum number of reviews to return.

    Returns:
        dict or None: {1: review, 2: review, ...}, each review holding only the
        keys listed in relevant_review_keys. Fewer than 'num_reviews' entries
        are returned when the product has fewer rated reviews. None when there
        is nothing to return.
    """
    def vote_count(review):
        # 'vote' is the key kept by relevant_review_keys; 'votes' is accepted
        # as a fallback spelling.
        raw = review.get('vote', review.get('votes'))
        if raw is None:
            return 0
        # assumes counts may be stored as text such as "1,234" -- TODO confirm
        try:
            return int(float(str(raw).replace(',', '')))
        except (ValueError, OverflowError):
            return 0

    valid_ratings = (1, 2, 3, 4, 5)
    candidates = [
        review
        for _, _, review in G.in_edges(product_id, data=True)
        if review.get('overall') in valid_ratings
    ]
    # best rating first, then most votes; ties keep their edge order
    candidates.sort(key=lambda review: (review['overall'], vote_count(review)), reverse=True)

    top_reviews = candidates[:max(num_reviews, 0)]
    if not top_reviews:
        return None

    # format as {rank: review}, keeping only the fields relevant to prompting
    return {
        rank: {k: v for k, v in review.items() if k in relevant_review_keys}
        for rank, review in enumerate(top_reviews, start=1)
    }

Below is the main function for collecting the prompting info. 

In [47]:
def find_prompt_info(user_id, graph=G, num_product_reviews=1, num_user_reviews=1, num_helpful_reviews=1, utility_matrix=U, user_similarity_matrix=S, item_similarity_matrix=I, recommendations=recs): 
    """
    Gather everything the LLM needs to justify a user's top recommendation.

    Parameters:
        user_id: ID of the user to build the prompt data for.
        graph: graph holding product attributes on nodes and reviews on edges.
        num_product_reviews (int): how many of the user's own reviews of
            similar products to collect.
        num_user_reviews (int): how many reviews of the recommended product by
            similar users to collect.
        num_helpful_reviews (int): how many helpful reviews of the recommended
            product to collect.
        utility_matrix, user_similarity_matrix, item_similarity_matrix: lookup
            tables forwarded to collect_reviews.
        recommendations: mapping from user ID to a list of recommended product
            IDs; the first entry is used.

    Returns:
        tuple: (recommended_product_id, info) where info is a dict with the keys
        'Product', 'Reviews by similar users', 'Helpful review of product' and
        'Reviews of similar products by user'.
    """
    recommended_product_id = recommendations[user_id][0]

    # Forward the caller's lookups instead of silently falling back to the
    # module-level defaults inside collect_reviews.
    lookups = dict(
        utility_matrix=utility_matrix,
        item_matrix=item_similarity_matrix,
        user_matrix=user_similarity_matrix,
        graph=graph,
    )
    # reviews of the product by similar users
    product_reviews = collect_reviews(
        user_id, recommended_product_id, similarity='user', num_reviews=num_user_reviews, **lookups
    )
    # reviews by the user of similar products
    item_reviews = collect_reviews(
        user_id, recommended_product_id, similarity='item', num_reviews=num_product_reviews, **lookups
    )
    # find_most_helpful_review takes no graph argument
    most_helpful_review = find_most_helpful_review(recommended_product_id, num_reviews=num_helpful_reviews)

    product_info = {
        k: v for k, v in graph.nodes[recommended_product_id].items() if k in relevant_product_keys
    }
    # structure info in consistent format
    info = {
        'Product': product_info,
        'Reviews by similar users': product_reviews,
        'Helpful review of product': most_helpful_review,
        'Reviews of similar products by user': item_reviews
    }

    return recommended_product_id, info

# 3. Prompting

Initial prompt to explain task

In [36]:
# System-style preamble sent ahead of every block of recommendation data.
# The key names described here must match the keys of the info dict built by
# find_prompt_info, since that dict is serialised and appended to this text.
task_prompt = """
### TASK DESCRIPTION: 
Your task is to generate short recommendation justifications for Amazon customers based on information that will be provided to you following this TASK DESCRIPTION.
The products are musical instruments. 
The information you will be given is structured in JSON format. 
The structure is as follows:
{
    Product: {<key-value pairs providing information on the recommended product>},
    Reviews by similar users: {
        <Ranked similarity with user, with 1 being the highest rank>: {
            <overall>: <rating this user gave to the product>, 
            <verified>: <if True, this review is known to derive from a trusted source>, 
            <reviewText>: <the text of the review itself>, 
            <summary>: <a brief summary of the review>, 
            <vote>: <number of helpful votes the review received, if present>
        }
    },
    Helpful review of product: {<same structure as above. This review of the product was voted most helpful by other users.>},
    Reviews of similar products by user: {<same structure as above. These reviews are of the products most similar to the recommended product, and are written by the customer receiving the recommendation.>}
}
Your task is to synthesize this information into a justification of why the customer may like this product. 
The length of your justification can vary depending on the amount of information you have to go on, but should not exceed a paragraph. 
You should avoid repeating yourself. 
You can include technical jargon related to the products, but should avoid other kinds of technical jargon. For example, you should not refer to "users" and "products" but prefer more approachable terms.
The next part of the prompt will contain the structured data:
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
"""

## 3.1. generate() function

<b> This is the function you'll have to change for different APIs. Currently configured for Google AI studio. </b> 

In [56]:
import base64
from google import genai 
from google.genai import types
from dotenv import load_dotenv 

# Load variables from the project's .env file; generate() reads its API key
# from the environment.
env_path = os.path.join(project_dir, ".env") 
load_dotenv(env_path) 

def generate(info, response_type="text/plain", model="gemini-2.5-flash-preview-04-17", api_key="GOOGLE_API_KEY"):
    """
    Send the task prompt plus one block of recommendation data to a Gemini
    model and stream the reply to stdout.

    Parameters:
        info (str): the structured recommendation data, already serialised to text.
        response_type (str): response MIME type requested from the API.
        model (str): name of the model to call.
        api_key (str): NAME of the environment variable holding the API key
            (not the key itself).

    Returns:
        tuple: (model, response_text) on success. On failure the error is
        printed and (None, "error: <details>") is returned, so callers can
        detect the failure without catching an exception.
    """
    client = genai.Client(
        api_key=os.environ.get(api_key)
    )
    # a single user turn: the task description followed by the data
    contents = [
        types.Content(
            role='user',
            parts=[
                types.Part.from_text(text=task_prompt),
                types.Part.from_text(text=info)
            ]
        )
    ]
    generate_content_config = types.GenerateContentConfig(
        response_mime_type=response_type,
    )
    response_parts = []
    try:
        for chunk in client.models.generate_content_stream(
            model=model,
            contents=contents,
            config=generate_content_config,
        ):
            # a chunk may carry no text; skip it rather than fail on None
            if chunk.text:
                response_parts.append(chunk.text)
                print(chunk.text, end="")
        # finish the streamed line once, after the last chunk
        print()
    except Exception as e:
        print(f"\nAn error occurred during content generation: {e}")
        return None, f"error: {e}"

    return model, "".join(response_parts)

# Main Function

Global data storage

In [29]:
# Module-level accumulator: LLM_attribute appends one record per processed
# user. Re-running this cell discards everything collected so far.
rows = [] 

In [None]:
import time

def update(data, field, value):
    """
    Append 'value' to the list stored under data[field], creating the list
    first if 'field' is not yet in 'data'. Mutates 'data'; returns None.
    """
    data.setdefault(field, []).append(value)

def LLM_attribute(users, graph=G, sleep=0, print_out=True, num_product_reviews=1, num_user_reviews=1, num_helpful_reviews=1, **kwargs): 
    """
    Generate a recommendation justification for each user and record it.

    For every user ID the function collects the prompt data with
    find_prompt_info, calls generate(), stores the justification on the
    matching product -> user edge of 'graph' (under a key built from the model
    name and the number of examples), and appends a record to the global
    'rows' list for later inspection as a DataFrame.

    Parameters:
        users (str or list): one user ID or a list of user IDs.
        graph: graph to read from and to annotate.
        sleep (float): seconds to pause after each user; raise it to avoid rate limits.
        print_out (bool): if True, print the raw prompt data for each user.
        num_product_reviews (int): forwarded to find_prompt_info, which uses it
            as the number of the user's own reviews of similar products.
        num_user_reviews (int): forwarded to find_prompt_info, which uses it as
            the number of reviews of the recommended product by similar users.
        num_helpful_reviews (int): number of helpful reviews of the recommended
            product to include.
        **kwargs: accepted but currently unused.

    Returns:
        str: "Justifications generated for all users" when every user was
        processed. If generation fails for a user, the loop stops and the
        partial record (dict) for that user is returned instead.
    """

    if isinstance(users, str): 
        users = [users]

    for user in users:

        data = {'user': user}

        # collect the relevant data for this user's top recommendation
        product_id, recommendation_info = find_prompt_info(
            user,
            graph=graph,
            num_product_reviews=num_product_reviews,
            num_user_reviews=num_user_reviews,
            num_helpful_reviews=num_helpful_reviews,
        )
        if print_out: 
            print("*" * 50, "\n") 
            print("Raw Info:\n") 
            print(recommendation_info) 
            print("*" * 50) 

        # fall back to the product ID when the node has no title
        product = recommendation_info['Product'].get('title', None)
        if product is None: 
            product = product_id 
        data['Product'] = product 

        # convert to text for LLM 
        info_text = json.dumps(recommendation_info, indent=2) 

        # call generate to return the recommendation justification
        print("\n Recommendation Justification:\n")
        model, justification = generate(info_text) 
        # A failed call is signalled by a missing model name or by a
        # non-text result (e.g. an exception object). Do not search the text
        # for the word "error": a valid justification may contain it.
        if model is None or not isinstance(justification, str): 
            print("Error encountered, breaking loop early.") 
            return data

        data['Model'] = model
        data['Num Product Reviews'] = num_product_reviews
        data['Num User Reviews'] = num_user_reviews
        data['Num Helpful Reviews'] = num_helpful_reviews
        data['Justification'] = justification

        # format the attribute name as model + n-shots
        # NOTE(review): the trailing 1 does not follow num_helpful_reviews;
        # left as is because existing graph attributes are keyed this way.
        n_shots = num_product_reviews + num_user_reviews + 1 
        attr_key = f"{model} -- {n_shots}-shot prompting"

        # find the Product-Recommended->User edge 
        for u, v, d in graph.in_edges(user, data=True): 
            if u == product_id:
                d[attr_key] = justification
                print(f"{u}-Recommended->{v} edge updated with {model}: {justification}")
                break
        else:
            # no such edge: say so instead of silently dropping the result
            print(f"No {product_id}-Recommended->{user} edge found; justification not stored on the graph")

        rows.append(data)

        time.sleep(sleep) 

    return "Justifications generated for all users"

## 3.2. Gemini

Test on one reviewer

In [None]:
# All user IDs, ordered from most to fewest reviews written.
reviewers_by_degree = [user_id for user_id, _count in sorted_reviewers]

['A2NYK9KWFMJV4Y',
 'A396ELNTQDFYPO',
 'A3YVD62FSVIUJ',
 'ADH0O8UVJOT10',
 'A2SUG35F6A6S3C',
 'AZV2U6GU5QA6C',
 'A1SD1C8XK3Z3V1',
 'A15TYOEWBQYF0X',
 'A1FOXJ8TMYVKRK',
 'ALHC64J88LVWO',
 'A1MVH1WLYDHZ49',
 'AJK15Q9JOEHRH',
 'A1GM01HSKCW77K',
 'A2U53XJZLPQWX2',
 'A1XEX009ZR6WXG',
 'A20JJ8634DG3FS',
 'A3G9836LYVXE7A',
 'A1PNV7C43HXKDI',
 'A1MHPGANXXGLC8',
 'A1T0CLFXA77001',
 'A2VDKEI5V937X4',
 'A1RPTVW5VEOSI',
 'A273KCH4QQ8VR0',
 'A1ESUYMIG3TZT0',
 'A1VPJT3BHZ9UAS',
 'A1DVUFG2QSJ6IK',
 'AKYDGCKCY7H9F',
 'A3EWG5C5KW5W9C',
 'A3M1PLEYNDEYO8',
 'A1A6VRGDEUFSPQ',
 'A1OGTL9EBDTK3N',
 'A3AOPVQ7EZHTWA',
 'A182VFGL9YR6SZ',
 'A3DILTDN3BLZP0',
 'A3C1GOWBI1PCB6',
 'A1F1C6B24D97CW',
 'A3H4S6YO9VNGDZ',
 'A129WUT5OMBKG',
 'A3J6QCWLBVP3HX',
 'A2EZWZ8MBEDOLN',
 'AKHWZ3S1UVZAO',
 'A2I7LG2P04A2IB',
 'A1EVEQAR05SY7L',
 'A1XSJ80M39HD7G',
 'A38TYXTNN72TK8',
 'A22Z554ZQ8NFPC',
 'A2T4ET67G37VE6',
 'A1DXBEKAZ0K6FI',
 'AWY308ST6QBNK',
 'A1GMWTGXW682GB',
 'A2EDYSY4M8TESS',
 'A2MZ8IKTPCMKPT',
 'A134J0S1HWE4F7',
 'A

In [34]:
# The user with the most reviews; used as the single test case below.
busiest_reviewer = top10_degree_users[0]

In [42]:
# Default settings: one review of each kind. The per-user record is appended
# to `rows`; on success the return value is only a status message.
top_reviewer_data = LLM_attribute(busiest_reviewer) 

************************************************** 

Raw Info:

{'Product': {'category': ['Musical Instruments', 'Live Sound & Stage', 'Stage & Studio Cables', 'Microphone Cables'], 'description': ['The CBI MLC Cables are built to last and priced to save you big money! An excellent Low-Z microphone cable, the CBI MLC offers high quality XLR male and female connectors and is compatible with any XLR connection-microphones, mixers, recorders. ..or anywhere else you need a high quality XLR connection.'], 'title': 'CBI MLC LowZ XLR Male to XLR Female Microphone Cable, 20 Feet', 'brand': 'CBI Cables', 'feature': ['Microphone cable for studio recording and live sound', 'High-quality XLR male and female connectors; 3-pin XLR male to XLR female', 'Protective metal housing; flexible PVC jacket with 6.0mm outer diameter', 'All copper conductors and inner copper spiral shielding; shielded cable reduces interference and excess noise', 'Measures 20-feet long; backed by a CBI Cables 10-Year Limited W

In [43]:
# Two reviews by similar users and two of the user's own reviews; one helpful review (default).
two_shot_tip = LLM_attribute(busiest_reviewer, num_product_reviews=2, num_user_reviews=2)

************************************************** 

Raw Info:

{'Product': {'category': ['Musical Instruments', 'Live Sound & Stage', 'Stage & Studio Cables', 'Microphone Cables'], 'description': ['The CBI MLC Cables are built to last and priced to save you big money! An excellent Low-Z microphone cable, the CBI MLC offers high quality XLR male and female connectors and is compatible with any XLR connection-microphones, mixers, recorders. ..or anywhere else you need a high quality XLR connection.'], 'title': 'CBI MLC LowZ XLR Male to XLR Female Microphone Cable, 20 Feet', 'brand': 'CBI Cables', 'feature': ['Microphone cable for studio recording and live sound', 'High-quality XLR male and female connectors; 3-pin XLR male to XLR female', 'Protective metal housing; flexible PVC jacket with 6.0mm outer diameter', 'All copper conductors and inner copper spiral shielding; shielded cable reduces interference and excess noise', 'Measures 20-feet long; backed by a CBI Cables 10-Year Limited W

three examples

In [45]:
# Three reviews by similar users and three of the user's own reviews; one helpful review (default).
three_shot_tip = LLM_attribute(busiest_reviewer, num_product_reviews=3, num_user_reviews=3)

************************************************** 

Raw Info:

{'Product': {'category': ['Musical Instruments', 'Live Sound & Stage', 'Stage & Studio Cables', 'Microphone Cables'], 'description': ['The CBI MLC Cables are built to last and priced to save you big money! An excellent Low-Z microphone cable, the CBI MLC offers high quality XLR male and female connectors and is compatible with any XLR connection-microphones, mixers, recorders. ..or anywhere else you need a high quality XLR connection.'], 'title': 'CBI MLC LowZ XLR Male to XLR Female Microphone Cable, 20 Feet', 'brand': 'CBI Cables', 'feature': ['Microphone cable for studio recording and live sound', 'High-quality XLR male and female connectors; 3-pin XLR male to XLR female', 'Protective metal housing; flexible PVC jacket with 6.0mm outer diameter', 'All copper conductors and inner copper spiral shielding; shielded cable reduces interference and excess noise', 'Measures 20-feet long; backed by a CBI Cables 10-Year Limited W

four examples

In [50]:
# Four reviews by similar users, four of the user's own reviews, and two helpful reviews.
four_shot_tip = LLM_attribute(busiest_reviewer, num_product_reviews=4, num_user_reviews=4, num_helpful_reviews=2)

************************************************** 

Raw Info:

{'Product': {'category': ['Musical Instruments', 'Live Sound & Stage', 'Stage & Studio Cables', 'Microphone Cables'], 'description': ['The CBI MLC Cables are built to last and priced to save you big money! An excellent Low-Z microphone cable, the CBI MLC offers high quality XLR male and female connectors and is compatible with any XLR connection-microphones, mixers, recorders. ..or anywhere else you need a high quality XLR connection.'], 'title': 'CBI MLC LowZ XLR Male to XLR Female Microphone Cable, 20 Feet', 'brand': 'CBI Cables', 'feature': ['Microphone cable for studio recording and live sound', 'High-quality XLR male and female connectors; 3-pin XLR male to XLR female', 'Protective metal housing; flexible PVC jacket with 6.0mm outer diameter', 'All copper conductors and inner copper spiral shielding; shielded cable reduces interference and excess noise', 'Measures 20-feet long; backed by a CBI Cables 10-Year Limited W

In [51]:
# Five reviews of each kind.
five_shot_tip = LLM_attribute(busiest_reviewer, num_product_reviews=5, num_user_reviews=5, num_helpful_reviews=5)

************************************************** 

Raw Info:

{'Product': {'category': ['Musical Instruments', 'Live Sound & Stage', 'Stage & Studio Cables', 'Microphone Cables'], 'description': ['The CBI MLC Cables are built to last and priced to save you big money! An excellent Low-Z microphone cable, the CBI MLC offers high quality XLR male and female connectors and is compatible with any XLR connection-microphones, mixers, recorders. ..or anywhere else you need a high quality XLR connection.'], 'title': 'CBI MLC LowZ XLR Male to XLR Female Microphone Cable, 20 Feet', 'brand': 'CBI Cables', 'feature': ['Microphone cable for studio recording and live sound', 'High-quality XLR male and female connectors; 3-pin XLR male to XLR female', 'Protective metal housing; flexible PVC jacket with 6.0mm outer diameter', 'All copper conductors and inner copper spiral shielding; shielded cable reduces interference and excess noise', 'Measures 20-feet long; backed by a CBI Cables 10-Year Limited W

## Loop

Running now until we hit the rate limit, going down the list of users by degree.

In [54]:
# Process every user, most prolific reviewer first, with five reviews of each kind.
# Results accumulate in `rows` and on the graph edges as the loop runs, so an
# interrupted run keeps whatever was completed.
gemini_5shot_justifications = LLM_attribute(reviewers_by_degree, num_product_reviews=5, num_user_reviews=5, num_helpful_reviews=5)

************************************************** 

Raw Info:

{'Product': {'category': ['Musical Instruments', 'Live Sound & Stage', 'Stage & Studio Cables', 'Microphone Cables'], 'description': ['The CBI MLC Cables are built to last and priced to save you big money! An excellent Low-Z microphone cable, the CBI MLC offers high quality XLR male and female connectors and is compatible with any XLR connection-microphones, mixers, recorders. ..or anywhere else you need a high quality XLR connection.'], 'title': 'CBI MLC LowZ XLR Male to XLR Female Microphone Cable, 20 Feet', 'brand': 'CBI Cables', 'feature': ['Microphone cable for studio recording and live sound', 'High-quality XLR male and female connectors; 3-pin XLR male to XLR female', 'Protective metal housing; flexible PVC jacket with 6.0mm outer diameter', 'All copper conductors and inner copper spiral shielding; shielded cable reduces interference and excess noise', 'Measures 20-feet long; backed by a CBI Cables 10-Year Limited W

KeyboardInterrupt: 

In [59]:
# `rows` holds a record for every LLM_attribute call made since it was last
# reset, so despite its name this frame also contains the 1- to 4-review runs.
gemini_5shot_df = pd.DataFrame(rows)
gemini_5shot_df.head()

Unnamed: 0,user,Product,Model,Num Product Reviews,Num User Reviews,Justification,Num Helpful Reviews
0,A2NYK9KWFMJV4Y,CBI MLC LowZ XLR Male to XLR Female Microphone...,gemini-2.5-flash-preview-04-17,1,1,Based on your experience with gear like mic an...,
1,A2NYK9KWFMJV4Y,CBI MLC LowZ XLR Male to XLR Female Microphone...,gemini-2.5-flash-preview-04-17,2,2,Based on your interest in musical equipment li...,
2,A2NYK9KWFMJV4Y,CBI MLC LowZ XLR Male to XLR Female Microphone...,gemini-2.5-flash-preview-04-17,3,3,This CBI microphone cable could be a great add...,
3,A2NYK9KWFMJV4Y,CBI MLC LowZ XLR Male to XLR Female Microphone...,gemini-2.5-flash-preview-04-17,4,4,Based on your experience with different musica...,2.0
4,A2NYK9KWFMJV4Y,CBI MLC LowZ XLR Male to XLR Female Microphone...,gemini-2.5-flash-preview-04-17,5,5,Based on your experience valuing dependable ge...,5.0


save graph 

In [67]:
# Persist the graph, including the justification attributes written onto its
# edges by LLM_attribute.
with open(os.path.join(graphs_dir, "graph_gemini-5shot.pkl"), 'wb') as f: 
    pickle.dump(G, f) 

# <i> other API approaches </i>

Through openrouter. 

<i> Note: </i> openrouter.ai is useful in that you can make calls to many different LLM APIs through the same service. The downside is that your traffic is routed through openrouter, so that rate limits are applied on ALL calls coming through openrouter, not just ours. If you hit the rate limit, you may need to either change the model to a less popular one, or try making calls directly to the source. 

In [38]:
import requests
import json 
from dotenv import load_dotenv 

# Send `task_prompt` to OpenRouter's chat-completions endpoint with a raw HTTP
# request and print the prompt followed by the model's reply.

# Load credentials from the project's .env file.
env_path = os.path.join(project_dir, ".env")
load_dotenv(env_path)

openrouter_api_key = os.getenv('OPENROUTER_API_KEY')
if not openrouter_api_key:
    # Fail early: otherwise the request is sent with the header "Bearer None"
    # and the problem only surfaces as an opaque error from the server.
    raise RuntimeError(
        f"OPENROUTER_API_KEY is not set. Add it to {env_path} or export it in the environment."
    )

response = requests.post(
    url="https://openrouter.ai/api/v1/chat/completions",
    headers={
        "Authorization": f"Bearer {openrouter_api_key}",
        "Content-Type": "application/json"
    },
    data=json.dumps({
        "model": "google/gemini-2.0-flash-exp:free",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": task_prompt
                    }
                ]
            }
        ]
    }),
    # (connect, read) timeouts in seconds. requests applies no timeout by
    # default, so a stalled connection would otherwise block the cell forever.
    timeout=(10, 120),
)

print("---Prompt---")
print(task_prompt)
print("\n---Model Response ---")
try:
    result = response.json()
    message = result['choices'][0]['message']['content']
    print(message)
except (ValueError, KeyError, IndexError, TypeError) as e:
    # ValueError: the body is not valid JSON.
    # KeyError / IndexError / TypeError: the JSON does not have the expected
    # choices -> message -> content shape (e.g. the service returned an error payload).
    print("Error extracting model response:", e)
    print("Raw response:", response.text)

---Prompt---

### TASK DESCRIPTION: 
Your task is to generate short recommendation tips for Amazon users based on information that will be provided to you in follow up prompts.
The information you will be given is structured in JSON format. 
The structure is as follows:
{
    Product: {<key-value pairs providing information on the recommended product>},
    Reviews by similar users: {
        <Ranked similarity with user, with 1 being the highest rank>: <review data as key value pairs>{
            <overall>: <rating this user gave to the product>, 
            <verified>: <if True, this review is known to derive from a trusted source>, 
            <reviewText>: <the text of the review itself>, 
            <summary>: <a brief summary of the review>
            },
    Reviews of similar product by user: {<same structure as above. These reviews are of the products most similar to the recommended product, and are written by the recomendee user.>}, 
    Helpful review of product: {<same 

In [37]:
import requests
import json 
from dotenv import load_dotenv 
from openai import OpenAI 

# Send `task_prompt` to OpenRouter through the OpenAI client (pointed at the
# OpenRouter base URL) and print the model's reply.

# Load credentials from the project's .env file.
env_path = os.path.join(project_dir, ".env")
load_dotenv(env_path)

openrouter_api_key = os.getenv('OPENROUTER_API_KEY')
if not openrouter_api_key:
    # Fail early with a message that names the variable this notebook actually uses.
    raise RuntimeError(
        f"OPENROUTER_API_KEY is not set. Add it to {env_path} or export it in the environment."
    )

client = OpenAI(
    base_url='https://openrouter.ai/api/v1',
    api_key=openrouter_api_key
)

completion = client.chat.completions.create(
    model="google/gemini-2.0-flash-exp:free",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": task_prompt
                }
            ]
        }
    ]
)
try:
    message = completion.choices[0].message.content
    print(message)
except (AttributeError, IndexError, TypeError) as e:
    # Raised when the completion has no usable choices / message
    # (choices missing, None, or empty).
    print("Error extracting model response:", e)
    # The completion may or may not carry an `error` field; read it defensively so
    # this handler cannot raise and hide the original problem.
    error_payload = getattr(completion, "error", None)
    if isinstance(error_payload, dict) and "message" in error_payload:
        print("Raw response:", error_payload['message'])
    else:
        print("Raw response:", completion)

Okay, I understand. I'm ready to receive the JSON data and will synthesize it into a short, personalized recommendation tip for the Amazon user. I will focus on extracting the most relevant information from the provided data to create a concise and helpful tip. I'll pay attention to:

*   **Product details:** Key aspects of the product that might appeal to the user.
*   **Similar user reviews:** Common themes, positive and negative, from users with similar tastes.
*   **User's past reviews of similar products:** How the user has reacted to similar items in the past.
*   **Helpful review:** Key points from the review that others have found most valuable.

Let's begin! I'm eager to see the data.

