In [None]:
%pip install ollama

In [None]:
%pip install matplotlib

In [3]:
%pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [6]:
%pip install nbimporter

Defaulting to user installation because normal site-packages is not writeable
Collecting nbimporter
  Downloading nbimporter-0.3.4-py3-none-any.whl (4.9 kB)
Installing collected packages: nbimporter
Successfully installed nbimporter-0.3.4
Note: you may need to restart the kernel to use updated packages.


In [None]:
%pip install nbformat

ModuleNotFoundError: No module named 'vector_only'

In [2]:
!ollama list

NAME           ID              SIZE      MODIFIED     
qwen3:32b      030ee887880f    20 GB     27 hours ago    
gpt-oss:20b    aa4295ac10c3    13 GB     27 hours ago    
gemma3:12b     f4031aab637d    8.1 GB    27 hours ago    
llama3.2:1b    baf6a787fdff    1.3 GB    27 hours ago    
mistral:7b     6577803aa9a0    4.4 GB    27 hours ago    
r1-1776:70b    140ea940f21d    42 GB     6 days ago      
gemma3:270m    e7d36fb2c3b3    291 MB    6 days ago      


In [None]:
!ollama pull mistral:7b

In [None]:
!ollama pull llama3.2:1b

In [None]:
!ollama pull gemma3:12b

In [None]:
!ollama pull gpt-oss:20b

In [None]:
import json
from typing import List, Dict

def load_users_with_high_rated_books(
    jsonl_path: str,
    min_rating: int = 1
) -> List[Dict]:
    """
    Read a JSONL file of users and keep, per user, the books rated >= min_rating.

    The loader is best-effort: a single bad record never aborts the whole load.
    - Blank lines are ignored.
    - Lines that are not valid JSON are reported and skipped.
    - Lines whose JSON value is not an object are reported and skipped.
    - Book entries without an "ISBN", or whose "Rating" is missing, null or
      not a number, are dropped (they can neither be compared to min_rating
      nor joined to book metadata later).

    Args:
        jsonl_path: Path to a UTF-8 file with one JSON user object per line.
            Each object may carry "User-ID", "Age", "Location" and a "Books"
            list of {"ISBN": ..., "Rating": ...} entries.
        min_rating: Inclusive lower bound on a book's rating.

    Returns:
        A list of dicts, one per accepted line, in file order:
        {
            "user_id": value of "User-ID" (None if absent),
            "age": value of "Age" (None if absent),
            "location": value of "Location" (None if absent),
            "high_rated_books": [
                {"isbn": ..., "rating": ...},
                ...
            ]
        }
    """
    users = []

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue

            try:
                user = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping line {line_num}: JSON error → {e}")
                continue

            if not isinstance(user, dict):
                print(f"Skipping line {line_num}: expected a JSON object")
                continue

            high_rated_books = []
            # `or []` also covers an explicit `"Books": null`.
            for book in user.get("Books") or []:
                if not isinstance(book, dict):
                    continue

                isbn = book.get("ISBN")
                rating = book.get("Rating")

                # A null/non-numeric rating cannot be compared with
                # min_rating; bool is excluded because it is an int subclass.
                if isinstance(rating, bool) or not isinstance(rating, (int, float)):
                    continue
                if isbn is None:
                    continue

                if rating >= min_rating:
                    high_rated_books.append({"isbn": isbn, "rating": rating})

            users.append({
                "user_id": user.get("User-ID"),
                "age": user.get("Age"),
                "location": user.get("Location"),
                "high_rated_books": high_rated_books
            })

    return users

In [None]:
# Path to the per-user ratings file (JSONL, read line by line by the loader).
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere without editing it.
jsonl_file = "/users/guest/a/annegg03/LLM_Semantics/personalization/Users_filtered.jsonl"

# Exploratory load with min_rating=1. `users` is reassigned by a later cell
# that reloads the same file with min_rating=7.
users = load_users_with_high_rated_books(jsonl_file, min_rating=1)

# Example: print first user
# (users[0] raises IndexError if the file produced no user records.)
print(users[0])
print(f"Total users: {len(users)}")

{'user_id': 709, 'age': 14.0, 'location': 'roanoke, virginia, usa', 'high_rated_books': [{'isbn': '0064405052', 'rating': 9}, {'isbn': '0152162445', 'rating': 10}, {'isbn': '015216250X', 'rating': 10}, {'isbn': '0152162577', 'rating': 10}, {'isbn': '0345391802', 'rating': 7}, {'isbn': '0439042917', 'rating': 9}, {'isbn': '043922165X', 'rating': 9}, {'isbn': '0440219078', 'rating': 10}, {'isbn': '0440415993', 'rating': 10}, {'isbn': '0590897985', 'rating': 9}, {'isbn': '0689832877', 'rating': 10}, {'isbn': '1562477528', 'rating': 9}]}
Total users: 158


In [3]:
import json

def load_book_metadata(metadata: str) -> dict[str, dict]:
    """
    Build an ISBN -> record lookup from a JSONL metadata file.

    Args:
        metadata: Path to a UTF-8 file with one JSON record per line.

    Returns:
        A dict keyed by each record's "ISBN" value, mapping to the full
        parsed record. Blank lines are ignored, unparseable lines are
        reported and skipped, and records with a missing or empty "ISBN"
        are left out. When an ISBN occurs more than once the last record wins.
    """
    by_isbn = {}

    with open(metadata, "r", encoding="utf-8") as handle:
        for line_num, raw in enumerate(handle, start=1):
            text = raw.strip()
            if text:
                try:
                    record = json.loads(text)
                except json.JSONDecodeError as e:
                    print(f"Skipping metadata line {line_num}: {e}")
                else:
                    key = record.get("ISBN")
                    if key:
                        by_isbn[key] = record

    return by_isbn

In [4]:
def enrich_books_with_metadata(users: list[dict], metadata: dict[str, dict]) -> list[dict]:
    """
    Attach title/author/description/subjects/categories to every rated book.

    Each user's "high_rated_books" list is replaced in place by a new list of
    dicts carrying the original "isbn" and "rating" plus the metadata fields
    looked up by ISBN. Fields are None when the ISBN has no (or an empty)
    metadata record.

    Args:
        users: User dicts, each with a "high_rated_books" list of
            {"isbn", "rating"} dicts.
        metadata: ISBN -> metadata record lookup.

    Returns:
        The same `users` list object that was passed in.
    """
    # (output key, metadata key) pairs, in the order they appear in the result
    field_map = (
        ("title", "Book-Title"),
        ("author", "Book-Author"),
        ("description", "description"),
        ("subjects", "LoC_subjects"),
        ("categories", "Google_categories"),
    )

    for user in users:
        enriched = []

        for rated in user["high_rated_books"]:
            isbn = rated["isbn"]
            record = metadata.get(isbn)

            entry = {"isbn": isbn, "rating": rated["rating"]}
            for out_key, meta_key in field_map:
                if record:
                    entry[out_key] = record.get(meta_key)
                else:
                    entry[out_key] = None
            enriched.append(entry)

        user["high_rated_books"] = enriched

    return users

In [5]:
# Reload the users, this time keeping only books rated 7 or higher.
# This replaces any `users` list created by an earlier cell.
users = load_users_with_high_rated_books(
    "/users/guest/a/annegg03/LLM_Semantics/personalization/Users_filtered.jsonl",
    min_rating=7
)

# ISBN -> metadata record lookup used to enrich the rated books.
metadata = load_book_metadata("/users/guest/a/annegg03/LLM_Semantics/personalization/combined_books_with_metadata.jsonl")

# Adds title/author/description/subjects/categories to each rated book
# (the user dicts are modified in place and the same list is returned).
users = enrich_books_with_metadata(users, metadata)

In [7]:
import random

def build_random_book_pool(isbn_to_metadata: dict[str, dict]) -> list[dict]:
    """
    Collect every book that has both a title and a description.

    Args:
        isbn_to_metadata: ISBN -> metadata record lookup.

    Returns:
        One dict per qualifying record, in the lookup's iteration order, with
        the keys "isbn", "title", "author", "description", "subjects" and
        "categories". Records whose "Book-Title" or "description" is missing
        or None are left out.
    """
    return [
        {
            "isbn": record.get("ISBN"),
            "title": record.get("Book-Title"),
            "author": record.get("Book-Author"),
            "description": record.get("description"),
            "subjects": record.get("LoC_subjects"),
            "categories": record.get("Google_categories"),
        }
        for record in isbn_to_metadata.values()
        if not (
            record.get("Book-Title") is None
            or record.get("description") is None
        )
    ]

In [18]:
# Pool of books that have both a title and a description, built from the
# metadata lookup.
# NOTE(review): `random_book_pool` is not referenced by any other cell visible
# in this notebook — confirm whether it is still needed.
random_book_pool = build_random_book_pool(metadata)

In [8]:
def select_preference_books(users: list[dict], k: int = 5) -> list[dict]:
    """
    Pick each user's top-k rated books that have a title and a description.

    The selection is written to user["preference_books"] (the user dicts are
    modified in place). Books are ordered by rating, highest first; books
    with equal ratings keep their original relative order.

    Args:
        users: User dicts with an optional "high_rated_books" list.
        k: Maximum number of books to keep per user.

    Returns:
        The same `users` list object that was passed in.
    """
    def rating_of(book):
        return book.get("rating", 0)

    for user in users:
        # Only books with usable metadata can serve as preference context.
        usable = []
        for book in user.get("high_rated_books", []):
            if book.get("title") is None:
                continue
            if book.get("description") is None:
                continue
            usable.append(book)

        # Stable descending sort, then truncate to the first k entries.
        ranked = sorted(usable, key=rating_of, reverse=True)
        user["preference_books"] = ranked[:k]

    return users

In [9]:
# Store up to 5 top-rated books with title and description on each user as
# user["preference_books"].
users = select_preference_books(users, k=5)

In [21]:
def get_unused_high_rated_books(user: dict, k: int = 5) -> list[dict]:
    """
    Return up to k high-rated books that are not among the preference books.

    Only books with both a title and a description are considered. The result
    is ordered by rating, highest first; ties keep their original order.

    Args:
        user: User dict with optional "preference_books" and
            "high_rated_books" lists.
        k: Maximum number of books to return.
    """
    context_isbns = set()
    for pref in user.get("preference_books", []):
        context_isbns.add(pref["isbn"])

    def is_candidate(book):
        if book["isbn"] in context_isbns:
            return False
        if book.get("title") is None:
            return False
        return book.get("description") is not None

    held_out = sorted(
        filter(is_candidate, user.get("high_rated_books", [])),
        key=lambda book: book.get("rating", 0),
        reverse=True,
    )

    return held_out[:k]

In [11]:
def sample_random_books(
    random_pool: list[dict],
    exclude_isbns: set[str],
    k: int = 5
) -> list[dict]:
    """
    Sample up to k distinct random books whose ISBN is not excluded.

    Args:
        random_pool: Candidate books; each dict must have an "isbn" key.
        exclude_isbns: ISBNs that must not appear in the sample.
        k: Desired sample size.

    Returns:
        A list of k books drawn without replacement, or every remaining
        candidate (in random order) when fewer than k are left after
        exclusion. Empty when nothing is left.

    Raises:
        ValueError: If k is negative (raised by random.sample).
    """

    candidates = [
        book for book in random_pool
        if book["isbn"] not in exclude_isbns
    ]

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of available candidates.
    return random.sample(candidates, min(k, len(candidates)))

In [12]:
def build_ranking_candidates(
    user: dict,
    random_pool: list[dict],
    positives_k: int = 5,
    random_k: int = 5
) -> list[dict]:
    """
    Build the ranking candidates for one user: positives first, then randoms.

    - Positives: up to `positives_k` high-rated books that were not used as
      preference context (fewer if the user does not have that many).
    - Negatives: `random_k` books sampled from `random_pool`.

    With the defaults this yields at most 10 books.

    Negatives never include a book the user is known to have rated highly:
    every ISBN in the user's "preference_books" and "high_rated_books" is
    excluded from sampling, not only the selected positives. Otherwise a
    highly rated book left over after the top `positives_k` cut could be
    drawn as a "random" negative.

    Args:
        user: User dict with optional "preference_books" and
            "high_rated_books" lists.
        random_pool: Books to draw negatives from; each needs an "isbn" key.
        positives_k: Maximum number of positives.
        random_k: Number of random books to sample.

    Returns:
        The positives followed by the sampled random books.
    """

    positives = get_unused_high_rated_books(user, positives_k)

    seen_isbns = {
        b["isbn"] for b in user.get("preference_books", [])
    } | {
        b["isbn"] for b in positives
    } | {
        b["isbn"] for b in user.get("high_rated_books", [])
    }

    negatives = sample_random_books(
        random_pool,
        exclude_isbns=seen_isbns,
        k=random_k
    )

    return positives + negatives

In [29]:
import json

def load_emotion_vectors_with_all_books(jsonl_path: str) -> dict[int, dict]:
    """
    Load per-user emotion vectors and preference books from a JSONL file.

    Each line is expected to be a JSON object with the keys "user",
    "user_emotion_vector" and, optionally, "preference_books".

    Args:
        jsonl_path: Path to the UTF-8 JSONL file.

    Returns:
        A dict mapping user id -> {"emotion_vector": dict,
        "preference_books": list}. Blank lines are ignored, lines that are
        not valid JSON are reported and skipped, and records without a user
        id or with a missing/empty emotion vector are left out. When a user
        id occurs more than once the last record wins.
    """
    emotion_data = {}

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping line: JSON error → {e}")
                continue

            user_id = record.get("user")
            emotion_vector = record.get("user_emotion_vector")
            preference_books = record.get("preference_books", [])

            # Test for "no id" explicitly instead of by truthiness, so that a
            # legitimate user id of 0 is not silently dropped.
            if user_id is None or user_id == "":
                continue
            if not emotion_vector:
                continue

            emotion_data[user_id] = {
                "emotion_vector": emotion_vector,
                "preference_books": preference_books
            }

    return emotion_data

In [31]:
# Load emotion vectors with preference books
# user id -> {"emotion_vector": ..., "preference_books": ...}, read from the
# precomputed JSONL file.
emotion_data = load_emotion_vectors_with_all_books(
    "/users/guest/a/annegg03/LLM_Semantics/personalization/combination/user_emotion_vectors_for_all_books_with_preferences.jsonl"
)

# Enrich users with emotion vectors
# Only the "emotion_vector" entry is copied; the "preference_books" loaded
# from the file are not used here. Users without an entry get an empty dict.
# NOTE(review): the lookup assumes the file's "user" values have the same
# type as the users' "user_id" (e.g. both int) — confirm, otherwise every
# user silently falls into the empty-vector branch.
for user in users:
    user_id = user["user_id"]
    if user_id in emotion_data:
        user["emotion_vector"] = emotion_data[user_id]["emotion_vector"]
    else:
        user["emotion_vector"] = {}

In [32]:
def get_all_remaining_books(user: dict) -> list[dict]:
    """
    Return every high-rated book that is not among the preference books.

    Only books with both a title and a description are kept. The result is
    ordered by rating, highest first; ties keep their original order.

    Args:
        user: User dict with optional "preference_books" and
            "high_rated_books" lists.
    """
    excluded = {entry["isbn"] for entry in user.get("preference_books", [])}

    leftovers = []
    for candidate in user.get("high_rated_books", []):
        if candidate["isbn"] in excluded:
            continue
        if candidate.get("title") is None or candidate.get("description") is None:
            continue
        leftovers.append(candidate)

    return sorted(leftovers, key=lambda book: book.get("rating", 0), reverse=True)

In [33]:
def build_ranking_prompt_with_user_profile(
    preference_books: list[dict],
    ranking_books: list[dict],
    emotion_vector: dict = None,
    include_candidate_ratings: bool = True,
) -> str:
    """
    Build the LLM prompt: a user profile followed by the books to rank.

    The profile lists the preference books with their ratings and, when an
    emotion vector is given, one line per emotion. The candidates are listed
    in shuffled order to avoid order bias; `ranking_books` itself is not
    modified.

    Args:
        preference_books: Books shown as the user's taste; each needs
            "title", "author" and "rating".
        ranking_books: Books to be ranked; each needs "title" and "author".
            "rating" is optional — candidates without one (for example books
            drawn from a random pool) are listed without a rating suffix
            instead of raising KeyError.
        emotion_vector: Optional emotion -> score mapping. Numeric scores are
            rendered as percentages; any other value is printed as-is.
        include_candidate_ratings: When True (the default, matching the
            previous behaviour) each candidate line carries the user's own
            rating for that book. NOTE(review): that rating is the signal the
            model is asked to predict, so leaving it in the prompt leaks the
            answer; pass False for an unbiased ranking experiment.

    Returns:
        The complete prompt string.
    """

    # Shuffle ranking books to avoid order bias
    shuffled_books = ranking_books[:]
    random.shuffle(shuffled_books)

    # --- User Profile Section ---
    profile_parts = ["USER PROFILE\n\n", "Books this user has rated highly:\n\n"]
    for i, book in enumerate(preference_books, start=1):
        profile_parts.append(
            f"{i}. {book['title']} by {book['author']} (Rating: {book['rating']}/10)\n"
        )

    # --- Emotion vector context ---
    if emotion_vector:
        profile_parts.append("\nEmotional preferences derived from reading history:\n\n")
        for emotion, score in emotion_vector.items():
            if isinstance(score, (int, float)):
                profile_parts.append(f"- {emotion}: {score:.2%}\n")
            else:
                # The percent format only works on numbers; show anything
                # else verbatim rather than crashing the whole prompt build.
                profile_parts.append(f"- {emotion}: {score}\n")

    user_profile = "".join(profile_parts)

    # --- Ranking candidates ---
    candidate_lines = []
    for book in shuffled_books:
        rating = book.get("rating") if include_candidate_ratings else None
        if rating is not None:
            candidate_lines.append(
                f"- {book['title']} by {book['author']} (Rating: {rating}/10)\n"
            )
        else:
            candidate_lines.append(f"- {book['title']} by {book['author']}\n")

    # Each line already ends in "\n", so joining with "\n" leaves a blank
    # line between candidates (same layout as before).
    candidates_text = "\n".join(candidate_lines)

    # --- Final prompt ---
    prompt = f"""
You are a book recommendation system specializing in personalized rankings.

{user_profile}

Based on this user profile, rank the following books from MOST to LEAST likely to match this user's preferences.

Books to rank:

{candidates_text}

Instructions:
- Rank ALL books listed above based on alignment with the user's demonstrated preferences and emotional profile
- Consider the themes, tone, and genres of the user's highly-rated books
- Factor in the user's emotional preferences when making your ranking
- Do NOT use general popularity or quality metrics

IMPORTANT CONSTRAINTS:
- Rank ALL books listed under "Books to rank"
- Output ONLY a ranked numbered list with book titles
- Do NOT introduce any new books
- Do NOT reference the preference books in your ranking

Output format:
Return ONLY a numbered ranked list with the book titles:

1. Book Title 1
2. Book Title 2
3. Book Title 3
...
"""

    return prompt

In [None]:
"""You are a recommendation system.
Your task is to rank books based on how likely a specific user would enjoy them, given their past preferences."""

In [34]:
import ollama

def run_ollama(prompt, model):
    """
    Send `prompt` as a single user message to `model` via ollama.chat.

    Returns the content of the response message with surrounding whitespace
    stripped. Exceptions raised by the ollama call are not caught here.
    """
    conversation = [{"role": "user", "content": prompt}]
    reply = ollama.chat(model=model, messages=conversation)
    text = reply["message"]["content"]
    return text.strip()

In [35]:
def format_candidate_books(ranking_books, user):
    """
    Reduce candidate books to {"title", optional "user_rating"} records.

    The rating is looked up by ISBN in the user's "high_rated_books"; the
    "user_rating" key is only present when a non-None rating is found there.
    The output order follows `ranking_books`.
    """
    rating_lookup = {}
    for rated in user.get("high_rated_books", []):
        rating_lookup[rated["isbn"]] = rated["rating"]

    def to_entry(book):
        row = {"title": book["title"]}
        known_rating = rating_lookup.get(book["isbn"])
        if known_rating is not None:
            row["user_rating"] = known_rating
        return row

    return [to_entry(book) for book in ranking_books]

In [36]:
def run_sample_on_models_new(users_sample, models):
    """
    Query every model once per user with the profile-based ranking prompt.

    For each user the candidates are all high-rated books not used as
    preference context; users with no such books are skipped. One prompt is
    built per user and sent unchanged to every model.

    Returns a list with one dict per (user, model) pair holding "user_id",
    "model", "prompt", "candidate_books" and "llm_output". If a model call
    raises, "llm_output" holds the text "Error: <exception>" instead.
    """
    def ask(model_name, text):
        # Best-effort: a failing model must not abort the whole run.
        try:
            return run_ollama(text, model_name)
        except Exception as e:
            return f"Error: {e}"

    collected = []

    for user in users_sample:
        context_books = user.get("preference_books", [])
        to_rank = get_all_remaining_books(user)
        emotions = user.get("emotion_vector", {})

        # Nothing to rank for this user.
        if not to_rank:
            continue

        prompt = build_ranking_prompt_with_user_profile(context_books, to_rank, emotions)
        candidate_books = format_candidate_books(to_rank, user)

        for model in models:
            answer = ask(model, prompt)
            record = {
                "user_id": user["user_id"],
                "model": model,
                "prompt": prompt,
                "candidate_books": candidate_books,
                "llm_output": answer
            }
            collected.append(record)

    return collected

In [37]:
# Run on every loaded user (no sub-sampling) and on each model listed below.
# One LLM call is made per (user, model) pair.
users_sample = users
models = ['llama3.2:1b', 'mistral:7b', 'gemma3:12b'] #'gpt-oss:20b'

results = run_sample_on_models_new(
    users_sample,
    models
)

In [38]:
import json

# Persist all (user, model) results as pretty-printed UTF-8 JSON.
# The relative path is resolved against the kernel's working directory, and
# an existing file of that name is overwritten.
with open("combined_ranking_without_descriptions_all_users_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)