In [None]:
%pip install ollama

In [None]:
%pip install matplotlib

In [3]:
%pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [6]:
%pip install nbimporter

Defaulting to user installation because normal site-packages is not writeable
Collecting nbimporter
  Downloading nbimporter-0.3.4-py3-none-any.whl (4.9 kB)
Installing collected packages: nbimporter
Successfully installed nbimporter-0.3.4
Note: you may need to restart the kernel to use updated packages.


In [None]:
%pip install nbformat

ModuleNotFoundError: No module named 'vector_only'

In [2]:
!ollama list

NAME           ID              SIZE      MODIFIED     
qwen3:32b      030ee887880f    20 GB     27 hours ago    
gpt-oss:20b    aa4295ac10c3    13 GB     27 hours ago    
gemma3:12b     f4031aab637d    8.1 GB    27 hours ago    
llama3.2:1b    baf6a787fdff    1.3 GB    27 hours ago    
mistral:7b     6577803aa9a0    4.4 GB    27 hours ago    
r1-1776:70b    140ea940f21d    42 GB     6 days ago      
gemma3:270m    e7d36fb2c3b3    291 MB    6 days ago      


In [None]:
!ollama pull mistral:7b

In [None]:
!ollama pull llama3.2:1b

In [None]:
!ollama pull gemma3:12b

In [None]:
!ollama pull gpt-oss:20b

In [2]:
import json
from typing import List, Dict

def load_users_with_high_rated_books(
    jsonl_path: str,
    min_rating: int = 7
) -> List[Dict]:
    """
    Reads a JSONL file of users and extracts books with rating >= min_rating.

    Blank lines, lines that are not valid JSON, and lines whose JSON value is
    not an object are skipped (the latter two with a printed notice). Books
    with a missing/null rating or a missing ISBN are ignored instead of
    aborting the whole load.

    Args:
        jsonl_path: Path to a UTF-8 JSONL file, one user object per line.
        min_rating: Minimum rating (inclusive) for a book to be kept.

    Returns a list of dicts:
    {
        "user_id": int,
        "age": float,
        "location": str,
        "high_rated_books": [
            {"isbn": str, "rating": int},
            ...
        ]
    }
    """
    users = []

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue

            try:
                user = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping line {line_num}: JSON error → {e}")
                continue

            # A line such as `[]` or `42` is valid JSON but not a user record.
            if not isinstance(user, dict):
                print(f"Skipping line {line_num}: expected a JSON object")
                continue

            high_rated_books = []
            # `or []` also covers an explicit `"Books": null`.
            for book in user.get("Books") or []:
                isbn = book.get("ISBN")
                rating = book.get("Rating")
                # A null rating cannot be compared with an int, and a book
                # without an ISBN cannot be joined to metadata later.
                if isbn is None or rating is None:
                    continue
                if rating >= min_rating:
                    high_rated_books.append({"isbn": isbn, "rating": rating})

            users.append({
                "user_id": user.get("User-ID"),
                "age": user.get("Age"),
                "location": user.get("Location"),
                "high_rated_books": high_rated_books
            })

    return users

In [3]:
# Path of the users JSONL file read by load_users_with_high_rated_books.
jsonl_file = "Users_filtered.jsonl"

# Keep, for every user, only the books rated 7 or higher.
users = load_users_with_high_rated_books(jsonl_file, min_rating=7)

# Example: print first user
# NOTE(review): users[0] raises IndexError if the file yields no users.
print(users[0])
print(f"Total users: {len(users)}")

{'user_id': 709, 'age': 14.0, 'location': 'roanoke, virginia, usa', 'high_rated_books': [{'isbn': '0064405052', 'rating': 9}, {'isbn': '0152162445', 'rating': 10}, {'isbn': '015216250X', 'rating': 10}, {'isbn': '0152162577', 'rating': 10}, {'isbn': '0345391802', 'rating': 7}, {'isbn': '0439042917', 'rating': 9}, {'isbn': '043922165X', 'rating': 9}, {'isbn': '0440219078', 'rating': 10}, {'isbn': '0440415993', 'rating': 10}, {'isbn': '0590897985', 'rating': 9}, {'isbn': '0689832877', 'rating': 10}, {'isbn': '1562477528', 'rating': 9}]}
Total users: 180


In [4]:
import json

def load_book_metadata(metadata: str) -> dict[str, dict]:
    """
    Builds an ISBN -> record lookup from a JSONL metadata file.

    Each non-blank line of the file at path `metadata` is parsed as JSON.
    Unparseable lines are reported and skipped; records with a missing or
    empty "ISBN" are dropped. When an ISBN occurs more than once, the last
    record wins.
    """
    by_isbn: dict[str, dict] = {}

    with open(metadata, "r", encoding="utf-8") as handle:
        for number, raw in enumerate(handle, start=1):
            text = raw.strip()
            if text:
                try:
                    entry = json.loads(text)
                except json.JSONDecodeError as e:
                    print(f"Skipping metadata line {number}: {e}")
                else:
                    key = entry.get("ISBN")
                    if key:
                        by_isbn[key] = entry

    return by_isbn

In [5]:
def enrich_books_with_metadata(users: list[dict], metadata: dict[str, dict]) -> list[dict]:
    """
    Attaches title/author/description/subjects/categories to each user's
    high-rated books, looked up by ISBN in `metadata`.

    Books without a metadata record keep their isbn and rating and get None
    for every metadata field. `users` is modified in place (each user's
    "high_rated_books" list is replaced) and also returned.
    """
    # (output key, metadata key) pairs, in the order they appear in each entry.
    field_sources = (
        ("title", "Book-Title"),
        ("author", "Book-Author"),
        ("description", "description"),
        ("subjects", "LoC_subjects"),
        ("categories", "Google_categories"),
    )

    for user in users:
        enriched = []
        for rated in user["high_rated_books"]:
            # A missing (or empty) record behaves like one with no fields.
            record = metadata.get(rated["isbn"]) or {}
            entry = {"isbn": rated["isbn"], "rating": rated["rating"]}
            for out_key, source_key in field_sources:
                entry[out_key] = record.get(source_key)
            enriched.append(entry)

        user["high_rated_books"] = enriched

    return users

In [6]:
# Reload the users from disk, keeping only books rated 7 or higher.
users = load_users_with_high_rated_books(
    "Users_filtered.jsonl",
    min_rating=7
)

# ISBN -> metadata record lookup built from the combined metadata file.
metadata = load_book_metadata("combined_books_with_metadata.jsonl")

# Add title/author/description/subjects/categories to every high-rated book.
users = enrich_books_with_metadata(users, metadata)

In [8]:
import random

def build_random_book_pool(isbn_to_metadata: dict[str, dict]) -> list[dict]:
    """
    Collects every metadata record that can be shown to the model.

    A record qualifies when both its "Book-Title" and its "description" are
    not None. Each qualifying record is converted to the notebook's book
    shape (isbn, title, author, description, subjects, categories).
    """
    def to_pool_entry(record: dict) -> dict:
        return {
            "isbn": record.get("ISBN"),
            "title": record.get("Book-Title"),
            "author": record.get("Book-Author"),
            "description": record.get("description"),
            "subjects": record.get("LoC_subjects"),
            "categories": record.get("Google_categories"),
        }

    return [
        to_pool_entry(record)
        for record in isbn_to_metadata.values()
        if record.get("Book-Title") is not None
        and record.get("description") is not None
    ]

In [10]:
def select_preference_books(users: list[dict], k: int = 5) -> list[dict]:
    """
    Stores, for every user, the top-k highly rated books that have usable
    metadata under user['preference_books'].

    "Usable" means both title and description are not None. Books are ordered
    by rating, highest first; ties keep their original relative order.
    The list is modified in place and also returned.
    """
    def rating_of(book: dict):
        return book.get("rating", 0)

    for user in users:
        usable = (
            candidate
            for candidate in user.get("high_rated_books", [])
            if candidate.get("title") is not None
            and candidate.get("description") is not None
        )
        # sorted() is stable, so equally rated books stay in input order.
        ranked = sorted(usable, key=rating_of, reverse=True)
        user["preference_books"] = ranked[:k]

    return users

In [11]:
# Store each user's top 5 usable high-rated books as user["preference_books"].
users = select_preference_books(users, k=5)

In [12]:
def get_unused_high_rated_books(user: dict, k: int = 5) -> list[dict]:
    """
    Picks up to k high-rated books that are not among the user's
    preference books and that have both a title and a description.

    The result is ordered by rating, highest first; ties keep their original
    relative order.
    """
    already_used = set()
    for chosen in user.get("preference_books", []):
        already_used.add(chosen["isbn"])

    leftovers = []
    for book in user.get("high_rated_books", []):
        if book["isbn"] in already_used:
            continue
        if book.get("title") is None or book.get("description") is None:
            continue
        leftovers.append(book)

    ranked = sorted(leftovers, key=lambda item: item.get("rating", 0), reverse=True)
    return ranked[:k]

In [13]:
def sample_random_books(
    random_pool: list[dict],
    exclude_isbns: set[str],
    k: int = 5
) -> list[dict]:
    """
    Samples up to k random books, excluding ISBNs already seen by the user.

    Args:
        random_pool: Candidate books; each must have an "isbn" key.
        exclude_isbns: ISBNs that must not be sampled.
        k: Desired number of books.

    Returns:
        A list of k distinct books from the pool, or fewer when less than k
        books remain after exclusion (possibly an empty list).

    Raises:
        ValueError: If k is negative (raised by random.sample).
    """

    candidates = [
        book for book in random_pool
        if book["isbn"] not in exclude_isbns
    ]

    # random.sample raises ValueError when asked for more items than exist;
    # clamp so that a small pool yields a short sample instead of a crash.
    sample_size = min(k, len(candidates))

    return random.sample(candidates, sample_size)

In [14]:
def build_ranking_candidates(
    user: dict,
    random_pool: list[dict],
    positives_k: int = 5,
    random_k: int = 5
) -> list[dict]:
    """
    Builds the candidate list the model is asked to rank.

    Returns up to positives_k + random_k books (10 with the defaults):
    - up to positives_k high-rated books not used as preference context
      (positives), followed by
    - random_k books sampled from random_pool (negatives).

    Every book in the user's high-rated list is excluded from the random
    sample, not only the preference books and the chosen positives.
    Otherwise a book the user rated highly could be drawn as a "random"
    negative.
    """

    positives = get_unused_high_rated_books(user, positives_k)

    seen_isbns = {
        b["isbn"] for b in user.get("preference_books", [])
    } | {
        b["isbn"] for b in positives
    } | {
        # Remaining high-rated books (beyond the top-k slices above).
        b["isbn"] for b in user.get("high_rated_books", [])
    }

    negatives = sample_random_books(
        random_pool,
        exclude_isbns=seen_isbns,
        k=random_k
    )

    return positives + negatives

In [16]:
import random

def build_ranking_prompt_titles(
    preference_books: list[dict],
    ranking_books: list[dict],
) -> str:
    """
    Assembles the title-only ranking prompt sent to the LLM.

    The preference books are listed (numbered) as context, and the candidate
    books are listed in a random order so that their position in the input
    list cannot bias the model. Only title and author are included;
    descriptions and ratings are deliberately left out of this prompt variant.
    The caller's `ranking_books` list is not reordered.
    """

    # Work on a copy so the shuffle does not disturb the caller's list.
    candidates_in_random_order = list(ranking_books)
    random.shuffle(candidates_in_random_order)

    # Numbered preference context, one entry per book.
    preference_section = "\n".join(
        f"{position}. {pref['title']} by {pref['author']}\n"
        for position, pref in enumerate(preference_books, start=1)
    )

    # Bulleted candidate list in shuffled order.
    candidate_section = "\n".join(
        f"- {cand['title']} by {cand['author']}\n"
        for cand in candidates_in_random_order
    )

    return f"""
You are a book recommendation system.

USER PREFERENCE CONTEXT (DO NOT RANK)

The following books were rated very highly by the user and are provided
ONLY to infer reading preferences.

- DO NOT recommend, rank, list, or mention these books again.
- These books are NOT candidates.
- They MUST NOT appear in the final ranking.

{preference_section}

Now consider the following candidate books. Rank them from MOST likely to LEAST likely
to be recommended to this user based on the preferences above. Use the book titles
in your ranking.

Candidate books:

{candidate_section}

Instructions:
- Rank all candidate books from best to worst match for this user using their titles
- Base your ranking ONLY on the user's inferred preferences
- Do NOT use popularity or general quality

IMPORTANT CONSTRAINTS:
- Rank ONLY the books listed under "Candidate books"
- DO NOT include preference books
- DO NOT introduce any new books
- Output ONLY a ranked list of candidate book titles

Output format:
Return only a numbered ranked list with the book titles, for example:

1. Book Title 1
2. Book Title 2
3. Book Title 3
...
"""

In [None]:
# Bare string expression: it is not assigned to any name, so running this cell
# only echoes the text. NOTE(review): looks like a draft alternative prompt
# intro; nothing visible in this notebook reads it — confirm before removing.
"""You are a recommendation system.
Your task is to rank books based on how likely a specific user would enjoy them, given their past preferences."""

In [17]:
import ollama

def run_ollama(prompt, model):
    """
    Sends `prompt` as a single user message to the given Ollama model and
    returns the reply text with surrounding whitespace stripped.
    """
    conversation = [{"role": "user", "content": prompt}]
    reply = ollama.chat(model=model, messages=conversation)
    reply_text = reply["message"]["content"]
    return reply_text.strip()

In [18]:
def format_candidate_books(ranking_books, user):
    """
    Summarises the candidate books for the results file.

    Returns one dict per candidate, in input order, containing its "title"
    and, when the candidate's ISBN appears in the user's high-rated books
    with a non-None rating, that rating as "user_rating".
    """
    known_ratings = {}
    for rated in user.get("high_rated_books", []):
        known_ratings[rated["isbn"]] = rated["rating"]

    def summarise(candidate):
        summary = {"title": candidate["title"]}
        score = known_ratings.get(candidate["isbn"])
        if score is not None:
            summary["user_rating"] = score
        return summary

    return [summarise(candidate) for candidate in ranking_books]

In [19]:
def run_sample_on_models(users_sample, models, random_pool=None):
    """
    Run a sample of users through multiple models and collect the outputs.

    For each user one candidate list and one prompt are built, and the same
    prompt is sent to every model so the models are compared on identical
    input.

    Args:
        users_sample: list of user dicts (with "user_id", "preference_books"
            and "high_rated_books")
        models: list of model names (strings)
        random_pool: optional list of books to sample random candidates from.
            When omitted, the notebook-level variable `random_book_pool` is
            used, which must then already be defined.
    Returns:
        results: list of dicts, one per (user, model) pair, with the keys
            "user_id", "model", "prompt", "candidate books" and "llm_output".
            If a model call fails, "llm_output" holds the text "Error: <exc>"
            instead of a ranking.
    """
    results = []

    for user in users_sample:
        # Resolved per user so an empty sample never touches the global.
        pool = random_pool if random_pool is not None else random_book_pool

        preference_books = user.get("preference_books", [])
        ranking_books = build_ranking_candidates(user, pool)
        prompt = build_ranking_prompt_titles(preference_books, ranking_books)
        candidate_books = format_candidate_books(ranking_books, user)

        for model in models:
            try:
                output = run_ollama(prompt, model)
            except Exception as e:
                # Best effort: one failing model must not abort the whole run.
                output = f"Error: {e}"

            results.append({
                "user_id": user["user_id"],
                "model": model,
                "prompt": prompt,
                "candidate books": candidate_books,
                "llm_output": output
            })

    return results

In [None]:
# Seed the RNG so candidate sampling and prompt shuffling are repeatable
# across runs (the LLM outputs themselves may still vary).
random.seed(42)

# run_sample_on_models reads `random_book_pool`; build it here so the notebook
# works after Restart & Run All instead of relying on leftover kernel state.
random_book_pool = build_random_book_pool(metadata)

users_sample = users
models = ['llama3.2:1b', 'mistral:7b', 'gemma3:12b'] #'gpt-oss:20b'

results = run_sample_on_models(
    users_sample,
    models
)

In [None]:
def remove_preference_books_from_output(output, preference_titles):
    """
    Drops every line of `output` that mentions one of the preference titles.

    Matching is a case-sensitive substring test. Empty or None titles are
    ignored: an empty string is a substring of every line and would otherwise
    delete the whole output, and None cannot be searched for in a string.

    Args:
        output: Raw LLM output text.
        preference_titles: Iterable of title strings to filter out.

    Returns:
        The remaining lines joined with newlines.
    """
    titles = [title for title in preference_titles if title]
    lines = output.splitlines()
    return "\n".join(
        line for line in lines
        if not any(title in line for title in titles)
    )

# NOTE: remove_preference_books_from_output is currently not called anywhere in this notebook.

In [21]:
import json

# Write all collected results to one JSON file. ensure_ascii=False keeps
# non-ASCII characters as-is rather than \u-escaping them; indent=2 makes the
# file human-readable.
with open("llm_ranking_without_descriptions_all_users_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)