In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path

# Filesystem layout for the notebook. PROJECT_ROOT is the absolute path of the
# parent of the current working directory; change it if the notebook is
# launched from somewhere else.
PROJECT_ROOT = Path("..").resolve()
DATA_DIR = PROJECT_ROOT.joinpath("data")

# Input files read by the loading cell.
ratings_path = DATA_DIR.joinpath("ratings.csv")
items_path = DATA_DIR.joinpath("items.csv")


In [None]:
# Load the two raw tables from the CSV paths configured in the setup cell.
ratings = pd.read_csv(ratings_path)
items = pd.read_csv(items_path)

# Show each preview as its own rendered table. Ending the cell with a bare
# tuple of frames would fall back to the tuple's plain-text repr instead.
display(ratings.head(), items.head())


In [None]:
# Structural overview of both tables: dtypes / non-null counts via info(),
# then summary statistics for ratings and a preview of items.
print("Ratings info:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() would only append a stray "None" to the cell output.
ratings.info()
display(ratings.describe())

print("\nItems info:")
items.info()
display(items.head())


In [None]:
# Histogram of the "rating" column, drawn on an explicitly created Axes so the
# labels are attached to that Axes rather than to pyplot's implicit state.
fig, ax = plt.subplots()
ratings["rating"].hist(bins=10, ax=ax)
ax.set_xlabel("Rating")
ax.set_ylabel("Count")
ax.set_title("Rating Distribution")
plt.show()


In [None]:
# Activity counts: number of rating rows per user_id and per item_id.
# value_counts() orders each result from most to least frequent.
ratings_per_user, ratings_per_item = (
    ratings[key].value_counts() for key in ("user_id", "item_id")
)

# Print a caption followed by the five most active entries of each series.
for caption, counts in (
    ("Ratings per user (head):", ratings_per_user),
    ("Ratings per item (head):", ratings_per_item),
):
    print(caption)
    display(counts.head())


In [None]:
# Ten most-rated items: turn the head of the per-item counts into a two-column
# frame, then attach the item metadata.
top_counts = ratings_per_item.head(10).reset_index()
top_counts.columns = ["item_id", "rating_count"]

# Left join keeps every one of the top items even if `items` has no row for it.
top_items = pd.merge(top_counts, items, how="left", on="item_id")

top_items.loc[:, ["item_id", "title", "rating_count"]]


In [None]:
# Dense user x item table. pivot_table aggregates with the mean by default, so
# a (user, item) pair that occurs more than once ends up in a single cell.
user_item_matrix = ratings.pivot_table(
    index="user_id",
    columns="item_id",
    values="rating"
)

num_users, num_items = user_item_matrix.shape
num_possible = num_users * num_items
# Count the filled cells of the matrix rather than the rows of `ratings`:
# duplicate (user, item) rows collapse into one cell and rows with a missing
# rating fill none, so the raw row count can overstate the coverage (and could
# even push the sparsity below zero).
num_actual = int(user_item_matrix.count().sum())

# Guard against an empty table so the cell does not fail on a division by zero.
sparsity = 1 - (num_actual / num_possible) if num_possible else float("nan")

print(f"Users: {num_users}")
print(f"Items: {num_items}")
print(f"Possible interactions: {num_possible}")
print(f"Actual interactions: {num_actual}")
print(f"Sparsity: {sparsity:.4f} (fraction of missing ratings)")


In [None]:
# Week 1 – EDA Summary

- Total users: …
- Total items: …
- Average rating: …
- Ratings are mostly in the range …
- Data sparsity is about … (very sparse → a good candidate for collaborative filtering (CF) and matrix factorization (MF))
- Top 10 items by rating count are …
