# MovieLens-100K Baseline

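This notebook loads the MovieLens-100K dataset, performs quick exploratory data analysis (EDA), makes a basic 80/20 train/test split, and computes a simple top-N popularity baseline on the training split.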

In [None]:
# Google Colab setup: clone repo and install dependencies
# Copy/paste this cell in a fresh Colab runtime.
try:
    import google.colab  # type: ignore
    IN_COLAB = True
except Exception:
    IN_COLAB = False

repo_url = 'https://github.com/allyoushawn/recsys_playground.git'
repo_dir = 'recsys_playground'

if IN_COLAB:
    import os, sys
    if not os.path.exists(repo_dir):
        !git clone {repo_url}
    %cd {repo_dir}
    !pip -q install -r movie_lens_100k/requirements.txt
    src_path = os.path.abspath('movie_lens_100k/src')
    if src_path not in sys.path:
        sys.path.insert(0, src_path)

In [None]:
# If running in Colab, you may need to install dependencies:
# !pip install pandas numpy scikit-learn
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os, sys
# Ensure we can import from src/ when running the notebook directly
repo_root = os.path.abspath(os.path.join(os.getcwd()))
src_path = os.path.join(repo_root, 'movie_lens_100k', 'src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)
from data.movielens import load_movielens_100k


In [None]:
# Load dataset (downloads on first run)
# `load_movielens_100k` is the project helper imported from data.movielens.
# Later cells use the result as a DataFrame with at least 'rating' and
# 'movie_id' columns.
df = load_movielens_100k()
# Preview the first rows as a rich table.
df.head()

In [None]:
# Quick overview: dimensions, column names, and summary statistics of the ratings
rating_stats = df['rating'].describe()
column_names = list(df.columns)
print('Shape:', df.shape)
print('Columns:', column_names)
# sep='\n' puts the statistics table on the line after the heading.
print('Rating stats:', rating_stats, sep='\n')


In [None]:
# Ratings histogram
# Use unit-width bins centred on each rating value so every bar sits on its
# tick. `bins=5` would cut [min, max] into five equal slices, which leaves the
# bars shifted away from the rating values on the x-axis.
# NOTE(review): assumes ratings are whole numbers (MovieLens-100K uses 1-5) -- confirm.
rating_min = int(df['rating'].min())
rating_max = int(df['rating'].max())
bin_edges = np.arange(rating_min - 0.5, rating_max + 1.5, 1.0)

fig, ax = plt.subplots()
df['rating'].plot(
    kind='hist',
    bins=bin_edges,
    edgecolor='black',
    title='Ratings Histogram',
    ax=ax,
)
ax.set_xlabel('Rating')
ax.set_xticks(range(rating_min, rating_max + 1))
plt.show()


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Report the size and mean rating of each split, one line per entry.
split_summary = (
    ('Train shape:', train_df.shape),
    ('Test shape:', test_df.shape),
    ('Train rating mean:', train_df['rating'].mean()),
    ('Test rating mean:', test_df['rating'].mean()),
)
for label, value in split_summary:
    print(label, value)


In [None]:
# Popularity baseline: Top-N by interaction count
# Only the training split is passed in, so the ranking is computed without the held-out rows.
from models.popularity import get_top_n
# NOTE(review): presumably returns the ids of the 10 most-rated movies; the
# next cell uses the result as labels into per-movie counts. Confirm against
# models/popularity.
top10 = get_top_n(train_df, n=10)
# Display the result.
top10

In [None]:
# Inspect counts for the top-N movies
# value_counts() tallies how many training rows each movie_id has.
counts = train_df['movie_id'].value_counts()
# Label-based lookup of the tallies for the entries of top10.
# NOTE(review): assumes top10 holds movie_id values present in counts -- confirm.
counts.loc[top10]