# 01: Data Exploration

This notebook explores the MovieLens 25M dataset to understand:
- Data distributions (users, items, ratings)
- Sparsity of the user-item matrix
- Rating patterns
- Genre distribution
- Data quality issues

In [None]:
# Imports
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_loader import DataLoader, get_dataset_statistics
from src.data_loader import parse_genres, extract_all_genres

# Plot defaults applied to every figure in this notebook:
# seaborn's white grid theme and a 12x6 inch default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Data

In [None]:
# Load everything through the project DataLoader, then pull the three
# entries used by the rest of the notebook out of the returned mapping.
loader = DataLoader()
data = loader.load_all()

ratings, metadata, links = (
    data[key] for key in ('ratings', 'metadata', 'links')
)

## 2. Basic Statistics

In [None]:
# Compute summary statistics once and report the rating-level figures.
stats = get_dataset_statistics(data)
rating_stats = stats['ratings']  # all values printed below live under this key
banner = "=" * 50

print(banner)
print("DATASET STATISTICS")
print(banner)
print(f"\nTotal Ratings: {rating_stats['total_ratings']:,}")
print(f"Unique Users: {rating_stats['unique_users']:,}")
print(f"Unique Movies: {rating_stats['unique_movies']:,}")
print(f"\nRating Mean: {rating_stats['rating_mean']:.2f}")
print(f"Rating Std: {rating_stats['rating_std']:.2f}")
print(f"Matrix Sparsity: {rating_stats['sparsity']:.4f}")

## 3. Rating Distribution

In [None]:
# Plot rating distribution: raw counts (left) and normalised density (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Rating count distribution (log y-axis so rarely used ratings stay visible)
ratings['rating'].value_counts().sort_index().plot(kind='bar', ax=axes[0])
axes[0].set_xlabel('Rating')
axes[0].set_ylabel('Count')
axes[0].set_title('Rating Distribution')
axes[0].set_yscale('log')

# Rating density
# Use one bin per rating step, centred on the rating values, instead of 20
# arbitrary equal-width bins that leave artificial gaps between bars.
# NOTE(review): assumes ratings move in half-star steps (MovieLens scale) — confirm.
RATING_STEP = 0.5
rating_bins = np.arange(
    ratings['rating'].min() - RATING_STEP / 2,
    ratings['rating'].max() + RATING_STEP,  # guarantees a final edge >= max rating
    RATING_STEP,
)
# density=True normalises the bars so the axis really shows a density,
# matching the 'Density' label and title (the original plotted raw counts).
ratings['rating'].plot(
    kind='hist',
    bins=rating_bins,
    density=True,
    edgecolor='black',
    ax=axes[1],
)
axes[1].set_xlabel('Rating')
axes[1].set_ylabel('Density')
axes[1].set_title('Rating Density')

plt.tight_layout()
plt.show()

## 4. User Activity Distribution

In [None]:
# User interaction counts: number of ratings submitted by each userId
user_counts = ratings['userId'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Distribution of user activity
# The x-axis is logarithmic, so the bin edges must be log-spaced too.
# Linear-width bins on a log axis render with wildly unequal visual widths
# and squeeze almost every user into the first bar or two.
# value_counts() never yields 0, so geomspace always gets positive endpoints.
user_bins = np.geomspace(user_counts.min(), user_counts.max(), 51)  # 50 bins
user_counts.plot(kind='hist', bins=user_bins, edgecolor='black', ax=axes[0])
axes[0].set_xlabel('Number of Ratings per User')
axes[0].set_ylabel('Number of Users')
axes[0].set_title('User Activity Distribution')
axes[0].set_xscale('log')

# CDF: fraction of users with at most x ratings
sorted_counts = np.sort(user_counts)
cdf = np.arange(1, len(sorted_counts) + 1) / len(sorted_counts)
axes[1].plot(sorted_counts, cdf)
axes[1].set_xlabel('Number of Ratings per User')
axes[1].set_ylabel('CDF')
axes[1].set_title('CDF of User Activity')
axes[1].set_xscale('log')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nUser Activity Statistics:")
print(f"  Mean: {user_counts.mean():.1f}")
print(f"  Median: {user_counts.median():.1f}")
print(f"  Min: {user_counts.min()}")
print(f"  Max: {user_counts.max()}")

## 5. Item Popularity Distribution

In [None]:
# Item popularity counts: number of ratings received by each movieId
item_counts = ratings['movieId'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Distribution of item popularity
# The x-axis is logarithmic, so the bin edges must be log-spaced too.
# Linear-width bins on a log axis render with wildly unequal visual widths
# and squeeze almost every movie into the first bar or two.
# value_counts() never yields 0, so geomspace always gets positive endpoints.
item_bins = np.geomspace(item_counts.min(), item_counts.max(), 51)  # 50 bins
item_counts.plot(kind='hist', bins=item_bins, edgecolor='black', ax=axes[0])
axes[0].set_xlabel('Number of Ratings per Movie')
axes[0].set_ylabel('Number of Movies')
axes[0].set_title('Movie Popularity Distribution')
axes[0].set_xscale('log')

# CDF: fraction of movies with at most x ratings
sorted_counts = np.sort(item_counts)
cdf = np.arange(1, len(sorted_counts) + 1) / len(sorted_counts)
axes[1].plot(sorted_counts, cdf)
axes[1].set_xlabel('Number of Ratings per Movie')
axes[1].set_ylabel('CDF')
axes[1].set_title('CDF of Movie Popularity')
axes[1].set_xscale('log')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nMovie Popularity Statistics:")
print(f"  Mean: {item_counts.mean():.1f}")
print(f"  Median: {item_counts.median():.1f}")
print(f"  Min: {item_counts.min()}")
print(f"  Max: {item_counts.max()}")

## 6. Temporal Analysis

In [None]:
# Interpret 'timestamp' as seconds since the Unix epoch, then derive the
# calendar year and month columns from the resulting datetimes.
ratings['datetime'] = pd.to_datetime(ratings['timestamp'], unit='s')
for calendar_field in ('year', 'month'):
    ratings[calendar_field] = getattr(ratings['datetime'].dt, calendar_field)

# Rating volume per calendar year, and per month-of-year pooled over all years
yearly_counts = ratings.groupby('year').size()
monthly_counts = ratings.groupby('month').size()

# Two side-by-side bar charts: yearly volume (left), monthly volume (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
year_ax, month_ax = axes

yearly_counts.plot.bar(ax=year_ax)
year_ax.set(
    xlabel='Year',
    ylabel='Number of Ratings',
    title='Ratings per Year',
)
year_ax.tick_params(axis='x', rotation=45)

monthly_counts.plot.bar(ax=month_ax)
month_ax.set(
    xlabel='Month',
    ylabel='Number of Ratings',
    title='Ratings per Month',
)

plt.tight_layout()
plt.show()

## 7. Genre Analysis

In [None]:
# Extract genres
all_genres = extract_all_genres(metadata)
print(f"\nTotal Genres: {len(all_genres)}")
print(f"Genres: {', '.join(all_genres)}")

# Count movies per genre
# explode() emits one row per (movie, genre) pair and value_counts() tallies
# them, replacing a nested Python loop over every movie. Movies whose genre
# list is empty become NaN and are dropped by value_counts().
# NOTE(review): assumes parse_genres returns a list-like of genre names per
# movie — confirm against src.data_loader.
metadata['genres_list'] = metadata['genres'].apply(parse_genres)
genre_series = metadata['genres_list'].explode().value_counts()
genre_counts = genre_series.to_dict()  # kept as a plain dict: genre -> movie count

# Plot genre distribution (ascending sort puts the largest genre at the top)
fig, ax = plt.subplots(figsize=(12, 6))
genre_series.sort_values().plot(kind='barh', ax=ax)
ax.set_xlabel('Number of Movies')
ax.set_ylabel('Genre')
ax.set_title('Movies per Genre')
plt.tight_layout()
plt.show()

## Summary

Key findings:
- Dataset has X ratings from Y users and Z movies
- Matrix sparsity: XX%
- Rating distribution: skewed towards higher ratings
- User activity follows long-tail distribution
- Movie popularity follows long-tail distribution
- Time range: YYYY to YYYY