# Movie Recommendation System - Data Exploration

This notebook explores the MovieLens dataset and provides insights into movie ratings, user behavior, and content characteristics.

## 1. Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Silences every warning category for the whole session, including
# deprecation notices raised by pandas / seaborn / matplotlib.
warnings.filterwarnings('ignore')

# Set style
# NOTE(review): the 'seaborn-v0_8' style name is only registered in newer
# matplotlib releases — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Import our custom modules
import sys
# Adds the parent directory (relative to the current working directory) to the
# import path so that the `src` package can be resolved.
sys.path.append('../')
from src.data_loader import MovieDataLoader
from src.utils import extract_year_from_title, parse_genres, get_unique_genres

In [None]:
# Instantiate the project loader and fetch the three tables it returns.
data_loader = MovieDataLoader()
movies_df, ratings_df, links_df = data_loader.load_data()

# Headline sizes, reported in a fixed order with thousands separators.
overview = (
    ("Movies", len(movies_df)),
    ("Ratings", len(ratings_df)),
    ("Users", ratings_df['userId'].nunique()),
    ("Links", len(links_df)),
)
print("Dataset Overview:")
for label, total in overview:
    print(f"{label}: {total:,}")

## 2. Dataset Overview

In [None]:
# Preview the first rows of each table. Every caption after the first starts
# with a newline so consecutive previews are separated by a blank line.
previews = (
    ("Movies Dataset Sample:", movies_df),
    ("\nRatings Dataset Sample:", ratings_df),
    ("\nLinks Dataset Sample:", links_df),
)
for caption, frame in previews:
    print(caption)
    display(frame.head())

In [None]:
# Dataset info
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line
# after each report.
print("Movies Dataset Info:")
movies_df.info()
print(f"\nMissing values: {movies_df.isnull().sum().sum()}")

print("\nRatings Dataset Info:")
ratings_df.info()
print(f"\nMissing values: {ratings_df.isnull().sum().sum()}")

## 3. Rating Distribution Analysis

In [None]:
# Rating distribution: bar chart of counts (left) and summary statistics (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Rating counts, ordered by rating value
rating_counts = ratings_df['rating'].value_counts().sort_index()

# Size the bars from the smallest gap between observed rating values.
# Matplotlib's default width (0.8) is wider than a 0.5 step between ratings,
# which makes neighbouring bars overlap; with unit steps this yields 0.8 again.
rating_values = rating_counts.index.to_numpy(dtype=float)
min_gap = np.diff(rating_values).min() if len(rating_values) > 1 else 1.0
axes[0].bar(rating_values, rating_counts.values, width=0.8 * min_gap, color='skyblue')
axes[0].set_title('Distribution of Ratings')
axes[0].set_xlabel('Rating')
axes[0].set_ylabel('Count')
axes[0].grid(axis='y', alpha=0.3)

# Rating statistics, rendered as text lines in the second panel.
# Each entry: (vertical position in axes coordinates, label, key in describe()).
rating_stats = ratings_df['rating'].describe()
stat_lines = (
    (0.9, "Mean Rating", 'mean'),
    (0.8, "Median Rating", '50%'),
    (0.7, "Std Deviation", 'std'),
    (0.6, "Min Rating", 'min'),
    (0.5, "Max Rating", 'max'),
)
for y_text, label, key in stat_lines:
    axes[1].text(0.1, y_text, f"{label}: {rating_stats[key]:.2f}",
                 transform=axes[1].transAxes, fontsize=12)
axes[1].set_title('Rating Statistics')
axes[1].set_xticks([])
axes[1].set_yticks([])

plt.tight_layout()
plt.show()

# Share of each rating value among all ratings.
total_ratings = len(ratings_df)
print("Rating Distribution:")
for rating, count in rating_counts.items():
    percentage = (count / total_ratings) * 100
    print(f"Rating {rating}: {count:,} ({percentage:.1f}%)")

## 4. Genre Analysis

In [None]:
from collections import Counter

# Extract all unique genres
all_genres = get_unique_genres(movies_df)
print(f"Total unique genres: {len(all_genres)}")
print(f"Genres: {', '.join(all_genres)}")

# Count movies per genre. Iterating the 'genres' column directly avoids
# building a full row Series per movie, which DataFrame.iterrows() does.
# Counter is a dict subclass, so genre_counts can still be used as a dict.
genre_counts = Counter(
    genre
    for genres_field in movies_df['genres']
    for genre in parse_genres(genres_field)
)

# Create DataFrame for visualization (most frequent genre first)
genre_df = pd.DataFrame(list(genre_counts.items()), columns=['Genre', 'Count'])
genre_df = genre_df.sort_values('Count', ascending=False)

# Plot genre distribution
plt.figure(figsize=(12, 8))
sns.barplot(data=genre_df, x='Count', y='Genre', palette='viridis')
plt.title('Number of Movies by Genre')
plt.xlabel('Number of Movies')
plt.ylabel('Genre')
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

print("\nTop 10 Genres by Movie Count:")
for rank, row in enumerate(genre_df.head(10).itertuples(index=False), 1):
    print(f"{rank:2d}. {row.Genre}: {row.Count} movies")

In [None]:
from collections import defaultdict

# Average rating by genre

# Merge ratings with movies to get genres
ratings_with_movies = ratings_df.merge(movies_df, on='movieId')

# Collect every rating under each genre of the rated movie. Zipping the two
# needed columns avoids DataFrame.iterrows(), which builds a full Series for
# every rating row and dominates the runtime of this cell.
genre_ratings = defaultdict(list)
for genres_field, rating in zip(ratings_with_movies['genres'], ratings_with_movies['rating']):
    for genre in parse_genres(genres_field):
        genre_ratings[genre].append(rating)

# Calculate per-genre statistics
genre_avg_ratings = {
    genre: {
        'avg_rating': np.mean(ratings),
        'rating_count': len(ratings),
        'std_rating': np.std(ratings),  # NumPy default: population std (ddof=0)
    }
    for genre, ratings in genre_ratings.items()
}

# Create DataFrame (one row per genre, best-rated genre first)
genre_stats_df = pd.DataFrame.from_dict(genre_avg_ratings, orient='index')
genre_stats_df = genre_stats_df.sort_values('avg_rating', ascending=False)

# Plot average ratings by genre. The index is given an explicit name so the
# plot does not depend on the implicit 'index' column that reset_index() creates.
genre_plot_df = genre_stats_df.rename_axis('Genre').reset_index()
plt.figure(figsize=(12, 8))
sns.barplot(data=genre_plot_df, x='avg_rating', y='Genre', palette='coolwarm')
plt.title('Average Rating by Genre')
plt.xlabel('Average Rating')
plt.ylabel('Genre')
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

print("\nGenres with Highest Average Ratings:")
# itertuples() keeps each column's dtype; iterrows() would upcast the integer
# rating_count to float and print it as e.g. "123.0 ratings".
for rank, row in enumerate(genre_stats_df.head(10).itertuples(), 1):
    print(f"{rank:2d}. {row.Index}: {row.avg_rating:.2f} ({row.rating_count:,} ratings)")

## 5. Movie Release Year Analysis

In [None]:
# Extract years from movie titles
# presumably extract_year_from_title returns a (clean_title, year) pair —
# verify against src.utils; the two resulting columns are assigned by position.
movies_with_years = movies_df.copy()
movies_with_years[['clean_title', 'year']] = movies_with_years['title'].apply(
    lambda x: pd.Series(extract_year_from_title(x))
)

# Filter out movies without years
movies_with_years = movies_with_years[movies_with_years['year'].notna()]

print(f"Movies with year information: {len(movies_with_years)} out of {len(movies_df)}")

if not movies_with_years.empty:
    # Year distribution, in chronological order for the line plot
    year_counts = movies_with_years['year'].value_counts().sort_index()

    plt.figure(figsize=(15, 6))
    plt.plot(year_counts.index, year_counts.values, marker='o', linewidth=2, markersize=4)
    plt.title('Number of Movies by Release Year')
    plt.xlabel('Release Year')
    plt.ylabel('Number of Movies')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    print(f"\nYear Range: {movies_with_years['year'].min():.0f} - {movies_with_years['year'].max():.0f}")
    print("Most productive years:")
    # Rank by movie count. year_counts is ordered by year, so tail(5) would
    # list the five most recent years rather than the five busiest ones.
    for year, count in year_counts.nlargest(5).items():
        print(f"  {year:.0f}: {count} movies")
else:
    print("No year information available in the dataset.")

## 6. User Behavior Analysis

In [None]:
# Per-user rating statistics: number of ratings, mean rating, and spread.
user_stats = (
    ratings_df.groupby('userId')['rating']
    .agg(['count', 'mean', 'std'])
    .rename(columns={'count': 'rating_count', 'mean': 'avg_rating', 'std': 'rating_std'})
    .fillna(0)  # undefined statistics (e.g. std of a single rating) become 0
)
per_user_counts = user_stats['rating_count']
per_user_means = user_stats['avg_rating']

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_counts, ax_means), (ax_scatter, ax_summary) = axes

# Top row: histograms of ratings-per-user and of average rating per user.
# Each entry: (axes, data, title, x label, extra hist keyword arguments).
hist_panels = (
    (ax_counts, per_user_counts, 'Distribution of Ratings per User', 'Number of Ratings', {}),
    (ax_means, per_user_means, 'Distribution of Average Ratings per User', 'Average Rating',
     {'color': 'orange'}),
)
for ax, values, title, xlabel, extra in hist_panels:
    ax.hist(values, bins=30, edgecolor='black', alpha=0.7, **extra)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Number of Users')
    ax.grid(axis='y', alpha=0.3)

# Bottom left: does rating activity relate to how generously a user rates?
ax_scatter.scatter(per_user_counts, per_user_means, alpha=0.6)
ax_scatter.set_title('Rating Count vs Average Rating')
ax_scatter.set_xlabel('Number of Ratings')
ax_scatter.set_ylabel('Average Rating')
ax_scatter.grid(True, alpha=0.3)

# Bottom right: headline numbers rendered as text lines.
activity_stats = {
    'Total Users': len(user_stats),
    'Avg Ratings per User': per_user_counts.mean(),
    'Most Active User': per_user_counts.max(),
    'Least Active User': per_user_counts.min(),
    'Avg User Rating': per_user_means.mean()
}

y_pos = 0.9
for key, value in activity_stats.items():
    # Floats get two decimals; integer counts are shown as-is.
    shown = f"{value:.2f}" if isinstance(value, float) else f"{value}"
    ax_summary.text(0.1, y_pos, f"{key}: {shown}", transform=ax_summary.transAxes, fontsize=12)
    y_pos -= 0.15

ax_summary.set_title('User Activity Statistics')
ax_summary.set_xticks([])
ax_summary.set_yticks([])

plt.tight_layout()
plt.show()

print("\nUser Behavior Summary:")
print(f"Total users: {len(user_stats):,}")
print(f"Average ratings per user: {per_user_counts.mean():.1f}")
print(f"Median ratings per user: {per_user_counts.median():.1f}")
print(f"Most active user: {per_user_counts.max()} ratings")
for threshold in (10, 20):
    print(f"Users with {threshold}+ ratings: {(per_user_counts >= threshold).sum():,}")

## 7. Movie Popularity Analysis

In [None]:
# Per-movie rating statistics: number of ratings, mean rating, and spread.
movie_stats = (
    ratings_df.groupby('movieId')['rating']
    .agg(['count', 'mean', 'std'])
    .rename(columns={'count': 'rating_count', 'mean': 'avg_rating', 'std': 'rating_std'})
    .fillna(0)  # undefined statistics (e.g. std of a single rating) become 0
)
per_movie_counts = movie_stats['rating_count']
per_movie_means = movie_stats['avg_rating']

# Attach title and genres; the result stays indexed by movieId.
title_columns = movies_df[['movieId', 'title', 'genres']]
movie_stats_with_titles = movie_stats.merge(title_columns, left_index=True, right_on='movieId')
movie_stats_with_titles = movie_stats_with_titles.set_index('movieId')

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_counts, ax_means), (ax_scatter, ax_top) = axes

# Top row: histograms. Each entry:
# (axes, data, title, x label, extra hist keyword arguments, log-scaled y axis?)
hist_panels = (
    (ax_counts, per_movie_counts, 'Distribution of Ratings per Movie', 'Number of Ratings',
     {}, True),  # log scale because of the long tail
    (ax_means, per_movie_means, 'Distribution of Average Ratings per Movie', 'Average Rating',
     {'color': 'green'}, False),
)
for ax, values, title, xlabel, extra, log_y in hist_panels:
    ax.hist(values, bins=30, edgecolor='black', alpha=0.7, **extra)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Number of Movies')
    if log_y:
        ax.set_yscale('log')
    ax.grid(axis='y', alpha=0.3)

# Bottom left: popularity against average rating.
ax_scatter.scatter(per_movie_counts, per_movie_means, alpha=0.6)
ax_scatter.set_title('Rating Count vs Average Rating (Movies)')
ax_scatter.set_xlabel('Number of Ratings')
ax_scatter.set_ylabel('Average Rating')
ax_scatter.grid(True, alpha=0.3)

# Bottom right: the ten movies with the most ratings, titles cut to 30 characters.
most_popular = movie_stats_with_titles.nlargest(10, 'rating_count')
bar_positions = range(len(most_popular))
short_titles = [
    f"{title[:30]}..." if len(title) > 30 else title
    for title in most_popular['title']
]
ax_top.barh(bar_positions, most_popular['rating_count'])
ax_top.set_yticks(bar_positions)
ax_top.set_yticklabels(short_titles, fontsize=10)
ax_top.set_title('Top 10 Most Rated Movies')
ax_top.set_xlabel('Number of Ratings')

plt.tight_layout()
plt.show()

print("\nMovie Popularity Summary:")
print(f"Total movies: {len(movie_stats):,}")
print(f"Average ratings per movie: {per_movie_counts.mean():.1f}")
print(f"Median ratings per movie: {per_movie_counts.median():.1f}")
print(f"Most rated movie: {per_movie_counts.max()} ratings")
print(f"Movies with 10+ ratings: {(per_movie_counts >= 10).sum():,}")
print(f"Movies with only 1 rating: {(per_movie_counts == 1).sum():,}")

print("\nTop 5 Most Rated Movies:")
for rank, (_, entry) in enumerate(most_popular.head().iterrows(), 1):
    print(f"{rank}. {entry['title']} - {entry['rating_count']} ratings (avg: {entry['avg_rating']:.2f})")

## 8. User-Item Matrix Sparsity Analysis

In [None]:
# Calculate sparsity of the user-item matrix implied by ratings_df
n_users = ratings_df['userId'].nunique()
n_movies = ratings_df['movieId'].nunique()
n_ratings = len(ratings_df)

total_possible_ratings = n_users * n_movies
sparsity = 1 - (n_ratings / total_possible_ratings)

print("User-Item Matrix Sparsity Analysis:")
print(f"Number of users: {n_users:,}")
print(f"Number of movies: {n_movies:,}")
print(f"Number of ratings: {n_ratings:,}")
print(f"Total possible ratings: {total_possible_ratings:,}")
print(f"Matrix sparsity: {sparsity:.2%}")
print(f"Matrix density: {1-sparsity:.2%}")

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Sparsity visualization
labels = ['Rated', 'Not Rated']
sizes = [n_ratings, total_possible_ratings - n_ratings]
colors = ['lightcoral', 'lightblue']

axes[0].pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
axes[0].set_title('User-Item Matrix Density')

# Sample of user-item interactions (for visualization): the first 20 distinct
# users and movies in order of appearance, then sorted by ID for display.
sample_users = np.sort(ratings_df['userId'].unique()[:20])
sample_movies = np.sort(ratings_df['movieId'].unique()[:20])

sample_ratings = ratings_df[
    ratings_df['userId'].isin(sample_users)
    & ratings_df['movieId'].isin(sample_movies)
]

# pivot_table (mean) tolerates repeated user-movie pairs, on which pivot()
# raises. reindex() restores sampled users/movies that have no rating inside
# the sample, so the matrix really spans every sampled user and movie.
sample_matrix = (
    sample_ratings
    .pivot_table(index='userId', columns='movieId', values='rating', aggfunc='mean')
    .reindex(index=sample_users, columns=sample_movies)
)

# Fill NaN with 0 for visualization (0 means "not rated" in the heatmap)
sample_matrix_filled = sample_matrix.fillna(0)
n_sample_rows, n_sample_cols = sample_matrix_filled.shape

im = axes[1].imshow(sample_matrix_filled.to_numpy(), cmap='YlOrRd', aspect='auto')
axes[1].set_title(f'Sample User-Movie Rating Matrix ({n_sample_rows}x{n_sample_cols})')
axes[1].set_xlabel('Movie ID (sample)')
axes[1].set_ylabel('User ID (sample)')
# Label ticks with the actual IDs; imshow alone only shows positional indices.
axes[1].set_xticks(range(n_sample_cols))
axes[1].set_xticklabels(sample_matrix_filled.columns, rotation=90, fontsize=8)
axes[1].set_yticks(range(n_sample_rows))
axes[1].set_yticklabels(sample_matrix_filled.index, fontsize=8)
plt.colorbar(im, ax=axes[1], label='Rating')

plt.tight_layout()
plt.show()

## 9. Data Quality Assessment

In [None]:
print("Data Quality Assessment:")
print("=" * 50)

# Missing values (each total is computed once and reused in the summary)
movies_missing = movies_df.isnull().sum().sum()
ratings_missing = ratings_df.isnull().sum().sum()
links_missing = links_df.isnull().sum().sum()
print("\n1. Missing Values:")
print(f"Movies dataset: {movies_missing} missing values")
print(f"Ratings dataset: {ratings_missing} missing values")
print(f"Links dataset: {links_missing} missing values")

# Duplicate ratings: the same user rating the same movie more than once
print("\n2. Duplicate Ratings:")
duplicate_ratings = ratings_df.duplicated(subset=['userId', 'movieId']).sum()
print(f"Duplicate user-movie pairs: {duplicate_ratings}")

# Rating range validation
print("\n3. Rating Range Validation:")
invalid_ratings = ratings_df[(ratings_df['rating'] < 0.5) | (ratings_df['rating'] > 5.0)]
print(f"Invalid ratings (not in 0.5-5.0 range): {len(invalid_ratings)}")

# Movies without ratings
print("\n4. Movies without Ratings:")
movies_with_ratings = set(ratings_df['movieId'])
all_movies = set(movies_df['movieId'])
movies_without_ratings = all_movies - movies_with_ratings
print(f"Movies without any ratings: {len(movies_without_ratings)}")

# Users with single rating
print("\n5. Single-Rating Users:")
user_rating_counts = ratings_df['userId'].value_counts()
single_rating_users = (user_rating_counts == 1).sum()
print(f"Users with only one rating: {single_rating_users}")

# Genre information. Besides null/empty values, MovieLens marks movies that
# lack genre data with the literal placeholder "(no genres listed)".
print("\n6. Genre Information:")
genre_field = movies_df['genres']
movies_without_genres = movies_df[
    genre_field.isnull()
    | (genre_field == '')
    | (genre_field == '(no genres listed)')
]
print(f"Movies without genre information: {len(movies_without_genres)}")

# Timestamp validation (timestamps are interpreted as seconds since the epoch)
print("\n7. Timestamp Information:")
rating_times = pd.to_datetime(ratings_df['timestamp'], unit='s')
print(f"Earliest timestamp: {rating_times.min()}")
print(f"Latest timestamp: {rating_times.max()}")

# Summary
print("\n" + "=" * 50)
print("SUMMARY:")
# NOTE(review): missing values in links_df are reported above but are not
# counted as issues here — confirm this exclusion is intended.
total_issues = (
    movies_missing +
    ratings_missing +
    duplicate_ratings +
    len(invalid_ratings) +
    len(movies_without_genres)
)
print(f"Total data quality issues found: {total_issues}")

if total_issues == 0:
    print("✅ Dataset appears to be clean and ready for analysis!")
else:
    print("⚠️  Some data quality issues detected. Consider preprocessing.")

## 10. Key Insights and Recommendations

In [None]:
print("KEY INSIGHTS FROM DATA EXPLORATION:")
print("=" * 60)

# Headline metrics. `sparsity`, `n_users`, `user_stats`, `movie_stats`,
# `all_genres`, `genre_df` and `movies_with_years` come from earlier cells.
all_ratings = ratings_df['rating']
avg_rating = all_ratings.mean()
rating_std = all_ratings.std()
most_common_rating = all_ratings.mode()[0]
sparsity_percent = sparsity * 100

# Each section is emitted with a single print call, one argument per line.
print(
    "\n📊 DATASET CHARACTERISTICS:",
    f"   • Dataset contains {len(movies_df):,} movies and {len(ratings_df):,} ratings",
    f"   • {n_users:,} unique users with average of {user_stats['rating_count'].mean():.1f} ratings each",
    f"   • User-item matrix is {sparsity_percent:.1f}% sparse",
    sep="\n",
)

print(
    "\n⭐ RATING PATTERNS:",
    f"   • Average rating: {avg_rating:.2f} (std: {rating_std:.2f})",
    f"   • Most common rating: {most_common_rating}",
    "   • Rating distribution skews towards higher ratings",
    sep="\n",
)

top_genre = genre_df.iloc[0]  # genre_df is sorted by movie count, largest first
print(
    "\n🎬 MOVIE CHARACTERISTICS:",
    f"   • {len(all_genres)} unique genres identified",
    f"   • Most popular genre: {top_genre['Genre']} ({top_genre['Count']} movies)",
    f"   • Average movie receives {movie_stats['rating_count'].mean():.1f} ratings",
    sep="\n",
)

# The year range is only reported when at least one title carried a year.
if not movies_with_years.empty:
    release_years = movies_with_years['year']
    print(f"   • Movie years range from {release_years.min():.0f} to {release_years.max():.0f}")

print(
    "\n🔍 RECOMMENDATION SYSTEM IMPLICATIONS:",
    "   • High sparsity suggests collaborative filtering may be challenging",
    "   • Content-based filtering viable with rich genre information",
    "   • Hybrid approach recommended to handle cold start problems",
    "   • Popular movies could serve as good defaults for new users",
    sep="\n",
)

print(
    "\n💡 PREPROCESSING RECOMMENDATIONS:",
    "   • Consider filtering movies with < 10 ratings for collaborative filtering",
    "   • Users with < 5 ratings may need content-based or popularity-based recommendations",
    "   • Genre information is rich and suitable for content-based filtering",
    "   • High rating bias (towards positive) should be considered in evaluation",
    sep="\n",
)

print("\n" + "=" * 60)
print("✅ Data exploration complete! Ready for model development.")