# 🎬 Movie Mood Recommender - Data Exploration

This notebook explores the collected movie and review data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot appearance: apply matplotlib's bundled seaborn-style dark grid first,
# then have seaborn set the 'husl' palette as the colour cycle.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette(palette='husl')

# Data locations, all relative to the notebook's working directory:
#   DATA_DIR       root data folder
#   RAW_DIR        'raw' subfolder of DATA_DIR
#   PROCESSED_DIR  'processed' subfolder of DATA_DIR (read by the loading cells)
DATA_DIR = Path('../data')
RAW_DIR = DATA_DIR.joinpath('raw')
PROCESSED_DIR = DATA_DIR.joinpath('processed')

## 1. Load Data

In [None]:
# Read the processed movies table if it exists on disk.
# When the file is missing, `movies_df` is left undefined and only a hint is printed.
movies_path = PROCESSED_DIR.joinpath('movies.parquet')
if not movies_path.exists():
    print('Movies file not found. Run the data collection pipeline first.')
else:
    movies_df = pd.read_parquet(movies_path)
    print(f'Loaded {len(movies_df)} movies')
    display(movies_df.head())

In [None]:
# Read the processed reviews table if it exists on disk.
# When the file is missing, `reviews_df` is left undefined and only a notice is printed.
reviews_path = PROCESSED_DIR.joinpath('reviews.parquet')
if not reviews_path.exists():
    print('Reviews file not found.')
else:
    reviews_df = pd.read_parquet(reviews_path)
    print(f'Loaded {len(reviews_df)} reviews')
    display(reviews_df.head())

## 2. Movie Analysis

In [None]:
# Genre distribution: count how many movies carry each genre and plot the 15 most common.
# Skipped when the movies table was not loaded or has no 'genres' column.
if 'movies_df' in dir() and 'genres' in movies_df.columns:
    # List columns read back from parquet commonly arrive as numpy arrays rather than
    # Python lists (pyarrow engine), so accept any of the usual list-like containers.
    # Scalars and missing values are ignored.
    is_listlike = movies_df['genres'].map(
        lambda value: isinstance(value, (list, tuple, np.ndarray))
    )
    # explode() yields one row per genre; empty containers become NaN and are dropped.
    all_genres = movies_df.loc[is_listlike, 'genres'].explode().dropna().tolist()

    genre_counts = pd.Series(all_genres, dtype='object').value_counts().head(15)

    if genre_counts.empty:
        # Plotting an empty Series raises, so report the situation instead.
        print('No genre data available to plot.')
    else:
        fig, ax = plt.subplots(figsize=(12, 6))
        # Request exactly one colour per bar (there may be fewer than 15 genres).
        genre_counts.plot(
            kind='barh',
            ax=ax,
            color=sns.color_palette('husl', len(genre_counts)),
        )
        ax.set_xlabel('Number of Movies')
        ax.set_title('Top 15 Genres in Dataset')
        fig.tight_layout()
        plt.show()

In [None]:
# Histogram of TMDB ratings in 20 bins.
# Skipped when the movies table was not loaded or has no 'tmdb_rating' column.
if 'movies_df' in dir() and 'tmdb_rating' in movies_df.columns:
    fig, ax = plt.subplots(figsize=(10, 5))
    movies_df['tmdb_rating'].hist(ax=ax, bins=20, edgecolor='white')
    ax.set_xlabel('TMDB Rating')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of Movie Ratings')
    plt.show()

## 3. Review Analysis

In [None]:
# Review length distribution: histogram of character counts per review.
# Skipped when the reviews table was not loaded, is empty, or has no 'content' column.
if 'reviews_df' in dir() and len(reviews_df) > 0 and 'content' in reviews_df.columns:
    # Upper bound (in characters) of the plotted window; longer reviews are not drawn.
    MAX_PLOT_LENGTH = 5000

    # Character count per review; rows with missing content get NaN.
    reviews_df['content_length'] = reviews_df['content'].str.len()

    fig, ax = plt.subplots(figsize=(10, 5))
    # Bin only over the visible window. Binning over the full range and then cropping
    # with xlim lets a few very long reviews stretch the bins, leaving only a handful
    # of coarse bars inside the 0-5000 view.
    reviews_df['content_length'].hist(
        ax=ax,
        bins=50,
        range=(0, MAX_PLOT_LENGTH),
        edgecolor='white',
    )
    ax.set_xlabel('Review Length (characters)')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of Review Lengths')
    ax.set_xlim(0, MAX_PLOT_LENGTH)
    plt.show()

    # The mean is taken over all reviews, including those beyond the plotted window.
    print(f"Average review length: {reviews_df['content_length'].mean():.0f} characters")

## 4. Sample Reviews

In [None]:
# Display up to three randomly chosen reviews (rating plus the first 500 characters).
# Skipped when the reviews table was not loaded or is empty.
if 'reviews_df' in dir() and len(reviews_df) > 0:
    # sample() raises ValueError when asked for more rows than the frame holds, so cap
    # the request at the frame size. The fixed seed keeps the output stable across
    # Restart & Run All; change it to look at a different sample.
    sample = reviews_df.sample(n=min(3, len(reviews_df)), random_state=42)
    for _, row in sample.iterrows():
        content = row.get('content')
        # Missing or non-string content (None/NaN) cannot be sliced; show it as empty.
        text = content if isinstance(content, str) else ''
        # Add the ellipsis only when the text was actually cut off.
        preview = text[:500] + ('...' if len(text) > 500 else '')

        print('=' * 60)
        print(f"Rating: {row.get('rating', 'N/A')}")
        print(f"Content: {preview}")
        print()

## Next Steps

In **Phase 2**, we will:
1. Run emotion detection on all reviews
2. Create emotion profiles for each movie
3. Build the emotional arc analysis