# 🎬 Movie Recommendation System — Notebook
**Author:** B. Vikas | AIML Student, JAIN University

This notebook walks through:
1. Data loading & exploration (EDA)
2. Feature engineering (tags column)
3. Vectorization with CountVectorizer
4. Cosine Similarity computation
5. Generating recommendations
6. Saving artifacts for the Streamlit app

In [None]:
import ast
import sys
from pathlib import Path

sys.path.insert(0, '../src')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Show every column when a DataFrame is rendered (None removes the column cap).
pd.set_option('display.max_columns', None)
# The check mark was stored as mis-decoded UTF-8 bytes; restored to the real glyph.
print('Libraries loaded ✓')

## 1. Load & Merge Datasets

In [None]:
# Read the movies and credits CSV files from the data directory.
movies = pd.read_csv('../data/tmdb_5000_movies.csv')

# The credits file names its key column `movie_id`; rename it to `id` so both
# frames can be joined on the same column name.
credits = pd.read_csv('../data/tmdb_5000_credits.csv').rename(columns={'movie_id': 'id'})

# Default (inner) join on the shared `id` column.
df = pd.merge(movies, credits, on='id')

print(f'Shape after merge: {df.shape}')
df.head(3)

## 2. EDA — Exploratory Data Analysis

In [None]:
# Report how many null entries each column of the merged frame contains.
print('Missing values:\n')
missing_per_column = df.isnull().sum()
print(missing_per_column)

In [None]:
# Distribution of vote averages and vote counts, drawn side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

# Left panel: histogram of the `vote_average` column (nulls dropped).
axes[0].hist(df['vote_average'].dropna(), bins=30, color='#E50914', edgecolor='black')
axes[0].set_title('Distribution of Vote Averages')
axes[0].set_xlabel('Vote Average')
axes[0].set_ylabel('Count')

# Right panel: histogram of the `vote_count` column (nulls dropped).
axes[1].hist(df['vote_count'].dropna(), bins=30, color='#221f1f', edgecolor='gray')
axes[1].set_title('Distribution of Vote Counts')
axes[1].set_xlabel('Vote Count')
# Label the y-axis here too so the panel is readable on its own.
axes[1].set_ylabel('Count')

fig.tight_layout()

# savefig does not create missing directories, so make sure the target folder
# exists first; otherwise a fresh checkout without `assets/` fails here.
eda_figure_path = Path('../assets/eda_distributions.png')
eda_figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(eda_figure_path, dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Top 10 genres
# Each non-null `genres` value is a string that is parsed with
# ast.literal_eval and iterated as a sequence of mappings carrying a 'name' key.
# (`ast` is imported in the notebook's import cell.)
all_genres = []
skipped_rows = 0
for raw_genres in df['genres'].dropna():
    try:
        # Build the full list first so a bad entry never adds a partial row.
        all_genres.extend([entry['name'] for entry in ast.literal_eval(raw_genres)])
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Malformed rows are still skipped (best effort), but only for parsing
        # or shape problems -- KeyboardInterrupt and real bugs are not swallowed.
        skipped_rows += 1

if skipped_rows:
    print(f'Skipped {skipped_rows} rows with unparseable genres')

genre_counts = pd.Series(all_genres).value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 4))
genre_counts.plot(kind='bar', ax=ax, color='#E50914', edgecolor='black')
ax.set_title('Top 10 Movie Genres in Dataset')
ax.set_xlabel('Genre')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()

# savefig does not create missing directories, so make sure the target folder
# exists first; otherwise a fresh checkout without `assets/` fails here.
genres_figure_path = Path('../assets/top_genres.png')
genres_figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(genres_figure_path, dpi=150, bbox_inches='tight')
plt.show()

## 3. Feature Engineering

In [None]:
from recommender import build_tags, load_and_merge

# Load the two CSV files through the project's own helper, then pass the
# result to build_tags. Later cells read the `title` and `tags` columns of the
# returned frame.
df_raw = load_and_merge(
    '../data/tmdb_5000_movies.csv',
    '../data/tmdb_5000_credits.csv',
)
df_feat = build_tags(df_raw)

print(f'Feature dataset shape: {df_feat.shape}')
df_feat.head()

In [None]:
# Sanity check: show the `tags` value of one familiar title.
mask = df_feat['title'].eq('The Dark Knight')
print('Tags for The Dark Knight:\n')
# Take the first matching row; raises IndexError if no title matches.
dark_knight_tags = df_feat.loc[mask, 'tags'].iloc[0]
print(dark_knight_tags)

## 4. CountVectorizer + Cosine Similarity

In [None]:
# Bag-of-words encoding of the `tags` column: English stop words removed and
# the vocabulary capped at 5000 features.
cv = CountVectorizer(stop_words='english', max_features=5000)
tag_term_matrix = cv.fit_transform(df_feat['tags'])
# Convert the fitted output to a dense array for the following cells.
vectors = tag_term_matrix.toarray()

print(f'Vocabulary size : {len(cv.vocabulary_)}')
print(f'Vector shape    : {vectors.shape}')

In [None]:
# Cosine similarity computed from `vectors` alone (no second input).
similarity = cosine_similarity(vectors)
print(f'Similarity matrix shape: {similarity.shape}')

# Preview the first ten scores of row 0.
first_row_preview = similarity[0][:10]
print(f'Sample similarity (row 0):\n{first_row_preview}')

## 5. Generate Recommendations

In [None]:
from recommender import MovieRecommender

# Fit the project's recommender on the same two CSV files used above.
rec = MovieRecommender()
rec.fit('../data/tmdb_5000_movies.csv', '../data/tmdb_5000_credits.csv')

# Test recommendations
# The emoji, arrow and star in these messages were stored as mis-decoded UTF-8
# bytes; they are restored to the real glyphs so the output is readable.
for test_movie in ['The Dark Knight', 'Avatar', 'Inception']:
    print(f'\n🎬 Recommendations for "{test_movie}":')
    recs = rec.recommend(test_movie, n=5)
    # Each result row exposes `title`, `similarity_score` and `vote_average`.
    for _, row in recs.iterrows():
        print(f"  → {row['title']} (similarity: {row['similarity_score']:.3f}, ★{row['vote_average']})")

## 6. Save Artifacts

In [None]:
# Write the fitted recommender's artifacts to the directory that, per the
# notebook outline, the Streamlit app uses.
artifacts_dir = '../artifacts'
rec.save(artifacts_dir)
print('\nArtifacts saved! Now run: streamlit run app.py')