# Notebook 01: Data Preparation
## Hệ thống Recommendation Phim

**Mục tiêu:**
1. Load và kiểm tra dữ liệu CSV (movies.csv, ratings.csv)
2. Xử lý missing values, duplicates, outliers
3. Feature engineering (parse genres, extract year)
4. Vector hóa với TF-IDF
5. Import dữ liệu vào MongoDB

---

## 1. Import Required Libraries

In [2]:
# Import standard libraries
import os
import sys
import re
import warnings

# Data manipulation
import pandas as pd
import numpy as np

# Text vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# MongoDB
from pymongo import MongoClient, UpdateOne
from pymongo.errors import BulkWriteError

# Progress bar
from tqdm import tqdm

# Suppress warnings
warnings.filterwarnings('ignore')

# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.getcwd()))

print("Libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

Libraries imported successfully!
Pandas version: 2.3.3
NumPy version: 2.3.5


## 2. Configuration

In [3]:
# Configuration
DATA_DIR = "../data/raw"
PROCESSED_DIR = "../data/processed"
MODELS_DIR = "../models_saved"

# MongoDB settings
MONGO_URI = "mongodb://localhost:27017"
DB_NAME = "movie_recommendation"

# File paths
MOVIES_PATH = os.path.join(DATA_DIR, "movies.csv")
RATINGS_PATH = os.path.join(DATA_DIR, "ratings.csv")

# Create directories if not exist
os.makedirs(PROCESSED_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)

print(f"Data directory: {DATA_DIR}")
print(f"Processed directory: {PROCESSED_DIR}")
print(f"Models directory: {MODELS_DIR}")
print(f"MongoDB URI: {MONGO_URI}")
print(f"Database: {DB_NAME}")

Data directory: ../data/raw
Processed directory: ../data/processed
Models directory: ../models_saved
MongoDB URI: mongodb://localhost:27017
Database: movie_recommendation


## 3. Load Raw Data from CSV Files

In [4]:
# Load movies data
print("Loading movies.csv...")
movies_df = pd.read_csv(MOVIES_PATH)
print(f"Movies loaded: {len(movies_df):,} records")

# Load ratings data
print("\nLoading ratings.csv...")
ratings_df = pd.read_csv(RATINGS_PATH)
print(f"Ratings loaded: {len(ratings_df):,} records")

Loading movies.csv...
Movies loaded: 62,423 records

Loading ratings.csv...
Ratings loaded: 25,000,095 records


In [5]:
# Preview movies data
print("Movies DataFrame - First 5 rows:")
movies_df.head()

Movies DataFrame - First 5 rows:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Preview ratings data
print("Ratings DataFrame - First 5 rows:")
ratings_df.head()

Ratings DataFrame - First 5 rows:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


## 4. Inspect Dataset Information

In [7]:
# Movies DataFrame Info
print("=" * 60)
print("MOVIES DATASET INFO")
print("=" * 60)
print(f"\nShape: {movies_df.shape[0]:,} rows x {movies_df.shape[1]} columns")
print(f"\nColumns: {list(movies_df.columns)}")
print(f"\nData Types:")
print(movies_df.dtypes)
print(f"\nBasic Statistics:")
movies_df.describe(include='all')

MOVIES DATASET INFO

Shape: 62,423 rows x 3 columns

Columns: ['movieId', 'title', 'genres']

Data Types:
movieId     int64
title      object
genres     object
dtype: object

Basic Statistics:


Unnamed: 0,movieId,title,genres
count,62423.0,62423,62423
unique,,62325,1639
top,,Weekend (2011),Drama
freq,,2,9056
mean,122220.387646,,
std,63264.744844,,
min,1.0,,
25%,82146.5,,
50%,138022.0,,
75%,173222.0,,


In [8]:
# Ratings DataFrame Info
print("=" * 60)
print("RATINGS DATASET INFO")
print("=" * 60)
print(f"\nShape: {ratings_df.shape[0]:,} rows x {ratings_df.shape[1]} columns")
print(f"\nColumns: {list(ratings_df.columns)}")
print(f"\nData Types:")
print(ratings_df.dtypes)
print(f"\nBasic Statistics:")
ratings_df.describe()

RATINGS DATASET INFO

Shape: 25,000,095 rows x 4 columns

Columns: ['userId', 'movieId', 'rating', 'timestamp']

Data Types:
userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

Basic Statistics:


Unnamed: 0,userId,movieId,rating,timestamp
count,25000100.0,25000100.0,25000100.0,25000100.0
mean,81189.28,21387.98,3.533854,1215601000.0
std,46791.72,39198.86,1.060744,226875800.0
min,1.0,1.0,0.5,789652000.0
25%,40510.0,1196.0,3.0,1011747000.0
50%,80914.0,2947.0,3.5,1198868000.0
75%,121557.0,8623.0,4.0,1447205000.0
max,162541.0,209171.0,5.0,1574328000.0


## 5. Handle Missing Values

In [9]:
# Check missing values
print("Missing Values Analysis")
print("=" * 60)

print("\nMovies - Missing Values:")
movies_missing = movies_df.isnull().sum()
print(movies_missing[movies_missing > 0] if movies_missing.sum() > 0 else "No missing values")

print("\nRatings - Missing Values:")
ratings_missing = ratings_df.isnull().sum()
print(ratings_missing[ratings_missing > 0] if ratings_missing.sum() > 0 else "No missing values")

Missing Values Analysis

Movies - Missing Values:
No missing values

Ratings - Missing Values:
No missing values


In [10]:
# Handle missing values in movies
print("Handling Missing Values...")
original_movies_count = len(movies_df)

# Fill missing genres with '(no genres listed)'
movies_df['genres'] = movies_df['genres'].fillna('(no genres listed)')

# Drop rows with missing movieId or title
movies_df = movies_df.dropna(subset=['movieId', 'title'])

print(f"Movies: {original_movies_count:,} -> {len(movies_df):,} (removed {original_movies_count - len(movies_df):,})")

# Handle missing values in ratings
original_ratings_count = len(ratings_df)

# Drop rows with missing userId, movieId, or rating
ratings_df = ratings_df.dropna(subset=['userId', 'movieId', 'rating'])

print(f"Ratings: {original_ratings_count:,} -> {len(ratings_df):,} (removed {original_ratings_count - len(ratings_df):,})")
print("\nMissing values handled!")

Handling Missing Values...
Movies: 62,423 -> 62,423 (removed 0)
Ratings: 25,000,095 -> 25,000,095 (removed 0)

Missing values handled!


## 6. Remove Duplicate Records

In [11]:
# Check and remove duplicates
print("Checking for Duplicates...")
print("=" * 60)

# Movies duplicates (by movieId)
movies_dups = movies_df.duplicated(subset=['movieId']).sum()
print(f"\nMovies - Duplicate movieIds: {movies_dups:,}")

# Ratings duplicates (by userId + movieId)
ratings_dups = ratings_df.duplicated(subset=['userId', 'movieId']).sum()
print(f"Ratings - Duplicate (userId, movieId) pairs: {ratings_dups:,}")

Checking for Duplicates...

Movies - Duplicate movieIds: 0
Ratings - Duplicate (userId, movieId) pairs: 0


In [12]:
# Remove duplicates
print("Removing Duplicates...")

# Remove duplicate movies (keep first)
before_movies = len(movies_df)
movies_df = movies_df.drop_duplicates(subset=['movieId'], keep='first')
print(f"Movies: {before_movies:,} -> {len(movies_df):,} (removed {before_movies - len(movies_df):,})")

# Remove duplicate ratings (keep last - most recent rating)
before_ratings = len(ratings_df)
ratings_df = ratings_df.drop_duplicates(subset=['userId', 'movieId'], keep='last')
print(f"Ratings: {before_ratings:,} -> {len(ratings_df):,} (removed {before_ratings - len(ratings_df):,})")

print("\nDuplicates removed!")

Removing Duplicates...
Movies: 62,423 -> 62,423 (removed 0)
Ratings: 25,000,095 -> 25,000,095 (removed 0)

Duplicates removed!


## 7. Handle Outliers in Ratings

In [13]:
# Check rating distribution and outliers
print("Rating Distribution Analysis")
print("=" * 60)

print(f"\nRating range: {ratings_df['rating'].min()} - {ratings_df['rating'].max()}")
print(f"\nRating value counts:")
print(ratings_df['rating'].value_counts().sort_index())

# Check for outliers (ratings outside 0.5-5.0)
outliers_low = (ratings_df['rating'] < 0.5).sum()
outliers_high = (ratings_df['rating'] > 5.0).sum()
print(f"\nOutliers below 0.5: {outliers_low:,}")
print(f"Outliers above 5.0: {outliers_high:,}")

Rating Distribution Analysis

Rating range: 0.5 - 5.0

Rating value counts:
rating
0.5     393068
1.0     776815
1.5     399490
2.0    1640868
2.5    1262797
3.0    4896928
3.5    3177318
4.0    6639798
4.5    2200539
5.0    3612474
Name: count, dtype: int64

Outliers below 0.5: 0
Outliers above 5.0: 0


In [14]:
# Handle outliers - clip ratings to valid range [0.5, 5.0]
print("Handling Rating Outliers...")

before_outliers = len(ratings_df)

# Filter ratings within valid range
ratings_df = ratings_df[(ratings_df['rating'] >= 0.5) & (ratings_df['rating'] <= 5.0)]

print(f"Ratings: {before_outliers:,} -> {len(ratings_df):,} (removed {before_outliers - len(ratings_df):,})")
print(f"\nRating range now: {ratings_df['rating'].min()} - {ratings_df['rating'].max()}")

Handling Rating Outliers...
Ratings: 25,000,095 -> 25,000,095 (removed 0)

Rating range now: 0.5 - 5.0


## 8. Feature Engineering - Parse Genres

In [15]:
# Parse genres from string to list
def parse_genres(genres_str):
    """Parse genres string into list."""
    if pd.isna(genres_str) or genres_str == '(no genres listed)':
        return []
    return [g.strip() for g in genres_str.split('|') if g.strip()]

print("Parsing Genres...")

# Store original genres string for TF-IDF
movies_df['genres_str'] = movies_df['genres']

# Create genres list column
movies_df['genres'] = movies_df['genres_str'].apply(parse_genres)

print("Genres parsed successfully!")
print(f"\nSample genres:")
movies_df[['movieId', 'title', 'genres']].head(10)

Parsing Genres...
Genres parsed successfully!

Sample genres:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]
5,6,Heat (1995),"[Action, Crime, Thriller]"
6,7,Sabrina (1995),"[Comedy, Romance]"
7,8,Tom and Huck (1995),"[Adventure, Children]"
8,9,Sudden Death (1995),[Action]
9,10,GoldenEye (1995),"[Action, Adventure, Thriller]"


In [16]:
# Get all unique genres
all_genres = set()
for genres in movies_df['genres']:
    all_genres.update(genres)

print(f"Total unique genres: {len(all_genres)}")
print(f"\nAll genres:")
print(sorted(all_genres))

Total unique genres: 19

All genres:
['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


## 9. Feature Engineering - Extract Year from Title

In [17]:
# Extract year from movie title
def extract_year_from_title(title):
    """Extract year from title like 'Toy Story (1995)'."""
    match = re.search(r'\((\d{4})\)\s*$', str(title))
    if match:
        return int(match.group(1))
    return None

def clean_title(title):
    """Remove year from title."""
    return re.sub(r'\s*\(\d{4}\)\s*$', '', str(title)).strip()

print("Extracting Year from Titles...")

# Extract year
movies_df['year'] = movies_df['title'].apply(extract_year_from_title)

# Create clean title (without year)
movies_df['cleanTitle'] = movies_df['title'].apply(clean_title)

print("Year extracted successfully!")
print(f"\nMovies with year: {movies_df['year'].notna().sum():,}")
print(f"Movies without year: {movies_df['year'].isna().sum():,}")
print(f"\nYear range: {movies_df['year'].min():.0f} - {movies_df['year'].max():.0f}")

Extracting Year from Titles...
Year extracted successfully!

Movies with year: 62,011
Movies without year: 412

Year range: 1874 - 2019


In [18]:
# Sample of processed movies
print("Sample of Processed Movies:")
movies_df[['movieId', 'title', 'cleanTitle', 'year', 'genres']].head(10)

Sample of Processed Movies:


Unnamed: 0,movieId,title,cleanTitle,year,genres
0,1,Toy Story (1995),Toy Story,1995.0,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),Jumanji,1995.0,"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),Grumpier Old Men,1995.0,"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),Waiting to Exhale,1995.0,"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),Father of the Bride Part II,1995.0,[Comedy]
5,6,Heat (1995),Heat,1995.0,"[Action, Crime, Thriller]"
6,7,Sabrina (1995),Sabrina,1995.0,"[Comedy, Romance]"
7,8,Tom and Huck (1995),Tom and Huck,1995.0,"[Adventure, Children]"
8,9,Sudden Death (1995),Sudden Death,1995.0,[Action]
9,10,GoldenEye (1995),GoldenEye,1995.0,"[Action, Adventure, Thriller]"


## 10. Calculate Movie Statistics

In [19]:
# Calculate average rating and rating count for each movie
print("Calculating Movie Statistics...")

movie_stats = ratings_df.groupby('movieId').agg(
    avgRating=('rating', 'mean'),
    ratingCount=('rating', 'count')
).reset_index()

# Round avgRating to 2 decimal places
movie_stats['avgRating'] = movie_stats['avgRating'].round(2)

print(f"Stats calculated for {len(movie_stats):,} movies")
movie_stats.head()

Calculating Movie Statistics...
Stats calculated for 59,047 movies


Unnamed: 0,movieId,avgRating,ratingCount
0,1,3.89,57309
1,2,3.25,24228
2,3,3.14,11804
3,4,2.85,2523
4,5,3.06,11714


In [20]:
# Merge stats with movies dataframe
movies_df = movies_df.merge(movie_stats, on='movieId', how='left')

# Fill missing stats with 0
movies_df['avgRating'] = movies_df['avgRating'].fillna(0)
movies_df['ratingCount'] = movies_df['ratingCount'].fillna(0).astype(int)

print("Stats merged with movies!")
print(f"\nMovies with ratings: {(movies_df['ratingCount'] > 0).sum():,}")
print(f"Movies without ratings: {(movies_df['ratingCount'] == 0).sum():,}")

# Preview
movies_df[['movieId', 'title', 'avgRating', 'ratingCount', 'genres']].head(10)

Stats merged with movies!

Movies with ratings: 59,047
Movies without ratings: 3,376


Unnamed: 0,movieId,title,avgRating,ratingCount,genres
0,1,Toy Story (1995),3.89,57309,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),3.25,24228,"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),3.14,11804,"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),2.85,2523,"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),3.06,11714,[Comedy]
5,6,Heat (1995),3.85,24588,"[Action, Crime, Thriller]"
6,7,Sabrina (1995),3.36,12132,"[Comedy, Romance]"
7,8,Tom and Huck (1995),3.11,1344,"[Adventure, Children]"
8,9,Sudden Death (1995),2.99,3711,[Action]
9,10,GoldenEye (1995),3.42,28265,"[Action, Adventure, Thriller]"


## 11. Text Vectorization with TF-IDF

In [21]:
# TF-IDF Vectorization on genres
print("Applying TF-IDF Vectorization on Genres...")

# Prepare genres for TF-IDF (join list into space-separated string)
genres_for_tfidf = movies_df['genres'].apply(lambda x: ' '.join(x) if x else '')

# Create TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=50,  # Limit to top 50 features
    stop_words=None,
    lowercase=True
)

# Fit and transform
tfidf_matrix = tfidf_vectorizer.fit_transform(genres_for_tfidf)

print(f"TF-IDF Matrix Shape: {tfidf_matrix.shape}")
print(f"Feature Names: {tfidf_vectorizer.get_feature_names_out()}")

Applying TF-IDF Vectorization on Genres...
TF-IDF Matrix Shape: (62423, 21)
Feature Names: ['action' 'adventure' 'animation' 'children' 'comedy' 'crime'
 'documentary' 'drama' 'fantasy' 'fi' 'film' 'horror' 'imax' 'musical'
 'mystery' 'noir' 'romance' 'sci' 'thriller' 'war' 'western']


In [22]:
# Save TF-IDF vectorizer for later use
import pickle

tfidf_path = os.path.join(MODELS_DIR, 'tfidf_vectorizer.pkl')
with open(tfidf_path, 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)
print(f"TF-IDF Vectorizer saved to: {tfidf_path}")

# Save TF-IDF matrix
tfidf_matrix_path = os.path.join(PROCESSED_DIR, 'tfidf_matrix.pkl')
with open(tfidf_matrix_path, 'wb') as f:
    pickle.dump(tfidf_matrix, f)
print(f"TF-IDF Matrix saved to: {tfidf_matrix_path}")

TF-IDF Vectorizer saved to: ../models_saved\tfidf_vectorizer.pkl
TF-IDF Matrix saved to: ../data/processed\tfidf_matrix.pkl


## 12. Prepare Data for MongoDB

In [23]:
# Prepare movies for MongoDB
print("Preparing Movies Data for MongoDB...")

# Select columns for MongoDB
movies_for_mongo = movies_df[[
    'movieId', 'title', 'cleanTitle', 'genres', 'year', 
    'avgRating', 'ratingCount'
]].copy()

print(f"Prepared {len(movies_for_mongo):,} movies")
movies_for_mongo.head()

Preparing Movies Data for MongoDB...
Prepared 62,423 movies


Unnamed: 0,movieId,title,cleanTitle,genres,year,avgRating,ratingCount
0,1,Toy Story (1995),Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,3.89,57309
1,2,Jumanji (1995),Jumanji,"[Adventure, Children, Fantasy]",1995.0,3.25,24228
2,3,Grumpier Old Men (1995),Grumpier Old Men,"[Comedy, Romance]",1995.0,3.14,11804
3,4,Waiting to Exhale (1995),Waiting to Exhale,"[Comedy, Drama, Romance]",1995.0,2.85,2523
4,5,Father of the Bride Part II (1995),Father of the Bride Part II,[Comedy],1995.0,3.06,11714


In [24]:
# Prepare ratings for MongoDB
print("Preparing Ratings Data for MongoDB...")

# Convert timestamp to datetime
if 'timestamp' in ratings_df.columns:
    ratings_df['timestamp'] = pd.to_datetime(ratings_df['timestamp'], unit='s')

# Select columns
ratings_for_mongo = ratings_df[['userId', 'movieId', 'rating', 'timestamp']].copy()

print(f"Prepared {len(ratings_for_mongo):,} ratings")
ratings_for_mongo.head()

Preparing Ratings Data for MongoDB...
Prepared 25,000,095 ratings


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,2006-05-17 15:34:04
1,1,306,3.5,2006-05-17 12:26:57
2,1,307,5.0,2006-05-17 12:27:08
3,1,665,5.0,2006-05-17 15:13:40
4,1,899,3.5,2006-05-17 12:21:50


In [25]:
# Create users from ratings
print("Creating Users Data from Ratings...")

user_stats = ratings_df.groupby('userId').agg(
    ratingCount=('rating', 'count'),
    avgRating=('rating', 'mean')
).reset_index()

user_stats['avgRating'] = user_stats['avgRating'].round(2)

print(f"Created {len(user_stats):,} users")
user_stats.head()

Creating Users Data from Ratings...
Created 162,541 users


Unnamed: 0,userId,ratingCount,avgRating
0,1,70,3.81
1,2,184,3.63
2,3,656,3.7
3,4,242,3.38
4,5,101,3.75


## 13. Connect to MongoDB

In [26]:
# Connect to MongoDB
print("Connecting to MongoDB...")

try:
    client = MongoClient(MONGO_URI)
    # Test connection
    client.admin.command('ping')
    db = client[DB_NAME]
    print(f"Connected to MongoDB: {MONGO_URI}")
    print(f"Database: {DB_NAME}")
except Exception as e:
    print(f"Failed to connect to MongoDB: {e}")
    print("Make sure MongoDB is running")
    raise

Connecting to MongoDB...
Connected to MongoDB: mongodb://localhost:27017
Database: movie_recommendation


## 14. Import Data to MongoDB

In [27]:
# Import Movies to MongoDB
print("Importing Movies to MongoDB...")

# Clear existing data
db.movies.delete_many({})

# Convert to records
movies_records = movies_for_mongo.to_dict('records')

# Batch insert
batch_size = 5000
for i in tqdm(range(0, len(movies_records), batch_size), desc="Inserting movies"):
    batch = movies_records[i:i + batch_size]
    db.movies.insert_many(batch)

print(f"Imported {len(movies_records):,} movies")

Importing Movies to MongoDB...


Inserting movies: 100%|██████████| 13/13 [00:11<00:00,  1.11it/s]

Imported 62,423 movies





In [28]:
# Import Users to MongoDB
print("Importing Users to MongoDB...")

# Clear existing data
db.users.delete_many({})

# Convert to records
users_records = user_stats.to_dict('records')

# Insert
db.users.insert_many(users_records)

print(f"Imported {len(users_records):,} users")

Importing Users to MongoDB...
Imported 162,541 users


In [29]:
# Import Ratings to MongoDB
print("Importing Ratings to MongoDB...")

# Clear existing data
db.ratings.delete_many({})

# Convert to records
ratings_records = ratings_for_mongo.to_dict('records')

# Batch insert (larger batches for speed)
batch_size = 10000
for i in tqdm(range(0, len(ratings_records), batch_size), desc="Inserting ratings"):
    batch = ratings_records[i:i + batch_size]
    db.ratings.insert_many(batch)

print(f"Imported {len(ratings_records):,} ratings")

Importing Ratings to MongoDB...


Inserting ratings: 100%|██████████| 2501/2501 [51:18<00:00,  1.23s/it]    

Imported 25,000,095 ratings





In [30]:
# Create indexes for better query performance
print("Creating Indexes...")

def safe_create_index(collection, keys, **kwargs):
    """Create index, drop existing if conflicts."""
    try:
        collection.create_index(keys, **kwargs)
    except Exception as e:
        if "IndexKeySpecsConflict" in str(e) or "existing index" in str(e):
            # Drop the conflicting index and recreate
            index_name = collection.create_index(keys, **kwargs) if isinstance(keys, str) else None
            if index_name is None:
                # For compound indexes, generate name
                if isinstance(keys, list):
                    index_name = "_".join([f"{k}_{v}" for k, v in keys])
                else:
                    index_name = f"{keys}_1"
            try:
                collection.drop_index(index_name)
                collection.create_index(keys, **kwargs)
            except:
                pass  # Index might not exist or already correct
        else:
            print(f"  Warning: {e}")

# Movies indexes
safe_create_index(db.movies, 'movieId', unique=True)
safe_create_index(db.movies, 'avgRating')
safe_create_index(db.movies, 'ratingCount')

# Users indexes
safe_create_index(db.users, 'userId', unique=True)

# Ratings indexes - use unique=True to match existing
safe_create_index(db.ratings, [('userId', 1), ('movieId', 1)], unique=True)
safe_create_index(db.ratings, 'movieId')
safe_create_index(db.ratings, 'userId')

print("Indexes created!")

Creating Indexes...
Indexes created!


## 15. Verify Data in MongoDB

In [31]:
# Verify data import
print("Verifying Data in MongoDB...")
print("=" * 60)

# Count documents
movies_count = db.movies.count_documents({})
users_count = db.users.count_documents({})
ratings_count = db.ratings.count_documents({})

print(f"\nCollection Counts:")
print(f"  Movies:  {movies_count:,}")
print(f"  Users:   {users_count:,}")
print(f"  Ratings: {ratings_count:,}")

# Verify counts match
print(f"\nVerification:")
print(f"  Movies match:  {'OK' if movies_count == len(movies_for_mongo) else 'FAIL'}")
print(f"  Users match:   {'OK' if users_count == len(user_stats) else 'FAIL'}")
print(f"  Ratings match: {'OK' if ratings_count == len(ratings_for_mongo) else 'FAIL'}")

Verifying Data in MongoDB...

Collection Counts:
  Movies:  62,423
  Users:   162,541
  Ratings: 25,000,095

Verification:
  Movies match:  OK
  Users match:   OK
  Ratings match: OK


In [32]:
# Sample documents from each collection
print("Sample Documents:")
print("=" * 60)

print("\nSample Movie:")
sample_movie = db.movies.find_one()
for key, value in sample_movie.items():
    if key != '_id':
        print(f"  {key}: {value}")

Sample Documents:

Sample Movie:
  movieId: 1
  title: Toy Story (1995)
  cleanTitle: Toy Story
  genres: ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']
  year: 1995.0
  avgRating: 3.89
  ratingCount: 57309


In [33]:
print("\nSample User:")
sample_user = db.users.find_one()
for key, value in sample_user.items():
    if key != '_id':
        print(f"  {key}: {value}")

print("\nSample Rating:")
sample_rating = db.ratings.find_one()
for key, value in sample_rating.items():
    if key != '_id':
        print(f"  {key}: {value}")


Sample User:
  userId: 1
  ratingCount: 70
  avgRating: 3.81

Sample Rating:
  userId: 1
  movieId: 296
  rating: 5.0
  timestamp: 2006-05-17 15:34:04


## 16. Save Processed Data to CSV (Backup)

In [34]:
# Save processed data to CSV as backup
print("Saving Processed Data to CSV...")

# Save movies (convert genres list to string for CSV)
movies_csv = movies_for_mongo.copy()
movies_csv['genres'] = movies_csv['genres'].apply(lambda x: '|'.join(x) if x else '')
movies_csv.to_csv(os.path.join(PROCESSED_DIR, 'movies_processed.csv'), index=False)
print(f"  movies_processed.csv saved")

# Save ratings
ratings_for_mongo.to_csv(os.path.join(PROCESSED_DIR, 'ratings_processed.csv'), index=False)
print(f"  ratings_processed.csv saved")

# Save users
user_stats.to_csv(os.path.join(PROCESSED_DIR, 'users_processed.csv'), index=False)
print(f"  users_processed.csv saved")

print(f"\nProcessed files saved to: {PROCESSED_DIR}")

Saving Processed Data to CSV...
  movies_processed.csv saved
  ratings_processed.csv saved
  users_processed.csv saved

Processed files saved to: ../data/processed


## Summary

In [35]:
# Final Summary
print("=" * 60)
print("DATA PREPARATION COMPLETED!")
print("=" * 60)

print(f"""
Dataset Statistics:
  - Movies:  {movies_count:,}
  - Users:   {users_count:,}
  - Ratings: {ratings_count:,}

Files Created:
  - {PROCESSED_DIR}/movies_processed.csv
  - {PROCESSED_DIR}/ratings_processed.csv
  - {PROCESSED_DIR}/users_processed.csv
  - {MODELS_DIR}/tfidf_vectorizer.pkl
  - {PROCESSED_DIR}/tfidf_matrix.pkl

MongoDB Collections:
  - movies (indexed: movieId, avgRating, ratingCount)
  - users (indexed: userId)
  - ratings (indexed: userId+movieId, movieId, userId)

Next Steps:
  1. Run Notebook 02: Data Exploration
  2. Run Notebook 03: Model Training
  3. Run Notebook 04: Model Evaluation
""")

# Close MongoDB connection
client.close()
print("MongoDB connection closed.")

DATA PREPARATION COMPLETED!

Dataset Statistics:
  - Movies:  62,423
  - Users:   162,541
  - Ratings: 25,000,095

Files Created:
  - ../data/processed/movies_processed.csv
  - ../data/processed/ratings_processed.csv
  - ../data/processed/users_processed.csv
  - ../models_saved/tfidf_vectorizer.pkl
  - ../data/processed/tfidf_matrix.pkl

MongoDB Collections:
  - movies (indexed: movieId, avgRating, ratingCount)
  - users (indexed: userId)
  - ratings (indexed: userId+movieId, movieId, userId)

Next Steps:
  1. Run Notebook 02: Data Exploration
  2. Run Notebook 03: Model Training
  3. Run Notebook 04: Model Evaluation

MongoDB connection closed.
