# Data Preprocessing for Movie Recommendation System

This notebook covers:
1. Data cleaning and validation
2. Feature engineering
3. User-item interaction matrix creation
4. Data splitting for training/validation

In [None]:
import pickle
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

sys.path.append('../src')

# Custom modules
from data.preprocessing import DataPreprocessor
from features.feature_engineering import FeatureEngineer

## 1. Load Raw Data

In [None]:
# Read the three raw CSV exports into DataFrames
movies_df = pd.read_csv('../data/raw/movies.csv')
ratings_df = pd.read_csv('../data/raw/ratings.csv')
tags_df = pd.read_csv('../data/raw/tags.csv')

# Report (rows, columns) of each table as a quick sanity check
loaded_tables = {"Movies": movies_df, "Ratings": ratings_df, "Tags": tags_df}
for label, frame in loaded_tables.items():
    print(f"{label}: {frame.shape}")

## 2. Data Cleaning

In [None]:
# Initialize preprocessor
# DataPreprocessor is the project helper imported from data.preprocessing
# (found through the '../src' entry appended to sys.path in the imports cell).
preprocessor = DataPreprocessor()

# Clean movies data
# The result is bound to a new name, so movies_df is not rebound here.
# Whether the helper also modifies its argument in place is not visible from
# this notebook — TODO confirm before reusing the raw frames.
movies_clean = preprocessor.clean_movies(movies_df)

# Clean ratings data
# ratings_clean feeds feature engineering, the user-item matrix and the
# train/validation split in the later cells.
ratings_clean = preprocessor.clean_ratings(ratings_df)

# Clean tags data
# NOTE(review): tags_clean is not referenced by any later cell shown in this
# notebook — confirm whether it is still needed.
tags_clean = preprocessor.clean_tags(tags_df)

## 3. Feature Engineering

In [None]:
# Initialize feature engineer
# FeatureEngineer is the project helper imported from
# features.feature_engineering.
feature_engineer = FeatureEngineer()

# Create user features
# Derived from the cleaned ratings only; written to user_features.csv in the
# save cell.
user_features = feature_engineer.create_user_features(ratings_clean)

# Create movie features
# Uses both the cleaned movie table and the cleaned ratings; written to
# movie_features.csv in the save cell.
movie_features = feature_engineer.create_movie_features(movies_clean, ratings_clean)

# Create interaction features
# NOTE(review): interaction_features is neither used nor saved by the later
# cells shown in this notebook — confirm whether it should be persisted.
interaction_features = feature_engineer.create_interaction_features(ratings_clean)

## 4. Create User-Item Interaction Matrix

In [None]:
# Build the user x movie ratings table: one row per userId, one column per
# movieId, with 0 standing in for "no rating recorded". pivot_table's default
# aggregation (mean) applies if a (userId, movieId) pair occurs more than once.
# NOTE(review): this materializes a dense frame before the sparse conversion,
# so memory grows with n_users * n_movies.
user_item_matrix = pd.pivot_table(
    ratings_clean,
    values='rating',
    index='userId',
    columns='movieId',
    fill_value=0,
)

# CSR copy of the same values for memory-efficient downstream use
user_item_sparse = csr_matrix(user_item_matrix.values)

print(f"User-item matrix shape: {user_item_matrix.shape}")

# Sparsity = share of cells without a stored (non-zero) entry
n_users, n_items = user_item_sparse.shape
density = user_item_sparse.nnz / (n_users * n_items)
print(f"Sparsity: {1 - density:.4f}")

## 5. Data Splitting

In [None]:
# Hold out 20% of the cleaned ratings for validation. The split is seeded so
# it is reproducible, and stratified on the rating value so both parts keep
# the same rating distribution.
# NOTE(review): stratification needs at least two rows per distinct rating
# value, otherwise train_test_split raises — confirm against the cleaned data.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": ratings_clean['rating'],
}
train_data, val_data = train_test_split(ratings_clean, **split_options)

for label, part in (("Training", train_data), ("Validation", val_data)):
    print(f"{label} data: {part.shape}")

## 6. Save Processed Data

In [None]:
# Save processed data: the split ratings, the engineered feature tables and
# the user-item pivot table all go under ../data/processed.
processed_dir = Path('../data/processed')

# Create the output directory up front: to_csv() and open() do not create
# missing parent directories, so a fresh checkout would otherwise fail here.
processed_dir.mkdir(parents=True, exist_ok=True)

# CSV outputs, written without the index column.
# NOTE(review): index=False discards the index of user_features and
# movie_features — confirm the ids are regular columns in those frames.
train_data.to_csv(processed_dir / 'train_data.csv', index=False)
val_data.to_csv(processed_dir / 'val_data.csv', index=False)
user_features.to_csv(processed_dir / 'user_features.csv', index=False)
movie_features.to_csv(processed_dir / 'movie_features.csv', index=False)

# Save user-item matrix (the pandas pivot table, not the CSR copy)
with open(processed_dir / 'user_item_matrix.pkl', 'wb') as f:
    pickle.dump(user_item_matrix, f)

print("Processed data saved successfully!")