In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

print("📚 Loading cleaned movie dataset...")

# Load the cleaned dataset
movies_df = pd.read_csv('data/dataset/movies_clean.csv')

print(f"Dataset shape: {movies_df.shape}")
print(f"Columns: {list(movies_df.columns)}")
print("\n✅ Dataset loaded successfully!")

📚 Loading cleaned movie dataset...
Dataset shape: (3022, 20)
Columns: ['id', 'title', 'overview', 'tagline', 'genres_str', 'genres_names', 'keywords_names', 'release_date', 'release_year', 'budget', 'revenue', 'runtime', 'popularity', 'vote_average', 'vote_count', 'production_companies_names', 'cast_names', 'original_language', 'status', 'combined_features']

✅ Dataset loaded successfully!


In [2]:
print("🔍 EXPLORING THE CLEANED DATASET")
print("="*50)

# Basic info
print(f"Total movies: {len(movies_df):,}")
print(f"Date range: {movies_df['release_year'].min():.0f} - {movies_df['release_year'].max():.0f}")
print(f"Rating range: {movies_df['vote_average'].min():.1f} - {movies_df['vote_average'].max():.1f}")

# Check data types and missing values
print("\n📊 Data Info:")
print(movies_df.info())

print("\n❌ Missing Values:")
missing_data = movies_df.isnull().sum()
print(missing_data[missing_data > 0])

# Sample data
print("\n📖 Sample Data:")
print(movies_df.head(3)[['title', 'vote_average', 'vote_count', 'genres_str', 'release_year']])

🔍 EXPLORING THE CLEANED DATASET
Total movies: 3,022
Date range: 1980 - 2016
Rating range: 2.9 - 8.5

📊 Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3022 entries, 0 to 3021
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          3022 non-null   int64  
 1   title                       3022 non-null   object 
 2   overview                    3022 non-null   object 
 3   tagline                     2827 non-null   object 
 4   genres_str                  3022 non-null   object 
 5   genres_names                3022 non-null   object 
 6   keywords_names              3022 non-null   object 
 7   release_date                3022 non-null   object 
 8   release_year                3022 non-null   float64
 9   budget                      3022 non-null   int64  
 10  revenue                     3022 non-null   int64  
 11  runtime                     3022 

In [3]:
print("🔧 FEATURE ENGINEERING STEP 1: Numerical Features from Metadata")
print("="*60)

# Create a copy for feature engineering
features_df = movies_df.copy()

# 1. Popularity Score (normalized)
features_df['popularity_score'] = (features_df['popularity'] - features_df['popularity'].min()) / (features_df['popularity'].max() - features_df['popularity'].min())

# 2. Rating Score (weighted by vote count)
min_votes = features_df['vote_count'].quantile(0.25)  # 25th percentile
features_df['weighted_rating'] = ((features_df['vote_count'] / (features_df['vote_count'] + min_votes)) * features_df['vote_average']) + ((min_votes / (features_df['vote_count'] + min_votes)) * features_df['vote_average'].mean())

# 3. Budget and Revenue features
features_df['budget_millions'] = features_df['budget'] / 1_000_000
features_df['revenue_millions'] = features_df['revenue'] / 1_000_000
features_df['profit_millions'] = features_df['revenue_millions'] - features_df['budget_millions']
features_df['roi'] = np.where(features_df['budget_millions'] > 0, features_df['profit_millions'] / features_df['budget_millions'], 0)

# 4. Runtime categories
features_df['runtime_category'] = pd.cut(features_df['runtime'], 
                                       bins=[0, 90, 120, 150, 300], 
                                       labels=['Short', 'Medium', 'Long', 'Very_Long'])

# 5. Decade feature
features_df['decade'] = (features_df['release_year'] // 10) * 10

# 6. Movie age (years from release to now)
current_year = 2024
features_df['movie_age'] = current_year - features_df['release_year']

print(f"✅ Created numerical features:")
print(f"   • Popularity score (normalized)")
print(f"   • Weighted rating (vote-adjusted)")
print(f"   • Budget/Revenue in millions")
print(f"   • Profit and ROI")
print(f"   • Runtime categories")
print(f"   • Decade and movie age")

print(f"\n📊 Sample of new features:")
print(features_df[['title', 'popularity_score', 'weighted_rating', 'profit_millions', 'roi', 'runtime_category', 'decade']].head())

🔧 FEATURE ENGINEERING STEP 1: Numerical Features from Metadata
✅ Created numerical features:
   • Popularity score (normalized)
   • Weighted rating (vote-adjusted)
   • Budget/Revenue in millions
   • Profit and ROI
   • Runtime categories
   • Decade and movie age

📊 Sample of new features:
                                      title  popularity_score  \
0                                    Avatar          0.170828   
1  Pirates of the Caribbean: At World's End          0.157845   
2                                   Spectre          0.121590   
3                     The Dark Knight Rises          0.127235   
4                               John Carter          0.049038   

   weighted_rating  profit_millions        roi runtime_category  decade  
0         7.182069      2550.965087  10.763566        Very_Long  2000.0  
1         6.869906       661.000000   2.203333        Very_Long  2000.0  
2         6.300771       635.674609   2.594590             Long  2010.0  
3         7.566463 

In [4]:
print("🔧 FEATURE ENGINEERING STEP 2: Text-Based Features")
print("="*50)

# 1. Genre features - One-hot encoding for top genres
from sklearn.preprocessing import MultiLabelBinarizer

# Get all unique genres
all_genres = []
for genres_list in features_df['genres_names']:
    if isinstance(genres_list, list):
        all_genres.extend(genres_list)
    elif isinstance(genres_list, str):
        # Handle string representation of list
        try:
            import ast
            genres_list = ast.literal_eval(genres_list)
            all_genres.extend(genres_list)
        except:
            pass

# Get top 15 most common genres
top_genres = pd.Series(all_genres).value_counts().head(15).index.tolist()

print(f"Top 15 genres: {top_genres}")

# Create binary features for each top genre
for genre in top_genres:
    features_df[f'genre_{genre.lower().replace(" ", "_")}'] = features_df['genres_names'].apply(
        lambda x: 1 if isinstance(x, list) and genre in x else 0
    )

# 2. Cast features - Create features for popular actors
all_cast = []
for cast_list in features_df['cast_names']:
    if isinstance(cast_list, list):
        all_cast.extend(cast_list[:3])  # Top 3 actors per movie
    elif isinstance(cast_list, str):
        try:
            cast_list = ast.literal_eval(cast_list)
            all_cast.extend(cast_list[:3])
        except:
            pass

# Get top 20 most frequent actors
top_actors = pd.Series(all_cast).value_counts().head(20).index.tolist()

for actor in top_actors:
    features_df[f'actor_{actor.lower().replace(" ", "_")}'] = features_df['cast_names'].apply(
        lambda x: 1 if isinstance(x, list) and actor in x else 0
    )

# 3. TF-IDF features for combined text
print("\nCreating TF-IDF features for content similarity...")

# Prepare text data for TF-IDF
features_df['text_features'] = features_df['combined_features'].fillna('')

# Create TF-IDF vectors for top 100 features
tfidf = TfidfVectorizer(max_features=100, stop_words='english', lowercase=True)
tfidf_matrix = tfidf.fit_transform(features_df['text_features'])

# Convert to DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), 
                       columns=[f'tfidf_{word}' for word in tfidf.get_feature_names_out()])

# Concatenate with features_df
features_df = pd.concat([features_df.reset_index(drop=True), tfidf_df], axis=1)

print(f"✅ Created text-based features:")
print(f"   • {len(top_genres)} genre binary features")
print(f"   • {len(top_actors)} actor binary features")
print(f"   • 100 TF-IDF features from combined text")
print(f"\nCurrent feature count: {len(features_df.columns)}")

🔧 FEATURE ENGINEERING STEP 2: Text-Based Features
Top 15 genres: ['Drama', 'Comedy', 'Thriller', 'Action', 'Adventure', 'Crime', 'Romance', 'Science Fiction', 'Horror', 'Fantasy', 'Family', 'Mystery', 'Animation', 'History', 'Music']

Creating TF-IDF features for content similarity...
✅ Created text-based features:
   • 15 genre binary features
   • 20 actor binary features
   • 100 TF-IDF features from combined text

Current feature count: 165


In [5]:
print("🔧 FEATURE ENGINEERING STEP 3: Advanced Features")
print("="*45)

# 1. Language features
# Create binary features for top languages
top_languages = features_df['original_language'].value_counts().head(10).index.tolist()
for lang in top_languages:
    features_df[f'lang_{lang}'] = (features_df['original_language'] == lang).astype(int)

# 2. Production company features
all_companies = []
for company_list in features_df['production_companies_names']:
    if isinstance(company_list, list):
        all_companies.extend(company_list[:2])  # Top 2 companies
    elif isinstance(company_list, str):
        try:
            company_list = ast.literal_eval(company_list)
            all_companies.extend(company_list[:2])
        except:
            pass

top_companies = pd.Series(all_companies).value_counts().head(15).index.tolist()
for company in top_companies:
    features_df[f'company_{company.lower().replace(" ", "_")}'] = features_df['production_companies_names'].apply(
        lambda x: 1 if isinstance(x, list) and company in x else 0
    )

# 3. Seasonal features (release month)
features_df['release_month'] = pd.to_datetime(features_df['release_date']).dt.month
features_df['is_summer_release'] = ((features_df['release_month'] >= 6) & (features_df['release_month'] <= 8)).astype(int)
features_df['is_holiday_release'] = (features_df['release_month'].isin([11, 12])).astype(int)

# 4. Rating categories
features_df['rating_category'] = pd.cut(features_df['vote_average'], 
                                      bins=[0, 5, 6.5, 8, 10], 
                                      labels=['Low', 'Medium', 'High', 'Excellent'])

# 5. Popularity tiers
features_df['popularity_tier'] = pd.qcut(features_df['popularity'], 
                                       q=5, 
                                       labels=['Very_Low', 'Low', 'Medium', 'High', 'Very_High'])

print(f"✅ Created advanced features:")
print(f"   • {len(top_languages)} language binary features")
print(f"   • {len(top_companies)} production company features")
print(f"   • Seasonal release features")
print(f"   • Rating and popularity categories")

print(f"\nTotal features now: {len(features_df.columns)}")

# Show sample of new features
print(f"\n📊 Sample advanced features:")
advanced_cols = ['title', 'lang_en', 'is_summer_release', 'is_holiday_release', 'rating_category', 'popularity_tier']
print(features_df[advanced_cols].head())

🔧 FEATURE ENGINEERING STEP 3: Advanced Features
✅ Created advanced features:
   • 10 language binary features
   • 15 production company features
   • Seasonal release features
   • Rating and popularity categories

Total features now: 195

📊 Sample advanced features:
                                      title  lang_en  is_summer_release  \
0                                    Avatar        1                  0   
1  Pirates of the Caribbean: At World's End        1                  0   
2                                   Spectre        1                  0   
3                     The Dark Knight Rises        1                  1   
4                               John Carter        1                  0   

   is_holiday_release rating_category popularity_tier  
0                   1            High       Very_High  
1                   0            High       Very_High  
2                   0          Medium       Very_High  
3                   0            High       Very_High  


In [6]:
print("🔧 FEATURE ENGINEERING STEP 4: Movie Similarity Features")
print("="*55)

# Calculate cosine similarity matrix using TF-IDF features
print("Computing movie similarity matrix...")
tfidf_columns = [col for col in features_df.columns if col.startswith('tfidf_')]
tfidf_data = features_df[tfidf_columns]

# Calculate cosine similarity
similarity_matrix = cosine_similarity(tfidf_data)

# For each movie, get top 10 most similar movies
print("Finding top similar movies for each movie...")

similar_movies_features = []
for i, movie_id in enumerate(features_df['id']):
    # Get similarity scores for this movie
    sim_scores = list(enumerate(similarity_matrix[i]))
    # Sort by similarity (excluding self)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:11]  # Top 10 similar
    
    # Extract features from similar movies
    similar_indices = [x[0] for x in sim_scores]
    similar_scores = [x[1] for x in sim_scores]
    
    # Average features of top 5 similar movies
    if similar_indices:
        similar_ratings = features_df.iloc[similar_indices[:5]]['vote_average'].mean()
        similar_popularity = features_df.iloc[similar_indices[:5]]['popularity'].mean()
        avg_similarity = np.mean(similar_scores[:5])
    else:
        similar_ratings = features_df['vote_average'].mean()
        similar_popularity = features_df['popularity'].mean()
        avg_similarity = 0
    
    similar_movies_features.append({
        'avg_similar_rating': similar_ratings,
        'avg_similar_popularity': similar_popularity,
        'avg_similarity_score': avg_similarity
    })

# Add similarity features to main dataframe
similarity_df = pd.DataFrame(similar_movies_features)
features_df = pd.concat([features_df, similarity_df], axis=1)

print(f"✅ Created similarity features:")
print(f"   • Average rating of similar movies")
print(f"   • Average popularity of similar movies")
print(f"   • Average similarity score")

print(f"\n📊 Sample similarity features:")
sim_cols = ['title', 'vote_average', 'avg_similar_rating', 'popularity', 'avg_similar_popularity', 'avg_similarity_score']
print(features_df[sim_cols].head())

🔧 FEATURE ENGINEERING STEP 4: Movie Similarity Features
Computing movie similarity matrix...
Finding top similar movies for each movie...
✅ Created similarity features:
   • Average rating of similar movies
   • Average popularity of similar movies
   • Average similarity score

📊 Sample similarity features:
                                      title  vote_average  avg_similar_rating  \
0                                    Avatar           7.2                5.38   
1  Pirates of the Caribbean: At World's End           6.9                6.52   
2                                   Spectre           6.3                6.34   
3                     The Dark Knight Rises           7.6                6.44   
4                               John Carter           6.1                6.62   

   popularity  avg_similar_popularity  avg_similarity_score  
0  150.437577               27.169152              0.825195  
1  139.082615              127.572379              0.677725  
2  107.376788    

In [7]:
print("🔧 FEATURE ENGINEERING STEP 5: Feature Scaling and Final Preparation")
print("="*65)

# Select numerical features for scaling
numerical_features = [
    'popularity_score', 'weighted_rating', 'budget_millions', 'revenue_millions', 
    'profit_millions', 'roi', 'movie_age', 'release_month', 'vote_count',
    'avg_similar_rating', 'avg_similar_popularity', 'avg_similarity_score'
]

# Scale numerical features
scaler = StandardScaler()
features_df[numerical_features] = scaler.fit_transform(features_df[numerical_features])

# Create categorical feature encodings
le_runtime = LabelEncoder()
features_df['runtime_category_encoded'] = le_runtime.fit_transform(features_df['runtime_category'].astype(str))

le_rating = LabelEncoder()
features_df['rating_category_encoded'] = le_rating.fit_transform(features_df['rating_category'].astype(str))

le_popularity = LabelEncoder()
features_df['popularity_tier_encoded'] = le_popularity.fit_transform(features_df['popularity_tier'].astype(str))

print(f"✅ Applied feature scaling and encoding:")
print(f"   • Scaled {len(numerical_features)} numerical features")
print(f"   • Encoded 3 categorical features")

# Prepare final feature set for modeling
print("\n🎯 PREPARING FINAL FEATURE SET FOR RECOMMENDER MODEL")
print("="*55)

# Select features for recommendation model
feature_columns = (
    numerical_features + 
    [col for col in features_df.columns if col.startswith('genre_')] +
    [col for col in features_df.columns if col.startswith('actor_')] +
    [col for col in features_df.columns if col.startswith('lang_')] +
    [col for col in features_df.columns if col.startswith('company_')] +
    [col for col in features_df.columns if col.startswith('tfidf_')] +
    ['runtime_category_encoded', 'rating_category_encoded', 'popularity_tier_encoded',
     'is_summer_release', 'is_holiday_release']
)

# Create final feature matrix
final_features = features_df[['id', 'title'] + feature_columns].copy()

print(f"📊 FINAL FEATURE SET SUMMARY:")
print(f"Total features: {len(feature_columns)}")
print(f"   • Numerical features: {len(numerical_features)}")
print(f"   • Genre features: {len([col for col in feature_columns if col.startswith('genre_')])}")
print(f"   • Actor features: {len([col for col in feature_columns if col.startswith('actor_')])}")
print(f"   • Language features: {len([col for col in feature_columns if col.startswith('lang_')])}")
print(f"   • Company features: {len([col for col in feature_columns if col.startswith('company_')])}")
print(f"   • TF-IDF features: {len([col for col in feature_columns if col.startswith('tfidf_')])}")
print(f"   • Other features: {len([col for col in feature_columns if not any(col.startswith(prefix) for prefix in ['genre_', 'actor_', 'lang_', 'company_', 'tfidf_'])])}")

print(f"\nDataset shape: {final_features.shape}")
print(f"Memory usage: {final_features.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

🔧 FEATURE ENGINEERING STEP 5: Feature Scaling and Final Preparation
✅ Applied feature scaling and encoding:
   • Scaled 12 numerical features
   • Encoded 3 categorical features

🎯 PREPARING FINAL FEATURE SET FOR RECOMMENDER MODEL
📊 FINAL FEATURE SET SUMMARY:
Total features: 177
   • Numerical features: 12
   • Genre features: 15
   • Actor features: 20
   • Language features: 10
   • Company features: 15
   • TF-IDF features: 100
   • Other features: 17

Dataset shape: (3022, 179)
Memory usage: 4.29 MB


In [8]:
print("💾 SAVING ENGINEERED FEATURES")
print("="*30)

# Save the complete engineered features dataset
final_features.to_csv('data/dataset/movie_features_engineered.csv', index=False)
print("✅ Saved complete feature set: 'data/dataset/movie_features_engineered.csv'")

# Save just the feature matrix (without id and title) for ML models
X_features = final_features[feature_columns]
X_features.to_csv('data/dataset/movie_features_matrix.csv', index=False)
print("✅ Saved feature matrix: 'data/dataset/movie_features_matrix.csv'")

# Save feature names for reference
feature_info = {
    'feature_names': feature_columns,
    'numerical_features': numerical_features,
    'total_features': len(feature_columns)
}

import json
with open('data/dataset/feature_info.json', 'w') as f:
    json.dump(feature_info, f, indent=2)
print("✅ Saved feature info: 'data/dataset/feature_info.json'")

# Save similarity matrix for content-based recommendations
np.save('data/dataset/similarity_matrix.npy', similarity_matrix)
print("✅ Saved similarity matrix: 'data/dataset/similarity_matrix.npy'")

print(f"\n🎬 FEATURE ENGINEERING COMPLETED!")
print(f"📈 Your movie dataset now has {len(feature_columns)} engineered features ready for:")
print(f"   • Content-based filtering")
print(f"   • Collaborative filtering")
print(f"   • Hybrid recommendation systems")
print(f"   • Machine learning models")

print(f"\n📋 FEATURE BREAKDOWN:")
for i, feature in enumerate(feature_columns[:10], 1):
    print(f"  {i:2d}. {feature}")
if len(feature_columns) > 10:
    print(f"  ... and {len(feature_columns) - 10} more features")

print(f"\n🚀 Ready to build your movie recommender system!")

💾 SAVING ENGINEERED FEATURES
✅ Saved complete feature set: 'data/dataset/movie_features_engineered.csv'
✅ Saved feature matrix: 'data/dataset/movie_features_matrix.csv'
✅ Saved feature info: 'data/dataset/feature_info.json'
✅ Saved similarity matrix: 'data/dataset/similarity_matrix.npy'

🎬 FEATURE ENGINEERING COMPLETED!
📈 Your movie dataset now has 177 engineered features ready for:
   • Content-based filtering
   • Collaborative filtering
   • Hybrid recommendation systems
   • Machine learning models

📋 FEATURE BREAKDOWN:
   1. popularity_score
   2. weighted_rating
   3. budget_millions
   4. revenue_millions
   5. profit_millions
   6. roi
   7. movie_age
   8. release_month
   9. vote_count
  10. avg_similar_rating
  ... and 167 more features

🚀 Ready to build your movie recommender system!
