In [None]:
# ======================================
# 1️⃣ Import Libraries
# ======================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read CSV
# NOTE(review): `filename` is not defined in the visible code; it must already
# exist in the session before this cell runs.
df = pd.read_csv(filename, lineterminator='\n')

# ======================================
# 2️⃣ Data Cleaning
# ======================================
required_columns = ['Genre', 'Popularity', 'Vote_Average', 'Release_Date']

# Coerce the numeric columns up front: any value that is not a number becomes
# NaN and is removed by the dropna below, instead of reaching model fitting as
# text.
for column in ('Popularity', 'Vote_Average'):
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Parse release dates; unparseable values become NaT.
df['Release_Date'] = pd.to_datetime(df['Release_Date'], errors='coerce')

# A single pass drops rows that are missing, or failed to parse, any required
# column (this also covers rows whose date could not be parsed).
df.dropna(subset=required_columns, inplace=True)

# Extract the year only after the NaT rows are gone so it can be stored as an
# integer (2021) rather than a float (2021.0).
df['Release_Year'] = df['Release_Date'].dt.year.astype(int)

# Simplify genres (take first genre only for now)
df['Main_Genre'] = df['Genre'].astype(str).str.split(',').str[0].str.strip()

# ======================================
# 3️⃣ Encode Features
# ======================================
from sklearn.preprocessing import LabelEncoder

# Learn the genre vocabulary first, then map each row's primary genre to its
# integer code. `le` is kept so genre names can be translated again later.
le = LabelEncoder()
le.fit(df['Main_Genre'])
df['Genre_Code'] = le.transform(df['Main_Genre'])

# Column to predict and the columns used as model inputs.
target = 'Vote_Average'
features = ['Popularity', 'Release_Year', 'Genre_Code']

y = df[target]
X = df[features]

# ======================================
# 4️⃣ Train-Test Split
# ======================================
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ======================================
# 5️⃣ Train ML Models
# ======================================
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Candidate regressors, keyed by the display name used in reports and plots.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
}

# Fit every candidate on the training rows and score it on the held-out rows.
# results maps model name -> {"MSE": ..., "R2": ...}.
results = {}
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    results[name] = {
        "MSE": mean_squared_error(y_test, y_pred),
        "R2": r2_score(y_test, y_pred),
    }

# ======================================
# 6️⃣ Compare Model Performance
# ======================================
print("\n📊 Model Performance:")
for name, score in results.items():
    print(f"{name}: MSE={score['MSE']:.3f}, R2={score['R2']:.3f}")

# Visualization of model comparison: one bar chart per metric.
# Bar colours are drawn from the seaborn palette explicitly and handed to
# matplotlib, because passing `palette=` to sns.barplot without `hue` is
# deprecated in seaborn 0.13 and scheduled for removal.
model_names = list(results)
comparison_charts = [
    # (metric key, palette name, chart title, y-axis label)
    ("R2", "Blues_d", "R² Score Comparison Across Models", "R² Score"),
    ("MSE", "Reds_d", "MSE Comparison Across Models", "Mean Squared Error"),
]

for metric, palette_name, chart_title, y_label in comparison_charts:
    plt.figure(figsize=(8, 4))
    plt.bar(
        model_names,
        [results[model_name][metric] for model_name in model_names],
        color=sns.color_palette(palette_name, n_colors=len(model_names)),
    )
    plt.title(chart_title)
    plt.ylabel(y_label)
    plt.xlabel("Model")
    plt.show()

# ======================================
# 7️⃣ Select Best Model
# ======================================
# Rank the candidates by their held-out R² and keep the highest-scoring one;
# `best_model` is the already-fitted estimator object from `models`.
r2_by_model = {model_name: metrics['R2'] for model_name, metrics in results.items()}
best_model_name = max(r2_by_model, key=r2_by_model.get)
best_model = models[best_model_name]
print(f"\n✅ Best Model: {best_model_name}")

# ======================================
# 8️⃣ Recommendation Function
# ======================================
def recommend_movies(preferred_genre, top_k=10):
    """
    Recommend the top predicted movies for a given genre.

    Relies on module-level state built earlier in the file: the cleaned
    DataFrame ``df``, the fitted label encoder ``le``, the ``features`` column
    list and the fitted ``best_model``.

    Parameters
    ----------
    preferred_genre : str
        Genre name. An exact match against ``le.classes_`` is tried first;
        otherwise the name is matched ignoring case and surrounding whitespace.
    top_k : int, default 10
        Maximum number of movies to return. Non-positive values yield an
        empty result.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_k`` rows sorted by predicted rating, then popularity
        (both descending), with the columns Title, Main_Genre, Popularity,
        Release_Year, Vote_Average and Predicted_Rating. The frame is empty
        (same columns) when the genre is unknown or has no movies.
    """
    output_columns = [
        'Title', 'Main_Genre', 'Popularity',
        'Release_Year', 'Vote_Average', 'Predicted_Rating',
    ]

    # Prefer an exact match; fall back to a case/whitespace-insensitive lookup
    # so interactive input such as "action" still finds "Action".
    genre = preferred_genre
    if genre not in le.classes_:
        by_folded_name = {str(label).casefold(): label for label in le.classes_}
        genre = by_folded_name.get(str(preferred_genre).strip().casefold())

    if genre is None:
        print("⚠️ Genre not found. Try another.")
        return pd.DataFrame(columns=output_columns)

    genre_code = le.transform([genre])[0]
    df_genre = df[df['Genre_Code'] == genre_code].copy()
    if df_genre.empty:
        # Nothing to score; avoid calling predict() on zero rows.
        return pd.DataFrame(columns=output_columns)

    # Scores every movie of the genre currently in df.
    df_genre['Predicted_Rating'] = best_model.predict(df_genre[features])

    # Clamp top_k: a negative value passed to head() would mean
    # "all but the last n" rather than "no rows".
    df_rec = df_genre.sort_values(
        ['Predicted_Rating', 'Popularity'], ascending=False
    ).head(max(top_k, 0))
    return df_rec[output_columns]

# ======================================
# 9️⃣ Show Recommendations
# ======================================
print("\n🎬 Available Genres:")
print(list(le.classes_))

user_genre = input("\nEnter a genre to get recommendations: ").strip()
recs = recommend_movies(user_genre, top_k=10)

if not recs.empty:
    # Report the number of rows actually returned: a genre can hold fewer
    # movies than were requested.
    print(f"\n🎥 Top {len(recs)} Recommended Movies for '{user_genre}':")
    # `display` is supplied by the IPython/Jupyter environment.
    display(recs)

    # ======================================
    # 🔟 Visualize Recommendation Insights
    # ======================================

    # 1️⃣ Predicted vs Actual Ratings
    # Marker size encodes popularity, marker colour encodes release year.
    plt.figure(figsize=(8, 5))
    sns.scatterplot(
        x=recs['Vote_Average'], y=recs['Predicted_Rating'],
        size=recs['Popularity'], hue=recs['Release_Year'], palette="coolwarm", sizes=(40, 200)
    )
    plt.title(f"Actual vs Predicted Ratings ({user_genre})")
    plt.xlabel("Actual Rating")
    plt.ylabel("Predicted Rating")
    plt.legend(title="Year", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

    # 2️⃣ Predicted rating of each recommended title
    # Drawn with matplotlib and an explicit colour list: passing `palette=`
    # to sns.barplot without `hue` is deprecated in seaborn 0.13. One bar per
    # row also keeps movies that share a title as separate bars.
    plt.figure(figsize=(8, 5))
    bar_positions = np.arange(len(recs))
    plt.barh(
        bar_positions, recs['Predicted_Rating'],
        color=sns.color_palette('Blues_r', n_colors=len(recs)),
    )
    plt.yticks(bar_positions, recs['Title'].tolist())
    plt.gca().invert_yaxis()  # keep the first (best-ranked) row at the top
    plt.title(f"Top {len(recs)} {user_genre} Movies (Predicted Rating)")
    plt.xlabel("Predicted Rating")
    plt.ylabel("Movie Title")
    plt.show()

else:
    print("No recommendations available for that genre.")
