# Feature Engineering

In [None]:
import ast
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cpi
from fredapi import Fred
import time
from scipy import stats

from utilites import column_stats

In [None]:
# Load the base table. low_memory=False makes pandas parse the file in one pass
# instead of chunk-by-chunk, which avoids mixed-dtype inference within a column.
df = pd.read_csv("final_df_with_emotion_scores.csv", low_memory=False)
display(column_stats(df))

In [None]:
# Load the second dataset, whose Team_*/Cast_* columns are merged into df below,
# then print its column names and display its column_stats output.
df_to_merge = pd.read_csv("final_dataset_cam.csv")
print(df_to_merge.columns.to_list())
display(column_stats(df_to_merge))

In [None]:
def standardize_titles(df, column="title"):
    """Normalise the text in ``df[column]`` in place and return ``df``.

    Each string is lower-cased and stripped, every character other than
    ``a-z``, ``0-9`` and whitespace is removed, and runs of whitespace are
    collapsed to a single space. Non-string cells (e.g. NaN) are left as-is.

    Args:
        df: DataFrame holding the column to clean; it is modified in place.
        column: Name of the text column to normalise.

    Returns:
        The same DataFrame object that was passed in.
    """
    def _clean(value):
        # Missing values come through as non-strings; pass them on untouched.
        if not isinstance(value, str):
            return value
        alnum_only = re.sub(r'[^a-z0-9\s]', '', value)
        # split()/join collapses any whitespace run left behind by the removal.
        return ' '.join(alnum_only.split())

    lowered = df[column].str.lower().str.strip()
    df[column] = lowered.apply(_clean)
    return df

df_to_merge = standardize_titles(df_to_merge)

# Team/cast aggregate columns brought over from df_to_merge, joined on title.
team_cast_columns = [
    'Team_Review_Score_sum', 'Team_Review_Score_mean', 'Team_Audience_Score_Mean',
    'Team_Tomato_Meter_Mean', 'Team_Num_reviews',
    'Cast_Review_Score_sum', 'Cast_Review_Score_mean', 'Cast_Audience_Score_Mean',
    'Cast_Tomato_Meter_Mean', 'Cast_Num_reviews',
    'Cast_creationDate', 'Cast_audienceScore', 'Cast_tomatoMeter', 'Cast_Cluster_Label',
    'Team_creationDate', 'Team_audienceScore', 'Team_tomatoMeter', 'Team_Cluster Label',
]

# Title is the only join key, so keep a single row per standardized title.
# Otherwise the left merge would emit one output row per matching right-hand
# row, multiplying every df row whose title occurs more than once in
# df_to_merge (e.g. different films that normalise to the same string).
team_cast_features = df_to_merge[['title'] + team_cast_columns].drop_duplicates(
    subset='title', keep='first'
)

# validate="many_to_one" makes pandas raise MergeError rather than silently
# duplicating rows if the right-hand side is ever not unique per title.
df = df.merge(team_cast_features, on='title', how='left', validate='many_to_one')

# Display column stats
display(column_stats(df))

In [None]:
# Show column_stats for df_to_merge again, now that its titles have been standardized.
display(column_stats(df_to_merge))

### Create CPI Feature

In [None]:
# fred = Fred(api_key='<YOUR-API-KEY-HERE>')
#
# cpi_series = fred.get_series('CPIAUCSL', observation_start='1977-01-01', observation_end='2024-12-31')
#
# cpi_df = pd.DataFrame(cpi_series)
# cpi_df.columns = ['CPI']
#
# cpi_df.to_csv('external/cpi_data.csv')

# Read the saved CPI series. The file's own header row is skipped and replaced
# with explicit column names.
cpi_df = pd.read_csv(
    "external/cpi_data.csv",
    skiprows=1,
    names=["date", "cpi"],
)
cpi_df["date"] = pd.to_datetime(cpi_df["date"])
cpi_df = cpi_df.set_index("date")

# Average all observations falling in the same calendar year ("YE" = year end
# bins), then index the result by the plain integer year so it can be joined
# against a year column.
cpi_yearly_df = cpi_df.resample("YE").mean()
cpi_yearly_df.index = cpi_yearly_df.index.year.rename("year")
display(cpi_yearly_df)

In [None]:
# Attach the yearly CPI to every row by matching release_year against the year
# index of cpi_yearly_df. With how="left", rows without a matching year are kept
# and get NaN for the CPI column.
df = df.merge(cpi_yearly_df, left_on="release_year", right_index=True, how="left")
display(df)

### Create Box Office Gross Feature

In [None]:
# Load the external box-office revenue table and index it by its "year" column
# so it can be joined against release_year.
box_office_gross_df = pd.read_csv("external/box_office_revenue.csv", low_memory=False)
box_office_gross_df.set_index("year", inplace=True)
display(box_office_gross_df)

In [None]:
# Attach the columns of box_office_gross_df to every row by matching
# release_year against its year index; unmatched rows are kept with NaN.
df = df.merge(box_office_gross_df, left_on="release_year", right_index=True, how="left")
display(df)

### Create Box Office Adjusted Feature

In [None]:
# Reference values for the most recent year, used to rescale every movie's
# revenue. Both are read from the per-year lookup tables.
latest_cpi = cpi_yearly_df["cpi"].iloc[-1]

# df has one row per movie, so df["box_office_gross"].iloc[-1] would be the
# gross of whichever movie happens to be last in df: an arbitrary year, or NaN,
# which would turn the whole adjusted column into NaN. Use the latest year of
# the yearly table instead (sorted explicitly; missing years skipped).
latest_gross = (
    box_office_gross_df["box_office_gross"].dropna().sort_index().iloc[-1]
)

# Scale each movie's revenue by the CPI ratio and by the ratio of total
# box-office gross between the latest year and the movie's release year.
df["box_office_adjusted"] = (
    df["box_office"]
    * (latest_cpi / df["cpi"])
    * (latest_gross / df["box_office_gross"])
)

display(df)

## Encode and Impute Rating

In [None]:
# Show the distinct rating values present before they are encoded.
print(df["rating"].unique())

# Integer code assigned to each known rating label.
_RATING_CODES = {
    "pg": 1,
    "tvpg": 2,
    "pg-13": 3,
    "nc-17": 4,
    "tvma": 5,
    "r": 6,
}


def encode_rating(rating):
    """Return the integer code for a rating label; missing ratings map to 0.

    Args:
        rating: A rating label such as ``"pg-13"``, or a missing value
            (NaN / None / pd.NA).

    Returns:
        0 for a missing rating, otherwise the label's code from the table.

    Raises:
        KeyError: If ``rating`` is a non-missing value with no entry in the
            table.
    """
    # NaN never compares equal to itself, so a NaN dict key only matches the
    # very same float object. Test for missingness explicitly so that any NaN
    # instance (and None) is treated as "no rating".
    if pd.isna(rating):
        return 0
    return _RATING_CODES[rating]

# Replace each rating label with its integer code.
df["rating"] = df["rating"].apply(encode_rating)

### Encode Distributor and Other Comma-Separated Columns

In [None]:
from collections import Counter
import pandas as pd

def process_comma_separated_column(df, column_name, top_n=10, prefix=None):
    """Replace a comma-separated text column with boolean indicator columns.

    The ``top_n`` most frequent values found in ``df[column_name]`` each get a
    boolean column that is True where the row's cell contains that value. The
    original column is then dropped. ``df`` is modified in place.

    Args:
        df: DataFrame to modify in place.
        column_name: Column whose cells hold comma-separated values. Cells that
            are not strings (e.g. NaN) contribute no values and are False in
            every indicator column.
        top_n: How many of the most frequent values get an indicator column.
        prefix: Optional string prepended to each new column name. Use it when
            two source columns can share values, since otherwise the later
            call overwrites the earlier call's columns. With the default
            ``None`` the bare value is used as the column name.

    Returns:
        None.
    """
    # Parse every cell exactly once. Empty tokens (from "a,,b" or a trailing
    # comma) are skipped so they can never become a column named "".
    parsed_rows = [
        [token for token in (part.strip() for part in cell.split(',')) if token]
        if isinstance(cell, str)
        else []
        for cell in df[column_name]
    ]

    value_counts = Counter()
    for tokens in parsed_rows:
        value_counts.update(tokens)

    # Sets give O(1) membership tests when filling the indicator columns.
    row_sets = [set(tokens) for tokens in parsed_rows]

    # Iterate most_common() directly: it is an ordered list, so the new columns
    # are created in the same (frequency) order on every run. Going through a
    # set would make the column order depend on string hashing.
    for value, _ in value_counts.most_common(top_n):
        new_column = value if prefix is None else f"{prefix}{value}"
        df[new_column] = [value in tokens for tokens in row_sets]

    df.drop(columns=[column_name], inplace=True)

# (column, number of most frequent values to keep) for each comma-separated
# column that is expanded into boolean indicator columns, in processing order.
for multi_value_column, keep_top in [
    ('distributor', 10),
    ('sound_mix', 5),
    ('rating_contents', 10),
    ('genre', 10),
    ('Cast_Cluster_Label', 10),
    ('Team_Cluster Label', 10),
]:
    process_comma_separated_column(df, multi_value_column, top_n=keep_top)


### Encode Belongs to Collection

In [None]:
# Flag rows whose belongs_to_collection cell is not missing.
df["is_in_collection"] = ~df["belongs_to_collection"].isna()

In [None]:
def safe_eval(value):
    """Parse a string holding a dict literal; return None for anything else.

    Args:
        value: A dict (returned unchanged), a string containing a Python
            literal, or any other object (NaN, None, numbers, lists, ...).

    Returns:
        The dict when ``value`` is one or parses to one, otherwise None.
        Unparseable strings and literals that are not dicts also give None.
    """
    if isinstance(value, dict):  # Already parsed, nothing to do
        return value
    # Everything that is not text (NaN, None, numbers, lists, ...) has nothing
    # to parse. Checking the type first also avoids pd.isna(), which returns an
    # array for list-like input and cannot be used as an `if` condition.
    if not isinstance(value, str):
        return None
    try:
        parsed_value = ast.literal_eval(value)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # The full set of errors literal_eval is documented to raise on
        # malformed or overly complex input.
        return None
    return parsed_value if isinstance(parsed_value, dict) else None

# Parse the stringified collection dicts; anything missing or unparseable becomes None.
df["belongs_to_collection"] = df["belongs_to_collection"].apply(safe_eval)

# Pull the collection id out of each parsed dict (None where there is no dict).
df["collection_id"] = df["belongs_to_collection"].apply(
    lambda collection: collection["id"] if isinstance(collection, dict) else None
)

# The parsed dicts are not used past this point.
del df["belongs_to_collection"]

# Mean box office per collection. Rows without a collection id have a missing
# key and are left out of the grouping.
# NOTE(review): each collection's mean includes the row's own box_office, and
# this column is later kept as a model feature while box_office is the basis of
# the target. Confirm this leakage is acceptable.
collection_avg_box_office = df.groupby("collection_id")["box_office"].mean()

# Look up each row's collection mean; rows with no collection get 0.
df["collection_box_office_average"] = (
    df["collection_id"].map(collection_avg_box_office).fillna(0)
)

# Use 0 as the "no collection" id as well. This is done last so the grouping
# above still sees these rows as missing.
df["collection_id"] = df["collection_id"].fillna(0)

In [None]:
# Remove the columns that are not carried into the exported dataset.
df.drop(
    columns=[
        "title",
        "director",
        "writer",
        "Dominant_Emotion",
        "id_x",
        "Cast_creationDate",
        "Team_creationDate",
        "cast",
        "wiki_page",
        "plot",
        "id",
        "release_year",
        "audience_score",
        "tomato_meter",
    ],
    inplace=True,
)

In [None]:
# Keep only the rows that have a budget value.
df.dropna(subset=["budget"], inplace=True)

In [None]:
# Lift the column limit and allow up to 100 rows so the column_stats output
# below is not truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

display(column_stats(df))

In [None]:
# Write the engineered dataset one directory up, without the index column.
df.to_csv("../complete_data.csv", index=False)

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error

# Target: log1p of the adjusted revenue. Features: every remaining column
# except the raw and adjusted revenue themselves.
y = np.log1p(df["box_office_adjusted"])
X = df.drop(columns=["box_office", "box_office_adjusted"])

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise features using statistics from the training split only, then
# apply the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# XGBoost regressor configuration.
params = dict(
    objective="reg:squarederror",
    eval_metric="rmse",
    max_depth=16,
    learning_rate=0.1,
    n_estimators=1000,
)

model = xgb.XGBRegressor(**params)
model.fit(X_train, y_train)

# Predictions are in log1p space; expm1 brings them back to revenue units.
y_pred = np.expm1(model.predict(X_test))

# Root mean squared error, measured in revenue units.
rmse = np.sqrt(mean_squared_error(np.expm1(y_test), y_pred))

# Mean absolute percentage error (can be unstable for small true values).
mape = mean_absolute_percentage_error(np.expm1(y_test), y_pred)
# SMAPE Calculation
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each sample contributes ``2 * |pred - true| / (|true| + |pred|)``. A sample
    where both values are zero is a perfect prediction and contributes 0
    rather than the NaN that 0/0 would produce.

    Args:
        y_true: Array-like of true values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        The mean of the per-sample terms multiplied by 100.
    """
    # Convert up front so lists and pandas Series are handled the same way.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    abs_error = 2 * np.abs(predicted - actual)
    scale = np.abs(actual) + np.abs(predicted)

    # Divide by 1 where scale is 0 (avoids the 0/0 warning) and overwrite those
    # positions with 0.
    both_zero = scale == 0
    ratio = np.where(both_zero, 0.0, abs_error / np.where(both_zero, 1.0, scale))
    return 100 * np.mean(ratio)

# Compute SMAPE on the back-transformed (expm1) test targets, then report all
# three metrics.
smape_score = smape(np.expm1(y_test), y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"MAPE: {mape:.4%}")
print(f"SMAPE: {smape_score:.2f}%")
