In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.decomposition import PCA

# Load the FRED macro series and the movie table from the working directory.
fred = pd.read_csv("FRED_Data_all.csv")
df = pd.read_csv('movie_only.csv')

# Merge the two tables on calendar year.
# Unparseable release dates are coerced to NaT, so their `year` key is missing.
# NOTE(review): if FRED_Data_all.csv holds more than one observation per year,
# this left join repeats every movie once per observation -- confirm the file
# is annual (or aggregate it per year) before relying on row counts.
df['year'] = pd.to_datetime(df['release_date'], errors='coerce').dt.year
fred['year'] = pd.to_datetime(fred['observation_date']).dt.year
df = pd.merge(df, fred, on='year', how='left')

# `Year` (capitalised) is not created in this script; it is expected to be a
# column of one of the input files.
df = df[df['Year'] >= 1960]  # FRED doesn't have data older than 1960
df = df[df['budget'] > 0]    # keep only rows with a positive budget

# `year` was only the join key. DataFrame.drop returns a new frame, so the
# result has to be assigned back for the column to actually disappear.
df = df.drop(columns="year")


# Show, per column, whether any missing values remain.
print(df.isna().any())

# Fill missing runtimes with the mean runtime.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column selection is a chained update that pandas warns about and that
# does not modify `df` when Copy-on-Write is enabled.
df = df.assign(runtime=df['runtime'].fillna(df['runtime'].mean()))

# Genres: keep only movies that have a genre string, tokenise it, and append
# one indicator column per token.
df = df[df['genres'].notna()].copy()

# Whitespace split: every space-separated word becomes its own label.
df['genres'] = df['genres'].astype(str).str.split()

mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['genres'])
genre_dummies = pd.DataFrame(genre_matrix, index=df.index, columns=mlb.classes_)

# Same index on both frames, so the indicator columns line up row by row.
df = pd.concat([df, genre_dummies], axis=1)

# Mean revenue of each director's earlier films.
# Rows are ordered by director, then Year and Month, so that "earlier" follows
# release order within each director.
df = df.sort_values(["director", "Year", "Month"])

# shift() removes the current film from its own average; expanding().mean()
# then averages everything that came before it.
# transform() returns a Series carrying df's own index, so the assignment
# aligns row by row without depending on how groupby.apply labels its output.
# Rows with no earlier film get 0; rows whose director is missing are
# presumably left out of the grouping and therefore also end up as 0.
df["director_past_avg_rev"] = (
    df.groupby("director")["revenue"]
      .transform(lambda s: s.shift().expanding().mean())
      .fillna(0)
)

# -------------- mpnet embeddings ------------------------------------------------------

# Sentence-embedding model used to encode the plot overviews.
mpnet = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Movies without an overview are encoded as the empty string.
df["overview"] = df["overview"].fillna("")
overviews = df["overview"].to_list()

# One embedding per overview, in row order; expected shape (n_samples, 768).
encode_options = {"batch_size": 32, "show_progress_bar": True}
overview_embs = mpnet.encode(overviews, **encode_options)

# Reduce the embeddings to 50 principal components (seeded for repeatability).
pca = PCA(n_components=50, random_state=42)
overview_pca = pca.fit_transform(overview_embs)  # expected shape (n_samples, 50)

# Wrap the components in a frame that shares df's index so it can be
# concatenated column-wise with the other features.
n_pca_cols = overview_pca.shape[1]
pca_cols = [f"overview_pca_{i}" for i in range(n_pca_cols)]
overview_pca_df = pd.DataFrame(overview_pca, index=df.index, columns=pca_cols)
# ----------------------------------------------------------

# Columns carried into the exported table, listed in output order:
# numeric / macro features, genre indicator columns, then revenue.
numeric_cols = [
    'budget', 'runtime', 'Year', 'Month',
    'GDP', 'INFLATION', 'INTEREST_RATE',
    'director_past_avg_rev',
]
genre_cols = [
    'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary',
    'Drama', 'Family', 'Fantasy', 'Fiction', 'Foreign', 'History', 'Horror',
    'Movie', 'Music', 'Mystery', 'Romance', 'Science', 'TV', 'Thriller',
    'War', 'Western',
]
base_features = df[numeric_cols + genre_cols + ['revenue']]

# Final dataset: selected features side by side with the overview PCA columns
# (aligned on the shared index), written without the index column.
final_df = pd.concat([base_features, overview_pca_df], axis=1)
final_df.to_csv('merged_data.csv', index=False)

# Notebook preview of the working frame.
df.head()