# Feature Engineering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cpi
from fredapi import Fred
import time
from scipy import stats

from utilites import column_stats

In [None]:
# Load the dataset produced by the earlier pipeline stage. low_memory=False
# makes pandas parse the file in a single pass so each column's dtype is
# inferred from the whole column instead of chunk by chunk.
df = pd.read_csv("final_data.csv", low_memory=False)
# Show the output of the project helper column_stats for the freshly loaded frame.
display(column_stats(df))

### Create CPI Feature

In [None]:
# fred = Fred(api_key='<YOUR-API-KEY-HERE>')
#
# cpi_series = fred.get_series('CPIAUCSL', observation_start='1977-01-01', observation_end='2024-12-31')
#
# cpi_df = pd.DataFrame(cpi_series)
# cpi_df.columns = ['CPI']
#
# cpi_df.to_csv('external/cpi_data.csv')

# Read the cached CPI export, replacing its header row with our own column
# names, and index the observations by their timestamp.
cpi_df = (
    pd.read_csv("external/cpi_data.csv", names=["date", "cpi"], skiprows=1)
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .set_index("date")
)

# Average the observations within each calendar year, then key the result by
# the integer year so it can be joined against release_year.
cpi_yearly_df = cpi_df.resample("YE").mean()
cpi_yearly_df.index = cpi_yearly_df.index.year.rename("year")
display(cpi_yearly_df)

In [None]:
# Attach each film's release-year CPI; a left join keeps films whose release
# year has no CPI entry (their cpi value becomes NaN).
df = pd.merge(
    df,
    cpi_yearly_df,
    how="left",
    left_on="release_year",
    right_index=True,
)
display(df)

### Create Box Office Gross Feature

In [None]:
# Load the yearly box-office revenue table and key it by year so it can be
# joined against release_year.
box_office_gross_df = pd.read_csv(
    "external/box_office_revenue.csv", low_memory=False
).set_index("year")
display(box_office_gross_df)

In [None]:
# Attach the yearly box-office figures for each film's release year; a left
# join keeps films whose release year is missing from the table.
df = pd.merge(
    df,
    box_office_gross_df,
    how="left",
    left_on="release_year",
    right_index=True,
)
display(df)

### Create Box Office Adjusted Feature

In [None]:
# Reference values every film is scaled up to: the most recent yearly CPI and
# the most recent yearly total box-office gross.
latest_cpi = cpi_yearly_df["cpi"].iloc[-1]

# Take the reference gross from the yearly lookup table, ordered by year.
# Reading df["box_office_gross"].iloc[-1] instead picks the value attached to
# whichever film happens to be the last row of df, so the adjustment would
# depend on row order (and could be NaN).
# NOTE(review): assumes the merged "box_office_gross" column originates from
# box_office_gross_df - confirm against external/box_office_revenue.csv.
latest_gross = (
    box_office_gross_df["box_office_gross"].sort_index().dropna().iloc[-1]
)

# Scale each film's box office by the CPI ratio (inflation) and by the ratio
# of total yearly gross (market size) relative to the reference year.
df["box_office_adjusted"] = (
    df["box_office"]
    * (latest_cpi / df["cpi"])
    * (latest_gross / df["box_office_gross"])
)

display(df)

In [None]:
# Compare the raw and adjusted box-office distributions on a log scale.
# Each figure gets an explicit plt.show() so both render as separate plots
# regardless of the active matplotlib backend.
for column in ("box_office", "box_office_adjusted"):
    sns.boxplot(df, x=column)
    plt.xscale("log")
    plt.show()

## Encode and Impute Rating

In [None]:
# Inspect the distinct rating labels present in the data before encoding them.
print(df["rating"].unique())

# Integer code for each known rating label. Built once at definition time
# instead of on every call.
_RATING_CODES = {
    "pg": 1,
    "tvpg": 2,
    "pg-13": 3,
    "nc-17": 4,
    "tvma": 5,
    "r": 6,
}


def encode_rating(rating):
    """Return the integer code for a rating label.

    Missing ratings (NaN, None, pd.NA) are encoded as 0. Missing values are
    detected with ``pd.isna`` rather than a ``np.nan`` dictionary key: NaN
    never compares equal to itself, so a dict lookup only succeeds when the
    value is the very same ``np.nan`` object.

    Raises:
        KeyError: if ``rating`` is a label that has no code assigned.
    """
    if pd.isna(rating):
        return 0
    return _RATING_CODES[rating]

# Replace every rating label with its integer code.
df["rating"] = df["rating"].apply(encode_rating)

In [None]:
# Re-run the column_stats helper to review the columns after the rating encoding.
display(column_stats(df))

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Target: the adjusted box office. Features: every other column except the raw
# box office, which the target is derived from.
y = df["box_office_adjusted"]
X = df.drop(["box_office", "box_office_adjusted"], axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Hyperparameters for the gradient-boosted regressor: squared-error objective
# evaluated with RMSE.
params = dict(
    objective="reg:squarederror",
    eval_metric="rmse",
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
)

# Build the regressor from the settings above and train it on the training split.
model = xgb.XGBRegressor(**params)
model.fit(X_train, y_train)


In [None]:
# Local import: only this cell needs the regression metrics.
from sklearn.metrics import mean_squared_error, r2_score

# Score the regressor on the held-out split. accuracy_score is a
# classification metric and rejects continuous targets, so report RMSE (in the
# target's units) and R^2 instead.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:,.2f}")
print(f"R^2: {r2:.4f}")


In [None]:
# Absolute z-score of each film's raw box office. nan_policy="omit" ignores
# missing values when computing the mean and standard deviation; with the
# default policy a single NaN turns every z-score into NaN and both result
# frames come out empty.
z_scores = np.abs(stats.zscore(df["box_office"], nan_policy="omit"))

# Define threshold (e.g., 3 standard deviations)
threshold = 3

# Separate outliers and non-outliers. The scores are kept in a local array
# instead of a temporary df["z_score"] column, so df itself is not modified
# and the helper column cannot leak into later feature matrices.
# Rows with a missing box office have a NaN score, are not flagged, and
# therefore stay in df_clean.
is_outlier = z_scores > threshold
df_outliers = df[is_outlier]
df_clean = df[~is_outlier]

In [None]:
# Review the rows flagged as box-office outliers.
display(df_outliers)

In [None]:
# Local import: only this cell needs access to the environment.
import os

# Read the FRED API key from the FRED_API_KEY environment variable rather than
# embedding the secret in the notebook. A missing variable raises KeyError
# here, before any request is made.
fred = Fred(api_key=os.environ["FRED_API_KEY"])

In [None]:
# Download the CPI data (series CPIAUCSL) from FRED
cpi_series = fred.get_series('CPIAUCSL', observation_start='1977-01-01', observation_end='2024-12-31')

# Convert the series to a DataFrame while preserving the date index
cpi_df = pd.DataFrame(cpi_series)
cpi_df.columns = ['CPI']  # Name the column explicitly

# Refresh the on-disk cache; the index (the dates) is written as the first column
cpi_df.to_csv('external/cpi_data.csv')

# Load the saved data back
cpi_df = pd.read_csv("external/cpi_data.csv")

# The unnamed index column comes back as 'Unnamed: 0'; give it a proper name
if cpi_df.columns[0] == 'Unnamed: 0':
    cpi_df.rename(columns={'Unnamed: 0': 'date'}, inplace=True)

# Convert date to datetime and index by it so the series can be resampled
cpi_df['date'] = pd.to_datetime(cpi_df['date'])
cpi_df.set_index('date', inplace=True)

# Aggregate to yearly CPI (mean of each year's monthly values). 'YE' is the
# year-end alias already used for the cached CPI table earlier in this
# notebook; the older 'Y' spelling is deprecated in current pandas.
cpi_yearly = cpi_df.resample('YE').mean()
cpi_yearly.index = cpi_yearly.index.year  # Convert index to integer years

# Make sure release_year is an integer so it matches the yearly index
df['release_year'] = df['release_year'].astype(int)

# Reference CPI: the most recent observation in the downloaded series
current_cpi = cpi_df['CPI'].iloc[-1]

# Adjust box office for inflation. Look up each film's release-year CPI in one
# vectorized step; years missing from the CPI table map to NaN, so those films
# get a NaN adjusted value instead of raising.
release_year_cpi = df['release_year'].map(cpi_yearly['CPI'])
df['box_office_adjusted'] = df['box_office'] * (current_cpi / release_year_cpi)

In [None]:
# Inspect the frame after box_office_adjusted has been recomputed.
display(df)

In [None]:
# Re-run the column_stats helper on the updated frame.
display(column_stats(df))

In [None]:
# Compare the raw and inflation-adjusted box-office distributions on a log
# scale. Each figure gets an explicit plt.show() so both render as separate
# plots regardless of the active matplotlib backend.
for column in ("box_office", "box_office_adjusted"):
    sns.boxplot(df, x=column)
    plt.xscale("log")
    plt.show()