In [5]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
# from xgboost import XGBClassifier

# Load Data and define the criteria for a movie being a hit:
# a hit is a title whose gross is at least twice its budget.
df = (
    pd.read_csv("./Filtered IMDb Movies TV Shows.csv")
    # A missing gross or budget makes the comparison below evaluate to False,
    # which would silently label the row as a non-hit. Such rows cannot be
    # labelled at all, so remove them before the label is built.
    .dropna(subset=['gross', 'budget'])
    .assign(hit=lambda movies: (movies['gross'] >= 2 * movies['budget']).astype(int))
)

# convert release date to datetime
df['release_date'] = pd.to_datetime(df['release_date'])

# Filter out movies with no date first, so the season is only derived from real dates
df = df.dropna(subset=['release_date']).copy()

# Add a column for which season movies were released (Spring = 1, Summer = 2, Fall = 3, Winter = 4)
# Shifting the month so that March becomes 0 turns every season into a run of
# three consecutive values (Mar-May -> 0..2, ..., Dec-Feb -> 9..11), so integer
# division by 3 gives the season index directly. The result is an integer
# column; the previous nested np.where used '' as its fallback, which promoted
# every season code to a string.
df['season'] = ((df['release_date'].dt.month - 3) % 12) // 3 + 1

# Columns fed to the models; the target is the binary `hit` flag
features = ['budget', 'season', 'runtimeMinutes', 'averageRating', 'numVotes']
X, y = df[features], df['hit']

# Hold out 20% of the rows for testing (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise both sets with statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` on one split and score it on another.

    Parameters
    ----------
    model
        Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    X_fit, y_fit
        Features and labels the model is fitted on.
    X_eval, y_eval
        Features and labels the fitted model is scored against.

    Returns
    -------
    tuple
        ``(y_pred, y_prob, accuracy, f1, roc_auc)``. ``y_prob`` is the second
        column of ``predict_proba``; ROC AUC is computed from it, while
        accuracy and F1 are computed from the hard predictions.
    """
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)
    y_prob = model.predict_proba(X_eval)[:, 1]
    return (
        y_pred,
        y_prob,
        accuracy_score(y_eval, y_pred),
        f1_score(y_eval, y_pred),
        roc_auc_score(y_eval, y_prob),
    )


# Train logistic regression model and calculate its accuracy, f1 score and ROC AUC
lr = LogisticRegression(max_iter = 1000, random_state = 42)
lr_y_pred, lr_y_prob, lr_accuracy, lr_f1, roc_auc_lr = _fit_and_score(
    lr, X_train, y_train, X_test, y_test
)

# Train random forest classifier and calculate the same metrics
rf = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state = 42)
rf_y_pred, rf_y_prob, rf_accuracy, rf_f1, roc_auc_rf = _fit_and_score(
    rf, X_train, y_train, X_test, y_test
)

# # Train XGBoost model
# xgb = XGBClassifier(n_estimators=100, max_depth=5, use_label_encoder=False, eval_metric='logloss', random_state=42)
# xgb.fit(X_train, y_train)

# xgb_y_pred = xgb.predict(X_test)
# xgb_y_prob = xgb.predict_proba(X_test)[:, 1]

# xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
# xgb_f1 = f1_score(y_test, xgb_y_pred)
# roc_auc_xgb = roc_auc_score(y_test, xgb_y_prob)

# Report the three metrics per model, one value per line, four decimals each
print(
    "\nLogistic Regression Results:",
    f"Accuracy: {lr_accuracy:.4f}",
    f"F1 Score: {lr_f1:.4f}",
    f"ROC AUC: {roc_auc_lr:.4f}",
    sep="\n",
)

print(
    "\nRandom Forest Results:",
    f"Accuracy: {rf_accuracy:.4f}",
    f"F1 Score: {rf_f1:.4f}",
    f"ROC AUC: {roc_auc_rf:.4f}",
    sep="\n",
)

# print("\nXGBoost Results:")
# print(f"Accuracy: {xgb_accuracy:.4f}")
# print(f"F1 Score: {xgb_f1:.4f}")
# print(f"ROC AUC: {roc_auc_xgb:.4f}")




Logistic Regression Results:
Accuracy: 0.7070
F1 Score: 0.7901
ROC AUC: 0.7940

Random Forest Results:
Accuracy: 0.6861
F1 Score: 0.7732
ROC AUC: 0.7720
