In [30]:
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [31]:
# open cleaned dataset
df = pd.read_csv('dataset_1_cleaned.csv')
df.head()

Unnamed: 0,budget,revenue,runtime,vote_average,vote_count,popularity,release_year,release_month,main_genre_Action,main_genre_Adventure,...,original_language_cn,original_language_en,original_language_es,original_language_fr,original_language_hi,original_language_it,original_language_ja,original_language_ko,original_language_ru,original_language_zh
0,150000000,10550000,101,5.869,126,681.3142,2025,3,False,False,...,False,True,False,False,False,False,False,False,False,False
1,40000000,33392248,116,6.9,109,352.8585,2025,3,True,False,...,False,True,False,False,False,False,False,False,False,False
2,180000000,409278201,119,6.118,1204,308.3292,2025,2,True,False,...,False,True,False,False,False,False,False,False,False,False
3,250000,315648,89,6.1,30,223.5105,2025,1,False,False,...,False,True,False,False,False,False,False,False,False,False
4,200000000,721046090,118,7.446,1851,176.5402,2024,12,False,True,...,False,True,False,False,False,False,False,False,False,False


In [32]:
# define features and target
X = df.drop('revenue', axis=1)
y = df['revenue']

# split into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [33]:
# train and evaluate random forest
rf_model = RandomForestRegressor(random_state=0)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

print("Random Forest")
print("R2 Score:", r2_score(y_test, rf_preds))
print("RMSE:", mean_squared_error(y_test, rf_preds, squared=False))


Random Forest
R2 Score: 0.7810151930229486
RMSE: 103171499.96410175


In [34]:
# train and evaluate xgboost
xgb_model = XGBRegressor(random_state=0)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)

print("XGBoost")
print("R2 Score:", r2_score(y_test, xgb_preds))
print("RMSE:", mean_squared_error(y_test, xgb_preds, squared=False))


XGBoost
R2 Score: 0.7593643792718523
RMSE: 108151540.50079347


In [35]:
# train and evaluate linear regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)

print("Linear Regression")
print("R2 Score:", r2_score(y_test, lr_preds))
print("RMSE:", mean_squared_error(y_test, lr_preds, squared=False))


Linear Regression
R2 Score: 0.7109023618236765
RMSE: 118542757.11586379


In [36]:
# compare model performances
results = pd.DataFrame({
    'Model': ['Random Forest', 'XGBoost', 'Linear Regression'],
    'R2 Score': [r2_score(y_test, rf_preds), r2_score(y_test, xgb_preds), r2_score(y_test, lr_preds)],
    'RMSE': [mean_squared_error(y_test, rf_preds, squared=False),
             mean_squared_error(y_test, xgb_preds, squared=False),
             mean_squared_error(y_test, lr_preds, squared=False)]
})

print(results)


               Model  R2 Score          RMSE
0      Random Forest  0.781015  1.031715e+08
1            XGBoost  0.759364  1.081515e+08
2  Linear Regression  0.710902  1.185428e+08


In [37]:
# save the trained model
joblib.dump(rf_model, '1.random_forest_model.pkl')
joblib.dump(xgb_model, '2.xgboost_model.pkl')
joblib.dump(lr_model, '3.linear_regression_model.pkl')


['3.linear_regression_model.pkl']