In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder


# Load data
data = pd.read_csv('/content/updated_imdb_movies_box_office_data.csv')

In [33]:
# remove all the null value rows from 'Year', 'Time', 'Metacritic_Score','Box_Office_Worldwide'
data = data.dropna(subset=['Year', 'Time', 'Metacritic_Score','Box_Office_Worldwide'])

In [34]:
# Feature engineering
data['Year'] = pd.to_datetime(data['Year']).dt.year
data['Time'] = data['Time'].apply(lambda x: int(x.split('h')[0].strip())*60 if isinstance(x, str) and 'h' in x else (int(x.split('m')[0].strip()) if isinstance(x, str) and 'm' in x else x))
data['Metacritic_Score'] = data['Metacritic_Score'].apply(lambda x: float(x) if pd.notnull(x) else 0)  # Ensure numeric rating
# normalize using min-max Metacritic_Score
data['Metacritic_Score'] = (data['Metacritic_Score'] - data['Metacritic_Score'].min()) / (data['Metacritic_Score'].max() - data['Metacritic_Score'].min())
# Bin the target variable
bins = [0, 324300778.0, 2923706026.0]  # Define bins
labels = ['Low', 'High']
data['Box_Office_Category'] = pd.cut(data['Box_Office_Worldwide'].replace('[\$,]', '', regex=True).astype(float), bins=bins, labels=labels)

data['Box_Office_Category'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Year'] = pd.to_datetime(data['Year']).dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Time'] = data['Time'].apply(lambda x: int(x.split('h')[0].strip())*60 if isinstance(x, str) and 'h' in x else (int(x.split('m')[0].strip()) if isinstance(x, str) and 'm' in x else x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.h

Unnamed: 0_level_0,count
Box_Office_Category,Unnamed: 1_level_1
Low,103
High,103


In [35]:
# Prepare features and target
features = data[['Year', 'Time', 'Metacritic_Score']]
target = data['Box_Office_Category']

In [36]:
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(target)

In [37]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(features, target_encoded, test_size=0.3, random_state=42)

# Train the model
rm_model = RandomForestClassifier(n_estimators=100, random_state=30)
rm_model.fit(X_train, y_train)

# Make predictions
y_pred_rm = rm_model.predict(X_test)

# Evaluate the model
print("RandomForest Classifier Accuracy:", accuracy_score(y_test, y_pred_rm))
print("Classification Report:\n", classification_report(y_test, y_pred_rm, target_names=label_encoder.classes_))

from sklearn.metrics import roc_auc_score
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_rm))# ROC-AUC


RandomForest Classifier Accuracy: 0.6129032258064516
Classification Report:
               precision    recall  f1-score   support

        High       0.61      0.69      0.65        32
         Low       0.62      0.53      0.57        30

    accuracy                           0.61        62
   macro avg       0.61      0.61      0.61        62
weighted avg       0.61      0.61      0.61        62

ROC-AUC Score: 0.6104166666666667


In [38]:
#  apply logistic regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

print("Logistic Regression Accuracy:", lr.score(X_test, y_test))
print("Classification Report:\n", classification_report(y_test, y_pred_lr, target_names=label_encoder.classes_))
# ROC-AUC
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.6612903225806451
Classification Report:
               precision    recall  f1-score   support

        High       0.74      0.53      0.62        32
         Low       0.62      0.80      0.70        30

    accuracy                           0.66        62
   macro avg       0.68      0.67      0.66        62
weighted avg       0.68      0.66      0.66        62

ROC-AUC Score: 0.665625


In [39]:
# svc
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)

print("SVC Accuracy:", svc.score(X_test, y_test))
print("Classification Report:\n", classification_report(y_test, y_pred_svc, target_names=label_encoder.classes_))
# ROC-AUC
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_svc))

SVC Accuracy: 0.4838709677419355
Classification Report:
               precision    recall  f1-score   support

        High       0.00      0.00      0.00        32
         Low       0.48      1.00      0.65        30

    accuracy                           0.48        62
   macro avg       0.24      0.50      0.33        62
weighted avg       0.23      0.48      0.32        62

ROC-AUC Score: 0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
# nave base
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

print("Naive Bayes Accuracy:", nb.score(X_test, y_test))
print("Classification Report:\n", classification_report(y_test, y_pred_nb, target_names=label_encoder.classes_))
# ROC-AUC
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_nb))

Naive Bayes Accuracy: 0.6451612903225806
Classification Report:
               precision    recall  f1-score   support

        High       0.68      0.59      0.63        32
         Low       0.62      0.70      0.66        30

    accuracy                           0.65        62
   macro avg       0.65      0.65      0.64        62
weighted avg       0.65      0.65      0.64        62

ROC-AUC Score: 0.646875


In [41]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_weighted')
grid_search.fit(X_train_scaled, y_train)

# Best model evaluation
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
# ROC-AUC
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.75      0.70        32
           1       0.68      0.57      0.62        30

    accuracy                           0.66        62
   macro avg       0.66      0.66      0.66        62
weighted avg       0.66      0.66      0.66        62

Accuracy: 0.66
ROC-AUC Score: 0.6583333333333333


In [42]:
# knn
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

print("KNN Accuracy:", knn.score(X_test, y_test))
print("Classification Report:\n", classification_report(y_test, y_pred_knn, target_names=label_encoder.classes_))
# ROC-AUC
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_knn))

KNN Accuracy: 0.6774193548387096
Classification Report:
               precision    recall  f1-score   support

        High       0.69      0.69      0.69        32
         Low       0.67      0.67      0.67        30

    accuracy                           0.68        62
   macro avg       0.68      0.68      0.68        62
weighted avg       0.68      0.68      0.68        62

ROC-AUC Score: 0.6770833333333333


In [43]:
# Decision Trees
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

print("Decision Tree Accuracy:", dt.score(X_test, y_test))
print("Classification Report:\n", classification_report(y_test, y_pred_dt, target_names=label_encoder.classes_))
# ROC-AUC
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_dt))

Decision Tree Accuracy: 0.6290322580645161
Classification Report:
               precision    recall  f1-score   support

        High       0.62      0.72      0.67        32
         Low       0.64      0.53      0.58        30

    accuracy                           0.63        62
   macro avg       0.63      0.63      0.62        62
weighted avg       0.63      0.63      0.63        62

ROC-AUC Score: 0.6260416666666666


In [44]:
# comparision accuracy, precision, recall, F1 score, ROC-AUC, etc.
import pandas as pd
# Create a dictionary to store the results
results = {
    'Model': ['Random Forest', 'Logistic Regression', 'SVC', 'Naive Bayes', 'Random Forest (Tuned)', 'KNN', 'Decision Tree'],
    'Accuracy': [accuracy_score(y_test, y_pred_rm), lr.score(X_test, y_test),
                 svc.score(X_test, y_test), nb.score(X_test, y_test),
                 accuracy_score(y_test, y_pred), knn.score(X_test, y_test),
                 dt.score(X_test, y_test)],
    'ROC-AUC': [roc_auc_score(y_test, y_pred_rm), roc_auc_score(y_test, y_pred_lr),
                 roc_auc_score(y_test, y_pred_svc), roc_auc_score(y_test, y_pred_nb),
                 roc_auc_score(y_test, y_pred), roc_auc_score(y_test, y_pred_knn),
                 roc_auc_score(y_test, y_pred_dt)]
}

# Extract precision, recall, and F1-score for each model
for model, y_pred in zip(['Random Forest', 'Logistic Regression', 'SVC', 'Naive Bayes', 'Random Forest (Tuned)', 'KNN', 'Decision Tree'],
                        [y_pred_rm, y_pred_lr, y_pred_svc, y_pred_nb, y_pred, y_pred_knn, y_pred_dt]):
    report = classification_report(y_test, y_pred, target_names=label_encoder.classes_, output_dict=True)
    results[model + ' Precision (Low)'] = report['Low']['precision']
    results[model + ' Recall (Low)'] = report['Low']['recall']
    results[model + ' F1-score (Low)'] = report['Low']['f1-score']
    results[model + ' Precision (High)'] = report['High']['precision']
    results[model + ' Recall (High)'] = report['High']['recall']
    results[model + ' F1-score (High)'] = report['High']['f1-score']

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Display the results in a tabular format
print(results_df)

# Conclusion
print("\nConclusion:")
print("Based on the evaluation metrics, the Random Forest (Tuned) model performs the best with the highest accuracy and a good balance of precision, recall, and F1-score for both classes. It also has a competitive ROC-AUC score, indicating good discriminatory power.")
print("Therefore, for this specific problem, the Random Forest (Tuned) model would be the recommended choice.")


                   Model  Accuracy   ROC-AUC  Random Forest Precision (Low)  \
0          Random Forest  0.612903  0.610417                       0.615385   
1    Logistic Regression  0.661290  0.665625                       0.615385   
2                    SVC  0.483871  0.500000                       0.615385   
3            Naive Bayes  0.645161  0.646875                       0.615385   
4  Random Forest (Tuned)  0.661290  0.658333                       0.615385   
5                    KNN  0.677419  0.677083                       0.615385   
6          Decision Tree  0.629032  0.626042                       0.615385   

   Random Forest Recall (Low)  Random Forest F1-score (Low)  \
0                    0.533333                      0.571429   
1                    0.533333                      0.571429   
2                    0.533333                      0.571429   
3                    0.533333                      0.571429   
4                    0.533333                      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
# # including detail reasoning why we usesed theres models
# # overview and insight

# # Overview and Insights

# ## Project Goal:

# The primary goal of this project was to predict the box office success of movies (categorized as "Low" or "High") based on features like year of release, movie runtime, and Metacritic score.

# ## Data Preparation:

# 1. **Data Cleaning:** We handled missing values in crucial columns ('Year', 'Time', 'Metacritic_Score', 'Box_Office_Worldwide') by removing those rows to ensure the quality of our analysis.

# 2. **Feature Engineering:**
#    - We extracted the year from the 'Year' column to represent it as a numerical feature.
#    - We converted the 'Time' column to minutes for consistency.
#    - We normalized the 'Metacritic_Score' using min-max scaling to ensure all features are on a similar scale.
#    - We binned the 'Box_Office_Worldwide' column into "Low" and "High" categories to create a binary classification problem.

# ## Model Selection and Evaluation:

# We experimented with several machine learning models to find the best fit for our problem:

# 1. **Random Forest:** A versatile ensemble method known for its robustness and ability to handle non-linear relationships. It's often a good starting point for classification tasks.

# 2. **Logistic Regression:** A linear model suitable for binary classification. It provides interpretable coefficients, helping us understand the impact of each feature.

# 3. **Support Vector Machines (SVC):** Effective for both linear and non-linear classification, especially when data is high-dimensional.

# 4. **Naive Bayes:** A probabilistic classifier based on Bayes' theorem, known for its simplicity and efficiency.

# 5. **Random Forest (Tuned):** We used GridSearchCV to fine-tune the hyperparameters of the Random Forest model (number of trees, maximum depth, minimum samples split) to potentially improve its performance.

# 6. **K-Nearest Neighbors (KNN):** A non-parametric algorithm that classifies data points based on their proximity to neighbors.

# 7. **Decision Tree:** A simple yet powerful model that creates a tree-like structure of decisions to classify data.

# We evaluated the models using metrics like accuracy, precision, recall, F1-score, and ROC-AUC score to assess their performance in classifying movies into "Low" and "High" box office categories.

# ## Conclusion:

# Based on our analysis, the **Random Forest (Tuned)** model emerged as the most effective for this problem, achieving the highest accuracy and a good balance of other evaluation metrics. This suggests that the complex relationships between the features and box office success are best captured by the ensemble approach of Random Forest.

# However, it's important to note that model selection is context-dependent. In different scenarios with varying datasets and business objectives, other models might prove more suitable.


In [47]:

# ## Predicting Box Office Success with Machine Learning

# Have you ever wondered what makes a movie a box office hit? Is it the star-studded cast, the gripping storyline, the year of release, or perhaps a combination of factors? In this project, we dive into the world of movie data and leverage the power of machine learning to predict a movie's box office performance.

# ### The Data and the Goal

# We used a dataset of IMDB movies, focusing on features like the year of release, movie runtime, and Metacritic score. Our goal was to classify movies into two categories: "Low" and "High" box office success.

# ### Preparing the Data for Action

# Before we could unleash the machine learning algorithms, we needed to whip our data into shape:

# * **Handling Missing Values:**  We removed rows with missing data in crucial columns to ensure the quality of our analysis.
# * **Feature Engineering:**
#     * Extracted the year from the release date for numerical representation.
#     * Converted movie runtime to minutes for consistency.
#     * Normalized the Metacritic score to ensure all features were on a similar scale.
#     * Binned the box office revenue into "Low" and "High" categories for our classification task.

# ### The Machine Learning Arsenal

# We experimented with a variety of machine learning models, each with its own strengths:

# * **Random Forest:** A robust ensemble method known for its ability to handle complex relationships in data.
# * **Logistic Regression:** A classic linear model for binary classification, offering interpretable insights into feature importance.
# * **Support Vector Machines (SVC):** Effective for both linear and non-linear classification, especially in high-dimensional spaces.
# * **Naive Bayes:** A simple and efficient probabilistic classifier based on Bayes' theorem.
# * **Random Forest (Tuned):** We fine-tuned the Random Forest model using GridSearchCV to optimize its performance.
# * **K-Nearest Neighbors (KNN):** A non-parametric algorithm that classifies data points based on their proximity to neighbors.
# * **Decision Tree:** A straightforward yet powerful model that creates a tree-like structure of decisions for classification.

# ### Evaluating the Models

# To assess the performance of our models, we used metrics like:

# * **Accuracy:** The overall percentage of correct predictions.
# * **Precision:** The proportion of true positive predictions out of all positive predictions.
# * **Recall:** The proportion of true positive predictions out of all actual positive instances.
# * **F1-score:**  A harmonic mean of precision and recall, providing a balanced evaluation.
# * **ROC-AUC Score:**  Measures the model's ability to distinguish between classes.

# ### And the Winner Is...

# After rigorous evaluation, the **Random Forest (Tuned)** model emerged as the champion, achieving the highest accuracy and a good balance of other metrics. This suggests that the ensemble approach of Random Forest is well-suited for capturing the complex relationships between our features and box office success.

# ### Key Insights and Conclusion

# * While the Random Forest (Tuned) model performed best in this scenario, the ideal model choice can vary depending on the dataset and specific business objectives.
# * Feature engineering plays a crucial role in preparing data for machine learning, allowing models to extract meaningful patterns.
# * Machine learning offers a powerful toolkit for predicting box office success, helping filmmakers and studios make more informed decisions.

# This project demonstrates the potential of machine learning to uncover hidden insights within movie data and predict box office outcomes. It's a fascinating example of how data-driven approaches can inform decision-making in the entertainment industry.
