In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns


In [11]:
# Load the raw movie table. Only a bare '\n' is treated as the end of a record
# (presumably because some text fields contain other line-break characters —
# TODO confirm against the file).
df = pd.read_csv(
    'mymoviedb.csv',
    lineterminator='\n',
)


In [12]:
# Quick look at the first five rows together with the column labels; the
# resulting tuple is echoed as the cell's output.
df.head(), df.columns

(  Release_Date                    Title  \
 0   2021-12-15  Spider-Man: No Way Home   
 1   2022-03-01               The Batman   
 2   2022-02-25                  No Exit   
 3   2021-11-24                  Encanto   
 4   2021-12-22           The King's Man   
 
                                             Overview  Popularity  Vote_Count  \
 0  Peter Parker is unmasked and no longer able to...    5083.954        8940   
 1  In his second year of fighting crime, Batman u...    3827.658        1151   
 2  Stranded at a rest stop in the mountains durin...    2618.087         122   
 3  The tale of an extraordinary family, the Madri...    2402.201        5076   
 4  As a collection of history's worst tyrants and...    1895.511        1793   
 
    Vote_Average Original_Language                               Genre  \
 0           8.3                en  Action, Adventure, Science Fiction   
 1           8.1                en            Crime, Mystery, Thriller   
 2           6.3        

In [13]:
# Per-column dtype and non-null count, plus the frame's memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9827 entries, 0 to 9826
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Release_Date       9827 non-null   object 
 1   Title              9827 non-null   object 
 2   Overview           9827 non-null   object 
 3   Popularity         9827 non-null   float64
 4   Vote_Count         9827 non-null   int64  
 5   Vote_Average       9827 non-null   float64
 6   Original_Language  9827 non-null   object 
 7   Genre              9827 non-null   object 
 8   Poster_Url         9827 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 691.1+ KB


In [14]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [15]:
# Missing-value count for each column.
df.isnull().sum()

Release_Date         0
Title                0
Overview             0
Popularity           0
Vote_Count           0
Vote_Average         0
Original_Language    0
Genre                0
Poster_Url           0
dtype: int64

In [16]:
# Columns that are not used as model inputs. Title is deliberately kept so
# that predictions can later be shown next to the movie they belong to.
unused_columns = ['Release_Date', 'Overview', 'Poster_Url']
df = df.drop(columns=unused_columns)

In [17]:
# Show which columns remain in the frame.
print(df.columns)

Index(['Title', 'Popularity', 'Vote_Count', 'Vote_Average',
       'Original_Language', 'Genre'],
      dtype='object')


In [18]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [19]:
# Hit/Flop cut-off: the median Popularity of the whole table.
threshold = df['Popularity'].median()

# Label each movie: 1 (Hit) when Popularity is strictly above the median,
# otherwise 0 (Flop). A vectorised comparison replaces the per-row Python
# lambda; the explicit int64 cast keeps the column dtype the same as before.
df['Hit_Flop'] = (df['Popularity'] > threshold).astype('int64')


In [20]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column; both are kept as named globals so the
# integer codes can be translated back to the original labels later on.
le_genre = LabelEncoder()
le_lang = LabelEncoder()

# Replace each text column in place with its integer codes.
for column_name, encoder in (('Genre', le_genre), ('Original_Language', le_lang)):
    df[column_name] = encoder.fit_transform(df[column_name])


In [21]:
# NOTE: this cell used to repeat the label-encoding cell above verbatim.
# Re-fitting on the already-encoded integer columns left the data unchanged
# but overwrote the encoders' classes_ (the original genre / language strings)
# with plain integers, so le_genre / le_lang could no longer map codes back
# to names. The encoders fitted in the previous cell are therefore kept as-is.

# Sanity check: both categorical columns are integer-coded at this point.
assert pd.api.types.is_integer_dtype(df['Genre'])
assert pd.api.types.is_integer_dtype(df['Original_Language'])


In [22]:
# Save Title separately for later mapping of predictions back to movies
titles = df['Title']

# For Regression (Popularity prediction): every remaining column except the
# target itself, the label derived from it, and the free-text Title.
X_reg = df.drop(['Popularity', 'Hit_Flop', 'Title'], axis=1)
y_reg = df['Popularity']

# For Classification (Hit/Flop prediction).
# Popularity must NOT be a feature here: Hit_Flop is computed directly from
# it (Popularity > median), so keeping it would leak the target and make the
# classifier's scores meaningless.
X_clf = df.drop(['Popularity', 'Hit_Flop', 'Title'], axis=1)
y_clf = df['Hit_Flop']


In [23]:
from sklearn.model_selection import train_test_split

# Both tasks use the same 80/20 split settings and the same seed. Since the
# inputs have the same number of rows, the two calls are expected to select
# the same rows, which the combined results table further down relies on.
split_kwargs = dict(test_size=0.2, random_state=42)

(X_train_reg, X_test_reg,
 y_train_reg, y_test_reg,
 titles_train_reg, titles_test_reg) = train_test_split(X_reg, y_reg, titles, **split_kwargs)

(X_train_clf, X_test_clf,
 y_train_clf, y_test_clf,
 titles_train_clf, titles_test_clf) = train_test_split(X_clf, y_clf, titles, **split_kwargs)


In [24]:
from sklearn.preprocessing import StandardScaler

# Use one scaler per feature matrix. Re-fitting a single shared scaler for the
# classifier would throw away the statistics learned for the regression
# features, so new regression inputs could no longer be transformed.
# Each scaler is fitted on the training portion only and then applied to the
# matching test portion.
scaler_reg = StandardScaler()
X_train_reg = scaler_reg.fit_transform(X_train_reg)
X_test_reg = scaler_reg.transform(X_test_reg)

scaler_clf = StandardScaler()
X_train_clf = scaler_clf.fit_transform(X_train_clf)
X_test_clf = scaler_clf.transform(X_test_clf)

# Backward-compatible alias: `scaler` previously ended up fitted on the
# classification training features.
scaler = scaler_clf


In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

# Regression Model: ordinary least squares on the scaled regression features
reg_model = LinearRegression()
reg_model.fit(X_train_reg, y_train_reg)

# Classification Model: random forest on the scaled classification features.
# The forest is seeded (same seed as the train/test split) so that repeated
# runs of the notebook produce the same model and the same predictions.
clf_model = RandomForestClassifier(random_state=42)
clf_model.fit(X_train_clf, y_train_clf)


In [26]:
# Regression: Popularity prediction for the held-out movies
# (the target is the Popularity column, not box-office revenue)
y_pred_reg = reg_model.predict(X_test_reg)

# Classification: Hit or Flop
y_pred_clf = clf_model.predict(X_test_clf)

# The table below places titles and targets from the regression split next to
# labels from the classification split. That is only valid when both splits
# selected the same rows in the same order, so fail loudly if they did not.
if not (titles_test_reg.index.equals(y_test_reg.index)
        and titles_test_reg.index.equals(y_test_clf.index)):
    raise ValueError(
        "Regression and classification test splits are not row-aligned; "
        "cannot combine them into one results table."
    )

# Show some predictions with Titles
results = pd.DataFrame({
    'Title': titles_test_reg.values,
    'Actual Popularity': y_test_reg.values,
    'Predicted Popularity': y_pred_reg,
    'Actual Hit/Flop': y_test_clf.values,
    'Predicted Hit/Flop': y_pred_clf
})

# Hit = 1, Flop = 0
results['Actual Hit/Flop'] = results['Actual Hit/Flop'].map({1: 'Hit', 0: 'Flop'})
results['Predicted Hit/Flop'] = results['Predicted Hit/Flop'].map({1: 'Hit', 0: 'Flop'})

print(results.head(10))  # Show first 10 results


                         Title  Actual Popularity  Predicted Popularity  \
0        A Cock and Bull Story             14.960             33.602926   
1     Piranha II: The Spawning             17.674             28.412875   
2               The Assignment             15.218             30.071242   
3                      Bandits             15.779             36.525693   
4    Puff: Wonders of the Reef             51.844             34.830674   
5                   Sinister 2             29.155             38.087187   
6               American Ninja             21.222             36.047453   
7                   Malnazidos             28.257             22.514745   
8  Undercover Wedding Crashers             59.763             32.360949   
9                    The Thing             35.039             57.618320   

  Actual Hit/Flop Predicted Hit/Flop  
0            Flop               Flop  
1            Flop               Flop  
2            Flop               Flop  
3            Flop 