### Intial Testing of the Dataset

In [17]:
import numpy as np
import pandas as pd 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
import spacy
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer

In [18]:
df = pd.read_csv('data/imdb_top_1000.csv')
df.at[966,"Released_Year"] = 1995 #fixing the wrong value for apollo 13 
df.drop(columns = ["Poster_Link"], inplace= True)
df.dropna(inplace = True) #dropping null valued columns

In [19]:
#Converting to Numeric Values
df["Gross"] = df["Gross"].str.replace(",", "").astype("float")
df["Released_Year"] = df["Released_Year"].astype("int")
df["Runtime"] = df["Runtime"].str.replace(" min", "").astype("int")
df["IMDB_Rating"] = df["IMDB_Rating"].astype(float)
df["Meta_score"] = df["Meta_score"].astype(float)
df["No_of_Votes"] = df["No_of_Votes"].astype(float)
df.head()

Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,The Shawshank Redemption,1994,A,142,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110.0,28341469.0
1,The Godfather,1972,A,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367.0,134966411.0
2,The Dark Knight,2008,UA,152,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232.0,534858444.0
3,The Godfather: Part II,1974,A,202,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952.0,57300000.0
4,12 Angry Men,1957,U,96,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845.0,4360000.0


In [20]:
#defining functino to encode the target variable
def smooth_target_encode(df, col, target, smoothing_param =.3):
    mean_target = df[target].mean()
    encoded =df.groupby(col)[target].agg(["count", "mean"])
    counts = encoded["count"]
    means = encoded["mean"]
    smooth_encodings = (means*counts + mean_target*smoothing_param)/(counts+smoothing_param)
    return df[col].map(smooth_encodings)

In [21]:
df["IMDB_Rating"] = smooth_target_encode(df, "IMDB_Rating", "Gross")
df["Meta_score"] = smooth_target_encode(df, "Meta_score", "Gross")
df["No_of_Votes"] = smooth_target_encode(df, "No_of_Votes", "Gross")
df["Certificate"] = smooth_target_encode(df, "Certificate", "Gross")
df["Genre"] = smooth_target_encode(df, "Genre", "Gross")
df["Director"] = smooth_target_encode(df, "Director", "Gross")
df["Star1"] = smooth_target_encode(df, "Star1", "Gross")
df["Star2"] = smooth_target_encode(df, "Star2", "Gross")
df["Star3"] = smooth_target_encode(df, "Star3", "Gross")
df["Star4"] = smooth_target_encode(df, "Star4", "Gross")
df["Series_Title"] = smooth_target_encode(df, "Series_Title", "Gross")

### Trying Target Encoding

In [22]:
df.head()

Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,39919650.0,1994,66057570.0,142,34786980.0,39919650.0,Two imprisoned men bond over a number of years...,104939900.0,82042140.0,39919650.0,39919650.0,39919650.0,39919650.0,39919650.0,28341469.0
1,121938800.0,1972,66057570.0,175,33280970.0,121938800.0,An organized crime dynasty's aging patriarch t...,25780540.0,69882650.0,73095860.0,58935060.0,121938800.0,93834990.0,121938800.0,134966411.0
2,429548100.0,2008,149944700.0,152,72757340.0,187900800.0,When the menace known as the Joker wreaks havo...,87411430.0,236266000.0,165680200.0,278894000.0,429548100.0,180306200.0,429548100.0,534858444.0
3,62195440.0,1974,66057570.0,202,33280970.0,187900800.0,The early life and career of Vito Corleone in ...,76540530.0,69882650.0,47945130.0,146588500.0,71445910.0,93834990.0,62195440.0,57300000.0
4,21472370.0,1957,94598470.0,96,33280970.0,187900800.0,A jury holdout attempts to prevent a miscarria...,62665480.0,37607920.0,10088060.0,21472370.0,21472370.0,21472370.0,21472370.0,4360000.0


### Using Bert embeddings for overview column, Unesecarry for RF or Xgbost

In [8]:

# Load a lightweight BERT model (fast & efficient)
model = SentenceTransformer('all-MiniLM-L6-v2')  # Outputs 384D embeddings

# Convert each overview into a 384-dimensional vector

overview_embeddings= df['Overview'].apply(lambda x: model.encode(x))
# Define features and target
Y = df["Gross"].values
df = df.drop(columns=["Gross", "Overview"])
X = np.hstack((df.values, overview_embeddings.tolist()))



### Trying without any overview Embeddings

In [24]:
Y = df["Gross"].values
df.drop(columns= ["Gross", "Overview"], inplace = True)
X = np.array(df.values)

In [25]:
# Split the data into training and testing sets before scaling
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)


In [None]:
#Scaling the Features
sc = StandardScaler()
X_train = sc.fit_transform(X_train)

In [27]:
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
Y_train_tensor = torch.tensor(Y_train, dtype=torch.float32).view(-1, 1)
Y_test_tensor = torch.tensor(Y_test, dtype=torch.float32).view(-1, 1)

print(X_train_tensor.shape, X_test_tensor.shape, Y_train_tensor.shape, Y_test_tensor.shape)

torch.Size([571, 13]) torch.Size([143, 13]) torch.Size([571, 1]) torch.Size([143, 1])


In [28]:
# Define the neural network architecture
class RegressionModel(nn.Module):
    def __init__(self, input_dim=397, hidden1=128, hidden2=64, dropout_prob=0.2):
        super(RegressionModel, self).__init__()
        
        # Layer 1: Fully connected
        self.fc1 = nn.Linear(input_dim, hidden1)
        # Batch Norm after first layer
        self.bn1 = nn.BatchNorm1d(hidden1)
        # Dropout
        self.dropout1 = nn.Dropout(p=dropout_prob)
    
        # Layer 2: Fully connected
        self.fc2 = nn.Linear(hidden1, hidden2)
        # Batch Norm after second layer
        self.bn2 = nn.BatchNorm1d(hidden2)
        # Dropout
        self.dropout2 = nn.Dropout(p=dropout_prob)
        
        # Output Layer
        self.fc3 = nn.Linear(hidden2, 1)

    def forward(self, x):
        # First layer + activation + batchnorm + dropout
        x = self.fc1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.dropout1(x)
        
        # Second layer + activation + batchnorm + dropout
        x = self.fc2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = self.dropout2(x)
        
        # Output layer (no activation for regression)
        x = self.fc3(x)
        
        return x

# Initialize the model, loss function, and optimizer
input_dim = X_train.shape[1]
model = RegressionModel(input_dim=input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, Y_train_tensor)
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model
model.eval()
with torch.no_grad():
    Y_pred_tensor = model(X_test_tensor)
    Y_pred = Y_pred_tensor.numpy().flatten()
    Y_test = Y_test_tensor.numpy().flatten()

    mse = mean_squared_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")

    # Display the first few predictions and actual values
    print("Predictions vs Actual values:")
    for pred, actual in zip(Y_pred[:5], Y_test[:5]):
        print(f"Predicted: {pred}, Actual: {actual}")

Epoch [10/100], Loss: 20190123714936832.0000
Epoch [20/100], Loss: 20190123714936832.0000
Epoch [30/100], Loss: 20190123714936832.0000
Epoch [40/100], Loss: 20190123714936832.0000
Epoch [50/100], Loss: 20190123714936832.0000
Epoch [60/100], Loss: 20190123714936832.0000
Epoch [70/100], Loss: 20190123714936832.0000
Epoch [80/100], Loss: 20190123714936832.0000
Epoch [90/100], Loss: 20190123714936832.0000
Epoch [100/100], Loss: 20190123714936832.0000
Mean Squared Error: 4.634785452056904e+17
R-squared: -47.859657287597656
Predictions vs Actual values:
Predicted: 241045728.0, Actual: 12339633.0
Predicted: 343603264.0, Actual: 38405088.0
Predicted: 207264704.0, Actual: 11487676.0
Predicted: 340724096.0, Actual: 35000000.0
Predicted: 344321280.0, Actual: 35811508.0
