### Selecting columns, viualizing

In [1]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import itertools

In [2]:
data = pd.read_csv("data_clean.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110155 entries, 0 to 110154
Data columns (total 20 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   tconst                       110155 non-null  object 
 1   titleType                    110155 non-null  object 
 2   primaryTitle                 110155 non-null  object 
 3   originalTitle                110155 non-null  object 
 4   isAdult                      110155 non-null  int64  
 5   startYear                    110155 non-null  object 
 6   endYear                      110155 non-null  object 
 7   runtimeMinutes               110155 non-null  object 
 8   genres                       110155 non-null  object 
 9   averageRating                110155 non-null  float64
 10  numVotes                     110155 non-null  int64  
 11  Budget                       26636 non-null   float64
 12  Gross US & Canada            18139 non-null   float64
 13 

In [4]:
data["Critic reviews"] = data["Critic reviews"].fillna(0)
data["User reviews"] = data["User reviews"].fillna(0)

data["isAdult2"] = data.apply(lambda row: int("Adult" in row["genres"]), axis=1)
print(len(data[data["isAdult"] != data["isAdult2"]]))  # We use the one based on the genres

# tconst was only required for joins
# titleType is only films for us, we filtered them
# we do not use the titles as predictors
# endYear is None for all films
# isAdult will be added back in a consistent format later on

# We drop writers and directors. These are interesting features,
# but having them as binary columns would be infeasible.
data = data.drop(columns=["tconst", "titleType", "primaryTitle", "originalTitle", "endYear", "isAdult", "isAdult2", "Gross US & Canada", "Opening weekend US & Canada", "genres", "writers", "directors", "Rating"])
data = data.dropna()

52


In [5]:
def clean_unknowns(row, column):
    if row[column] == "\\N":
        return None
    else:
        return row[column]

def clean_reviews(row, column):
    if isinstance(row[column], str) and "K" in row[column]:
        # print(row[column], end=" -> ")
        if "." in row[column]:
            # print(int(row[column][:-3]) * 1000 + int(row[column][-2]) * 100)
            return int(row[column][:-3]) * 1000 + int(row[column][-2]) * 100
        else:
            # print(int(row[column][:-1]) * 1000)
            return int(row[column][:-1]) * 1000
    else:
        return row[column]

# Just an example of problematic data types
# print("Problematic form")
# print(data.startYear.unique())
# print(data.runtimeMinutes.unique())
# print(data["User reviews"].unique())
# print(data["Critic reviews"].unique())

data["startYear"] = data.apply(lambda row: clean_unknowns(row, "startYear"), axis=1)
data["runtimeMinutes"] = data.apply(lambda row: clean_unknowns(row, "runtimeMinutes"), axis=1)
data["User reviews"] = data.apply(lambda row: clean_reviews(row, "User reviews"), axis=1)
data["Critic reviews"] = data.apply(lambda row: clean_reviews(row, "Critic reviews"), axis=1)

for column in ["startYear", "runtimeMinutes", "User reviews", "Critic reviews"]:
    data[column] = pd.to_numeric(data[column])

# print("Resolved form")
# print(data.startYear.unique())
# print(data.runtimeMinutes.unique())
# print(data["User reviews"].unique())
# print(data["Critic reviews"].unique())

In [6]:
print("All features present:")
filtered = data.dropna()  # data.dropna()
print(len(filtered))
print()

All features present:
13624



In [7]:
filtered.corr().style.background_gradient(cmap='coolwarm')

Unnamed: 0,startYear,runtimeMinutes,averageRating,numVotes,Budget,Gross worldwide,User reviews,Critic reviews
startYear,1.0,-0.056859,-0.209853,-0.056398,-0.09741,-0.100112,-0.00755,0.072536
runtimeMinutes,-0.056859,1.0,0.308394,0.252949,0.277725,0.21667,0.255099,0.20125
averageRating,-0.209853,0.308394,1.0,0.34828,0.115826,0.202252,0.253301,0.320207
numVotes,-0.056398,0.252949,0.34828,1.0,0.509107,0.617186,0.7587,0.649738
Budget,-0.09741,0.277725,0.115826,0.509107,1.0,0.631881,0.521016,0.516788
Gross worldwide,-0.100112,0.21667,0.202252,0.617186,0.631881,1.0,0.544276,0.470606
User reviews,-0.00755,0.255099,0.253301,0.7587,0.521016,0.544276,1.0,0.656517
Critic reviews,0.072536,0.20125,0.320207,0.649738,0.516788,0.470606,0.656517,1.0


In [8]:
test_indices = np.random.choice(len(filtered), replace=False, size=int(len(filtered) / 10))
test_set = filtered.iloc[test_indices]
test_set, test_targets = test_set.drop("averageRating", axis=1).to_numpy(), test_set["averageRating"].to_numpy()
# test_targets = (test_targets - 1) / 9
train_set = filtered.iloc[~test_indices]
train_set, train_targets = train_set.drop("averageRating", axis=1).to_numpy(), train_set["averageRating"].to_numpy()
# train_targets = (train_targets - 1) / 9

In [9]:
filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13624 entries, 201 to 110154
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   startYear        13624 non-null  int64  
 1   runtimeMinutes   13624 non-null  float64
 2   averageRating    13624 non-null  float64
 3   numVotes         13624 non-null  int64  
 4   Budget           13624 non-null  float64
 5   Gross worldwide  13624 non-null  float64
 6   User reviews     13624 non-null  float64
 7   Critic reviews   13624 non-null  float64
dtypes: float64(6), int64(2)
memory usage: 957.9 KB


In [10]:
# from sklearn.linear_model import LinearRegression

# lr = LinearRegression().fit(train_set, train_targets)
# print(lr.predict(train_set) * 9 + 1)
# print(train_targets * 9 + 1)
# # print(lr.score(test_set, test_targets))

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.transforms import ToTensor

test_set = torch.from_numpy(test_set)
test_set_normalized = (test_set - test_set.mean(dim=0, keepdims=True)) / test_set.std(dim=0, keepdims=True)
test_set_normalized = torch.nan_to_num(test_set_normalized, nan=0)
test_targets = torch.from_numpy(test_targets)

train_set = torch.from_numpy(train_set)
train_set_normalized = (train_set - train_set.mean(dim=0, keepdims=True)) / train_set.std(dim=0, keepdims=True)
train_set_normalized = torch.nan_to_num(train_set_normalized, nan=0)
train_targets = torch.from_numpy(train_targets)

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(7, 1, dtype=torch.double)
    
    def forward(self, x):
        return torch.sigmoid(self.layer1(x))

loss_fn = nn.BCELoss()

model = Model()
opt = torch.optim.SGD(model.parameters(), lr=0.001)
    
for epoch in range(2000):
    pred = model(train_set_normalized)
    loss = loss_fn(pred.squeeze(), (train_targets - 1) / 9)
    if epoch % 100 == 99:
        print(f"Epoch {epoch + 1}, Training Loss {loss.item()}", end=", ")
        with torch.no_grad():
            pred = model(test_set_normalized)
            val_loss = loss_fn(pred.squeeze(), (test_targets - 1) / 9)
            print(f"Validation Loss {val_loss.item()}")
        print()
    
    opt.zero_grad()
    loss.backward()
    opt.step()

Epoch 100, Training Loss 0.7483490516260844, Validation Loss 0.7425193407063277

Epoch 200, Training Loss 0.7404589105502031, Validation Loss 0.7350049962551937

Epoch 300, Training Loss 0.7334093096397841, Validation Loss 0.7282799679887543

Epoch 400, Training Loss 0.727151253721029, Validation Loss 0.722303679851038

Epoch 500, Training Loss 0.7216285207970248, Validation Loss 0.7170277712890247

Epoch 600, Training Loss 0.7167796396044779, Validation Loss 0.7123974947996796

Epoch 700, Training Loss 0.7125402891582521, Validation Loss 0.7083537615434038

Epoch 800, Training Loss 0.7088457444439306, Validation Loss 0.7048355537906844

Epoch 900, Training Loss 0.705633044289826, Validation Loss 0.7017823370469876

Epoch 1000, Training Loss 0.7028426825175697, Validation Loss 0.6991361329486374

Epoch 1100, Training Loss 0.7004197565489396, Validation Loss 0.6968430427816644

Epoch 1200, Training Loss 0.698314607206945, Validation Loss 0.6948541707016005

Epoch 1300, Training Loss 0.6

In [12]:
train_targets

tensor([2.8000, 6.8000, 6.2000,  ..., 6.4000, 5.4000, 5.6000],
       dtype=torch.float64)