### Selecting columns, viualizing

In [1]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import itertools

In [2]:
data = pd.read_csv("../dat/data_clean.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110155 entries, 0 to 110154
Data columns (total 20 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   tconst                       110155 non-null  object 
 1   titleType                    110155 non-null  object 
 2   primaryTitle                 110155 non-null  object 
 3   originalTitle                110155 non-null  object 
 4   isAdult                      110155 non-null  int64  
 5   startYear                    110155 non-null  object 
 6   endYear                      110155 non-null  object 
 7   runtimeMinutes               110155 non-null  object 
 8   genres                       110155 non-null  object 
 9   averageRating                110155 non-null  float64
 10  numVotes                     110155 non-null  int64  
 11  Budget                       26636 non-null   float64
 12  Gross US & Canada            18139 non-null   float64
 13 

In [4]:
data["Critic reviews"] = data["Critic reviews"].fillna(0)
data["User reviews"] = data["User reviews"].fillna(0)

data["isAdult2"] = data.apply(lambda row: int("Adult" in row["genres"]), axis=1)
print(len(data[data["isAdult"] != data["isAdult2"]]))  # We use the one based on the genres

# tconst was only required for joins
# titleType is only films for us, we filtered them
# we do not use the titles as predictors
# endYear is None for all films
# isAdult will be added back in a consistent format later on

# We drop writers and directors. These are interesting features,
# but having them as binary columns would be infeasible.
data = data.drop(columns=[
    "tconst", "titleType", "primaryTitle", "originalTitle", "endYear",
    "isAdult", "isAdult2", "Gross US & Canada", "Opening weekend US & Canada",
    "writers", "directors"])
data = data.dropna()

52


In [5]:
genre_list = data["genres"].unique().tolist()
for i, entry in enumerate(genre_list):
    genre_list[i] = entry.split(",")

genre_set = set(itertools.chain(*genre_list))
print(genre_set)

{'Biography', 'Crime', 'Action', 'War', 'Drama', 'Animation', 'Family', 'History', 'Thriller', 'Fantasy', 'News', 'Romance', 'Film-Noir', 'Comedy', 'Western', 'Musical', 'Sci-Fi', 'Sport', 'Mystery', 'Adventure', 'Horror', 'Music', 'Documentary'}


In [6]:
# News - History - Biography - Documentary --> Documentary
# Film-Noir - Crime --> Crime
# Western - Action --> Action
genre_set.difference_update(["News", "History", "Biography", "Film-Noir", "Western"])
transformation_dict = {
    "Documentary":  ["News", "History", "Biography", "Documentary"],
    "Crime": ["Film-Noir", "Crime"],
    "Action": ["Western", "Action"]
}
for genre in genre_set:
    print(genre, end=" ")
    if genre not in transformation_dict:
        transformation_dict[genre] = [genre]
    data[f"is{genre}"] = data.apply(lambda row: int(any(g in row["genres"] for g in transformation_dict[genre])), axis=1)

Crime Action War Drama Animation Family Thriller Fantasy Romance Comedy Musical Sci-Fi Sport Mystery Adventure Horror Music Documentary 

In [7]:
results = []
for genre in genre_set:
    results.append(data[f"is{genre}"].sum())

sum_results = sum(results)
for genre in genre_set:
    print(genre, data[f"is{genre}"].sum() / sum_results * 100, "% - ", data[f"is{genre}"].sum())

Crime 7.606529736358063 % -  1985
Action 9.89423666462293 % -  2582
War 0.8698651134273452 % -  227
Drama 21.915236051502145 % -  5719
Animation 1.9773145309625997 % -  516
Family 2.2685469037400368 % -  592
Thriller 6.64086450030656 % -  1733
Fantasy 2.8318516247700796 % -  739
Romance 6.763488657265482 % -  1765
Comedy 14.484978540772534 % -  3780
Musical 0.5019926425505824 % -  131
Sci-Fi 2.5444512568976085 % -  664
Sport 0.9158491722869404 % -  239
Mystery 3.5101164929491104 % -  916
Adventure 6.7443286327406495 % -  1760
Horror 4.356989576946659 % -  1137
Music 1.7550582464745552 % -  458
Documentary 4.418301655426119 % -  1153


In [8]:
# Genres are added as binary predictors, thus the genres column is no longer used.
data = data.drop(columns=["genres"])  # "isMusical", "isFilm-Noir", "isNews", "isSport", "genres"])

def unrated_to_not_rated(row):
    if row["Rating"] == "Unrated":
        return "Not Rated"
    else:
        return row["Rating"]

data["Rating"] = data.apply(unrated_to_not_rated, axis=1)

In [9]:
data[f"isRated"] = data.apply(lambda row: int(row["Rating"] != "Not Rated"), axis=1)

In [10]:
data = data.drop(columns=["Rating"])

In [11]:
def clean_unknowns(row, column):
    if row[column] == "\\N":
        return None
    else:
        return row[column]

def clean_reviews(row, column):
    if isinstance(row[column], str) and "K" in row[column]:
        # print(row[column], end=" -> ")
        if "." in row[column]:
            # print(int(row[column][:-3]) * 1000 + int(row[column][-2]) * 100)
            return int(row[column][:-3]) * 1000 + int(row[column][-2]) * 100
        else:
            # print(int(row[column][:-1]) * 1000)
            return int(row[column][:-1]) * 1000
    else:
        return row[column]

# Just an example of problematic data types
# print("Problematic form")
# print(data.startYear.unique())
# print(data.runtimeMinutes.unique())
# print(data["User reviews"].unique())
# print(data["Critic reviews"].unique())

data["startYear"] = data.apply(lambda row: clean_unknowns(row, "startYear"), axis=1)
data["runtimeMinutes"] = data.apply(lambda row: clean_unknowns(row, "runtimeMinutes"), axis=1)
data["User reviews"] = data.apply(lambda row: clean_reviews(row, "User reviews"), axis=1)
data["Critic reviews"] = data.apply(lambda row: clean_reviews(row, "Critic reviews"), axis=1)

for column in ["startYear", "runtimeMinutes", "User reviews", "Critic reviews"]:
    data[column] = pd.to_numeric(data[column])

# print("Resolved form")
# print(data.startYear.unique())
# print(data.runtimeMinutes.unique())
# print(data["User reviews"].unique())
# print(data["Critic reviews"].unique())

In [12]:
print("All features present:")
filtered = data.dropna()  # data.dropna()
print(len(filtered))
print()

All features present:
10562



In [13]:
filtered.corr().style.background_gradient(cmap='coolwarm')

Unnamed: 0,startYear,runtimeMinutes,averageRating,numVotes,Budget,Gross worldwide,User reviews,Critic reviews,isCrime,isAction,isWar,isDrama,isAnimation,isFamily,isThriller,isFantasy,isRomance,isComedy,isMusical,isSci-Fi,isSport,isMystery,isAdventure,isHorror,isMusic,isDocumentary,isRated
startYear,1.0,-0.030919,-0.204162,0.004507,-0.016892,-0.054407,0.064405,0.203674,-0.040582,0.066481,-0.054074,-0.020164,0.061577,-0.00732,0.039527,-0.005734,-0.0705,-0.027649,-0.081294,-0.019617,-0.001571,-0.010743,0.016889,0.01172,-0.049867,0.072176,-0.16177
runtimeMinutes,-0.030919,1.0,0.335605,0.261957,0.279417,0.221481,0.261547,0.193324,0.031509,0.154181,0.099536,0.249276,-0.190023,-0.075985,-0.021827,-0.008828,0.035345,-0.204311,0.069321,0.012908,0.023529,-0.005716,0.020706,-0.164164,0.044663,0.145047,-0.068609
averageRating,-0.204162,0.335605,1.0,0.377936,0.095791,0.210008,0.26231,0.325429,0.036321,-0.086782,0.074514,0.266948,0.019196,-0.063359,-0.086248,-0.051033,0.018034,-0.117211,0.018313,-0.038465,0.030436,-0.005095,-0.026922,-0.195235,0.03764,0.197378,0.009409
numVotes,0.004507,0.261957,0.377936,1.0,0.482263,0.603811,0.747314,0.6311,0.009865,0.141922,-0.004278,-0.057002,0.032794,-0.019361,0.006589,0.05618,-0.064572,-0.060488,-0.020248,0.160302,-0.020705,0.024755,0.172899,-0.031783,-0.040237,-0.035625,0.178357
Budget,-0.016892,0.279417,0.095791,0.482263,1.0,0.617877,0.491508,0.466338,-0.033721,0.286197,-0.019611,-0.186244,0.158217,0.07475,-0.027466,0.12137,-0.078424,-0.001235,-0.001885,0.1866,-0.015198,-0.03767,0.412365,-0.117544,-0.046347,-0.060535,0.260555
Gross worldwide,-0.054407,0.221481,0.210008,0.603811,0.617877,1.0,0.527258,0.445153,-0.060083,0.156225,-0.022851,-0.146848,0.152249,0.061009,-0.023754,0.084354,-0.048262,0.002667,-0.004118,0.119246,-0.023744,-0.030942,0.301996,-0.057679,-0.024625,-0.051698,0.162683
User reviews,0.064405,0.261547,0.26231,0.747314,0.491508,0.527258,1.0,0.633603,-0.017713,0.166815,-0.006438,-0.057636,-0.014993,-0.030697,0.027795,0.083787,-0.074213,-0.10504,-0.000914,0.187139,-0.031831,0.053149,0.17692,0.051391,-0.028153,-0.047079,0.177711
Critic reviews,0.203674,0.193324,0.325429,0.6311,0.466338,0.445153,0.633603,1.0,-0.008447,0.149408,-0.026427,-0.043671,0.021022,-0.050959,0.043138,0.088007,-0.085269,-0.097665,-0.023402,0.185863,-0.04396,0.081574,0.172428,0.10652,-0.04445,-0.009732,0.245953
isCrime,-0.040582,0.031509,0.036321,0.009865,-0.033721,-0.060083,-0.017713,-0.008447,1.0,0.156626,-0.067954,0.029759,-0.102283,-0.10669,0.118645,-0.11485,-0.158968,-0.097273,-0.029824,-0.100638,-0.066681,0.084262,-0.16179,-0.108445,-0.066726,-0.07126,0.034315
isAction,0.066481,0.154181,-0.086782,0.141922,0.286197,0.156225,0.166815,0.149408,0.156626,1.0,-0.022016,-0.225961,-0.037959,-0.098382,0.085272,0.01066,-0.212863,-0.18707,-0.057774,0.154005,-0.043591,-0.115795,0.268838,-0.094502,-0.115697,-0.109403,0.009663


In [14]:
test_indices = np.random.choice(len(filtered), replace=False, size=int(len(filtered) / 10))
test_set = filtered.iloc[test_indices]
test_set, test_targets = test_set.drop("averageRating", axis=1).to_numpy(), test_set["averageRating"].to_numpy()
# test_targets = (test_targets - 1) / 9
train_set = filtered.iloc[~test_indices]
train_set, train_targets = train_set.drop("averageRating", axis=1).to_numpy(), train_set["averageRating"].to_numpy()
# train_targets = (train_targets - 1) / 9

In [15]:
filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10562 entries, 201 to 110154
Data columns (total 27 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   startYear        10562 non-null  int64  
 1   runtimeMinutes   10562 non-null  int64  
 2   averageRating    10562 non-null  float64
 3   numVotes         10562 non-null  int64  
 4   Budget           10562 non-null  float64
 5   Gross worldwide  10562 non-null  float64
 6   User reviews     10562 non-null  float64
 7   Critic reviews   10562 non-null  float64
 8   isCrime          10562 non-null  int64  
 9   isAction         10562 non-null  int64  
 10  isWar            10562 non-null  int64  
 11  isDrama          10562 non-null  int64  
 12  isAnimation      10562 non-null  int64  
 13  isFamily         10562 non-null  int64  
 14  isThriller       10562 non-null  int64  
 15  isFantasy        10562 non-null  int64  
 16  isRomance        10562 non-null  int64  
 17  isComedy 

In [16]:
# from sklearn.linear_model import LinearRegression

# lr = LinearRegression().fit(train_set, train_targets)
# print(lr.predict(train_set) * 9 + 1)
# print(train_targets * 9 + 1)
# # print(lr.score(test_set, test_targets))

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.transforms import ToTensor

test_set = torch.from_numpy(test_set)
test_set_normalized = (test_set - test_set.mean(dim=0, keepdims=True)) / test_set.std(dim=0, keepdims=True)
test_set_normalized = torch.nan_to_num(test_set_normalized, nan=0)
test_targets = torch.from_numpy(test_targets)

train_set = torch.from_numpy(train_set)
train_set_normalized = (train_set - train_set.mean(dim=0, keepdims=True)) / train_set.std(dim=0, keepdims=True)
train_set_normalized = torch.nan_to_num(train_set_normalized, nan=0)
train_targets = torch.from_numpy(train_targets)

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(26, 1, dtype=torch.double)

    def forward(self, x):
        return torch.sigmoid(self.layer1(x)) * 9 + 1

loss_fn = nn.MSELoss()

model = Model()
opt = torch.optim.SGD(model.parameters(), lr=0.001)

for epoch in range(2000):
    pred = model(train_set_normalized)
    loss = loss_fn(pred.squeeze(), train_targets)
    if epoch % 100 == 99:
        print(f"Epoch {epoch + 1}, Training Loss {loss.item():.4f}", end=", ")
        with torch.no_grad():
            pred = model(test_set_normalized)
            val_loss = loss_fn(pred.squeeze(), test_targets)
            print(f"Validation Loss {val_loss.item():.4f}")
        print()

    opt.zero_grad()
    loss.backward()
    opt.step()

Epoch 100, Training Loss 1.1322, Validation Loss 1.0278

Epoch 200, Training Loss 0.8259, Validation Loss 0.7778

Epoch 300, Training Loss 0.7561, Validation Loss 0.7282

Epoch 400, Training Loss 0.7313, Validation Loss 0.7111

Epoch 500, Training Loss 0.7196, Validation Loss 0.7026

Epoch 600, Training Loss 0.7131, Validation Loss 0.6976

Epoch 700, Training Loss 0.7091, Validation Loss 0.6944

Epoch 800, Training Loss 0.7065, Validation Loss 0.6922

Epoch 900, Training Loss 0.7047, Validation Loss 0.6907

Epoch 1000, Training Loss 0.7035, Validation Loss 0.6896

Epoch 1100, Training Loss 0.7026, Validation Loss 0.6888

Epoch 1200, Training Loss 0.7019, Validation Loss 0.6881

Epoch 1300, Training Loss 0.7014, Validation Loss 0.6876

Epoch 1400, Training Loss 0.7011, Validation Loss 0.6872

Epoch 1500, Training Loss 0.7008, Validation Loss 0.6868

Epoch 1600, Training Loss 0.7006, Validation Loss 0.6866

Epoch 1700, Training Loss 0.7004, Validation Loss 0.6863

Epoch 1800, Training Lo

In [18]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(26, 32, dtype=torch.double)
        self.layer2 = nn.Linear(32, 1, dtype=torch.double)
    
    def forward(self, x):
        x = F.relu(self.layer1(x))
    
        return torch.sigmoid(self.layer2(x)) * 9 + 1

loss_fn = nn.MSELoss()

model = Model()
opt = torch.optim.Adam(model.parameters(), lr=0.001)
    
for epoch in range(5000):
    pred = model(train_set_normalized)
    loss = loss_fn(pred.squeeze(), train_targets)
    if epoch % 100 == 99:
        print(f"Epoch {epoch + 1}, Training Loss {loss.item():.4f}", end=", ")
        with torch.no_grad():
            pred = model(test_set_normalized)
            val_loss = loss_fn(pred.squeeze(), test_targets)
            print(f"Validation Loss {val_loss.item():.4f}")
        print()

    opt.zero_grad()
    loss.backward()
    opt.step()

Epoch 100, Training Loss 0.6814, Validation Loss 0.7304

Epoch 200, Training Loss 0.5810, Validation Loss 0.6745

Epoch 300, Training Loss 0.5299, Validation Loss 0.6647

Epoch 400, Training Loss 0.4900, Validation Loss 0.6601

Epoch 500, Training Loss 0.4543, Validation Loss 0.6558

Epoch 600, Training Loss 0.4229, Validation Loss 0.6583

Epoch 700, Training Loss 0.3959, Validation Loss 0.6620

Epoch 800, Training Loss 0.3724, Validation Loss 0.6767

Epoch 900, Training Loss 0.3482, Validation Loss 0.6914

Epoch 1000, Training Loss 0.3265, Validation Loss 0.7060

Epoch 1100, Training Loss 0.3075, Validation Loss 0.7243

Epoch 1200, Training Loss 0.2924, Validation Loss 0.7407

Epoch 1300, Training Loss 0.2804, Validation Loss 0.7616

Epoch 1400, Training Loss 0.2670, Validation Loss 0.7815

Epoch 1500, Training Loss 0.2560, Validation Loss 0.8018

Epoch 1600, Training Loss 0.2464, Validation Loss 0.8291

Epoch 1700, Training Loss 0.2378, Validation Loss 0.8493

Epoch 1800, Training Lo