### Selecting columns, viualizing

In [1]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import itertools

In [2]:
data = pd.read_csv("data_clean.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110155 entries, 0 to 110154
Data columns (total 20 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   tconst                       110155 non-null  object 
 1   titleType                    110155 non-null  object 
 2   primaryTitle                 110155 non-null  object 
 3   originalTitle                110155 non-null  object 
 4   isAdult                      110155 non-null  int64  
 5   startYear                    110155 non-null  object 
 6   endYear                      110155 non-null  object 
 7   runtimeMinutes               110155 non-null  object 
 8   genres                       110155 non-null  object 
 9   averageRating                110155 non-null  float64
 10  numVotes                     110155 non-null  int64  
 11  Budget                       26636 non-null   float64
 12  Gross US & Canada            18139 non-null   float64
 13 

In [4]:
data["Critic reviews"] = data["Critic reviews"].fillna(0)
data["User reviews"] = data["User reviews"].fillna(0)

data["isAdult2"] = data.apply(lambda row: int("Adult" in row["genres"]), axis=1)
print(len(data[data["isAdult"] != data["isAdult2"]]))  # We use the one based on the genres

# tconst was only required for joins
# titleType is only films for us, we filtered them
# we do not use the titles as predictors
# endYear is None for all films
# isAdult will be added back in a consistent format later on
data = data.drop(columns=["tconst", "titleType", "primaryTitle", "originalTitle", "endYear", "isAdult", "isAdult2", "Gross US & Canada", "Opening weekend US & Canada"])
data = data.dropna()

52


In [5]:
genre_list = data["genres"].unique().tolist()
for i, entry in enumerate(genre_list):
    genre_list[i] = entry.split(",")

genre_set = set(itertools.chain(*genre_list))
print(genre_set)

{'Family', 'News', 'Documentary', 'Music', 'Action', 'History', 'Western', 'Musical', 'Adventure', 'Fantasy', 'Biography', 'Sci-Fi', 'Mystery', 'Film-Noir', 'Thriller', 'War', 'Animation', 'Romance', 'Crime', 'Drama', 'Horror', 'Comedy', 'Sport'}


In [6]:
for genre in genre_set:
    print(genre, end=" ")
    data[f"is{genre}"] = data.apply(lambda row: int(genre in row["genres"]), axis=1)

Family News Documentary Music Action History Western Musical Adventure Fantasy Biography Sci-Fi Mystery Film-Noir Thriller War Animation Romance Crime Drama Horror Comedy Sport 

In [7]:
results = []
for genre in genre_set:
    results.append(data[f"is{genre}"].sum())

sum_results = sum(results)
for genre in genre_set:
    print(genre, data[f"is{genre}"].sum() / sum_results * 100, "%")

Family 2.2494110494718447 %
News 0.022798084960863288 %
Documentary 1.0487119081997112 %
Music 1.740253818679231 %
Action 9.514400790333612 %
History 1.4590774374952504 %
Western 0.35716999772019153 %
Musical 0.49775818831218177 %
Adventure 6.6874382551865645 %
Fantasy 2.807964131012995 %
Biography 2.587582643057983 %
Sci-Fi 2.5229880690022037 %
Mystery 3.480507637358462 %
Film-Noir 0.08359297818983205 %
Thriller 6.584846872862679 %
War 0.8625275476859944 %
Animation 1.9606353066342428 %
Romance 6.706436659320617 %
Crime 7.504369632950832 %
Drama 21.730374648529523 %
Horror 4.3202371000835935 %
Comedy 14.36279352534387 %
Sport 0.9081237176077209 %


In [8]:
# We drop writers and directors. These are interesting features,
# but having them as binary columns would be infeasible.
# Genres are added as binary predictors, thus the genres column is no longer used.
data = data.drop(columns=["genres"])  # "isMusical", "isFilm-Noir", "isNews", "isSport", "genres"])

def unrated_to_not_rated(row):
    if row["Rating"] == "Unrated":
        return "Not Rated"
    else:
        return row["Rating"]

data["Rating"] = data.apply(unrated_to_not_rated, axis=1)

In [9]:
data[f"isRated"] = data.apply(lambda row: int(row["Rating"] != "Not Rated"), axis=1)

In [10]:
data = data.drop(columns=["Rating"])

In [11]:
def clean_unknowns(row, column):
    if row[column] == "\\N":
        return None
    else:
        return row[column]

def clean_reviews(row, column):
    if isinstance(row[column], str) and "K" in row[column]:
        # print(row[column], end=" -> ")
        if "." in row[column]:
            # print(int(row[column][:-3]) * 1000 + int(row[column][-2]) * 100)
            return int(row[column][:-3]) * 1000 + int(row[column][-2]) * 100
        else:
            # print(int(row[column][:-1]) * 1000)
            return int(row[column][:-1]) * 1000
    else:
        return row[column]

# Just an example of problematic data types
# print("Problematic form")
# print(data.startYear.unique())
# print(data.runtimeMinutes.unique())
# print(data["User reviews"].unique())
# print(data["Critic reviews"].unique())

data["startYear"] = data.apply(lambda row: clean_unknowns(row, "startYear"), axis=1)
data["runtimeMinutes"] = data.apply(lambda row: clean_unknowns(row, "runtimeMinutes"), axis=1)
data["User reviews"] = data.apply(lambda row: clean_reviews(row, "User reviews"), axis=1)
data["Critic reviews"] = data.apply(lambda row: clean_reviews(row, "Critic reviews"), axis=1)

for column in ["startYear", "runtimeMinutes", "User reviews", "Critic reviews"]:
    data[column] = pd.to_numeric(data[column])

# print("Resolved form")
# print(data.startYear.unique())
# print(data.runtimeMinutes.unique())
# print(data["User reviews"].unique())
# print(data["Critic reviews"].unique())

In [12]:
print("All features present:")
filtered = data.dropna()  # data.dropna()
print(len(filtered))
print()

All features present:
10562



In [13]:
filtered.corr().style.background_gradient(cmap='coolwarm')

Unnamed: 0,startYear,runtimeMinutes,averageRating,numVotes,Budget,Gross worldwide,User reviews,Critic reviews,isFamily,isNews,isDocumentary,isMusic,isAction,isHistory,isWestern,isMusical,isAdventure,isFantasy,isBiography,isSci-Fi,isMystery,isFilm-Noir,isThriller,isWar,isAnimation,isRomance,isCrime,isDrama,isHorror,isComedy,isSport,isRated
startYear,1.0,-0.030919,-0.204162,0.004507,-0.016892,-0.054407,0.064405,0.203674,-0.00732,0.013105,0.06839,-0.049867,0.084793,0.01842,-0.084198,-0.081294,0.016889,-0.005734,0.048861,-0.019617,-0.010743,-0.180182,0.039527,-0.054074,0.061577,-0.0705,-0.030923,-0.020164,0.01172,-0.027649,-0.001571,-0.16177
runtimeMinutes,-0.030919,1.0,0.335605,0.261957,0.279417,0.221481,0.261547,0.193324,-0.075985,-0.027478,-0.095144,0.044663,0.147309,0.175619,0.040377,0.069321,0.020706,-0.008828,0.150809,0.012908,-0.005716,-0.012372,-0.021827,0.099536,-0.190023,0.035345,0.031751,0.249276,-0.164164,-0.204311,0.023529,-0.068609
averageRating,-0.204162,0.335605,1.0,0.377936,0.095791,0.210008,0.26231,0.325429,-0.063359,0.021166,0.11424,0.03764,-0.092814,0.092044,0.0231,0.018313,-0.026922,-0.051033,0.156283,-0.038465,-0.005095,0.066436,-0.086248,0.074514,0.019196,0.018034,0.032985,0.266948,-0.195235,-0.117211,0.030436,0.009409
numVotes,0.004507,0.261957,0.377936,1.0,0.482263,0.603811,0.747314,0.6311,-0.019361,-0.008679,-0.067302,-0.040237,0.143369,-0.022566,-0.001119,-0.020248,0.172899,0.05618,0.006405,0.160302,0.024755,-0.000449,0.006589,-0.004278,0.032794,-0.064572,0.010013,-0.057002,-0.031783,-0.060488,-0.020705,0.178357
Budget,-0.016892,0.279417,0.095791,0.482263,1.0,0.617877,0.491508,0.466338,0.07475,-0.016945,-0.106829,-0.046347,0.289672,0.000557,0.002847,-0.001885,0.412365,0.12137,-0.025148,0.1866,-0.03767,-0.019419,-0.027466,-0.019611,0.158217,-0.078424,-0.033099,-0.186244,-0.117544,-0.001235,-0.015198,0.260555
Gross worldwide,-0.054407,0.221481,0.210008,0.603811,0.617877,1.0,0.527258,0.445153,0.061009,-0.008292,-0.056873,-0.024625,0.159052,-0.014252,-0.007663,-0.004118,0.301996,0.084354,-0.034011,0.119246,-0.030942,-0.017631,-0.023754,-0.022851,0.152249,-0.048262,-0.059265,-0.146848,-0.057679,0.002667,-0.023744,0.162683
User reviews,0.064405,0.261547,0.26231,0.747314,0.491508,0.527258,1.0,0.633603,-0.030697,-0.007224,-0.067261,-0.028153,0.170079,-0.018966,-0.007986,-0.000914,0.17692,0.083787,-0.014381,0.187139,0.053149,0.005065,0.027795,-0.006438,-0.014993,-0.074213,-0.017827,-0.057636,0.051391,-0.10504,-0.031831,0.177711
Critic reviews,0.203674,0.193324,0.325429,0.6311,0.466338,0.445153,0.633603,1.0,-0.050959,-0.014803,-0.0876,-0.04445,0.153866,-0.013767,-0.01171,-0.023402,0.172428,0.088007,0.05009,0.185863,0.081574,0.006148,0.043138,-0.026427,0.021022,-0.085269,-0.008673,-0.043671,0.10652,-0.097665,-0.04396,0.245953
isFamily,-0.00732,-0.075985,-0.063359,-0.019361,0.07475,0.061009,-0.030697,-0.050959,1.0,-0.00581,-0.011533,-0.013482,-0.097123,-0.036336,-0.014326,0.002445,0.147296,0.130022,-0.037156,-0.046155,-0.073628,-0.011133,-0.106847,-0.036114,0.099443,-0.080463,-0.106306,-0.111975,-0.084636,0.066228,0.021047,0.046875
isNews,0.013105,-0.027478,0.021166,-0.008679,-0.016945,-0.008292,-0.007224,-0.014803,-0.00581,1.0,0.145544,-0.005076,-0.01329,0.037827,-0.002259,-0.002672,-0.010661,-0.006539,-0.006259,-0.006175,-0.007347,-0.001089,-0.010563,-0.003533,-0.005403,-0.010679,-0.011434,-0.025908,-0.008281,-0.017799,-0.003628,-0.042387
