In [141]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

In [142]:
#Importamos los datasets.

credits_df = pd.read_csv("credits_df.csv")

titles_df = pd.read_csv("titles_df.csv")

In [143]:
credits_df.head()

Unnamed: 0,person_id,id,name,character,role
0,3748,tm84618,Robert De Niro,Travis Bickle,ACTOR
1,14658,tm84618,Jodie Foster,Iris Steensma,ACTOR
2,7064,tm84618,Albert Brooks,Tom,ACTOR
3,3739,tm84618,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR
4,48933,tm84618,Cybill Shepherd,Betsy,ACTOR


In [144]:
credits_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77801 entries, 0 to 77800
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   person_id  77801 non-null  int64 
 1   id         77801 non-null  object
 2   name       77801 non-null  object
 3   character  68029 non-null  object
 4   role       77801 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.0+ MB


In [145]:
#Eliminamos los espacios en blanco al inicio y al final de las columnas 'name' y 'role' y lo dejamos todo en minuscula en el dataset de credits_df.

credits_df["name"] = credits_df["name"].str.lower().str.strip()

credits_df["role"] = credits_df["role"].str.lower().str.strip()

In [146]:
#Nos quedamos con todos los id que empiezan por 'tm', ya que son los id que identifican a las películas en el dataset de credits_df.

credits_df = credits_df[credits_df["id"].str.startswith("tm") == True]

credits_df = credits_df.reset_index(drop = True)

In [147]:
#Eliminamos la columna character, ya que no es relevante para el objetivo del proyecto en el dataset de credits_df.

columns_to_drop_credits = ["character"]

credits_df = credits_df.drop(columns = columns_to_drop_credits, axis = 1)

In [148]:
credits_df

Unnamed: 0,person_id,id,name,role
0,3748,tm84618,robert de niro,actor
1,14658,tm84618,jodie foster,actor
2,7064,tm84618,albert brooks,actor
3,3739,tm84618,harvey keitel,actor
4,48933,tm84618,cybill shepherd,actor
...,...,...,...,...
63086,736339,tm1059008,adelaida buscato,actor
63087,399499,tm1059008,luz stella luengas,actor
63088,373198,tm1059008,inés prieto,actor
63089,378132,tm1059008,isabel gaona,actor


In [149]:
titles_df.head(2)

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,['documentation'],['US'],1.0,,,,0.6,
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],,tt0075314,8.2,808582.0,40.965,8.179


In [150]:
titles_df.shape

(5850, 15)

In [151]:
titles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5850 entries, 0 to 5849
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    5850 non-null   object 
 1   title                 5849 non-null   object 
 2   type                  5850 non-null   object 
 3   description           5832 non-null   object 
 4   release_year          5850 non-null   int64  
 5   age_certification     3231 non-null   object 
 6   runtime               5850 non-null   int64  
 7   genres                5850 non-null   object 
 8   production_countries  5850 non-null   object 
 9   seasons               2106 non-null   float64
 10  imdb_id               5447 non-null   object 
 11  imdb_score            5368 non-null   float64
 12  imdb_votes            5352 non-null   float64
 13  tmdb_popularity       5759 non-null   float64
 14  tmdb_score            5539 non-null   float64
dtypes: float64(5), int64(

In [152]:
titles_df.nunique()

id                      5850
title                   5798
type                       2
description             5829
release_year              63
age_certification         11
runtime                  202
genres                  1726
production_countries     452
seasons                   26
imdb_id                 5447
imdb_score                80
imdb_votes              3880
tmdb_popularity         4889
tmdb_score               394
dtype: int64

In [153]:
titles_df["age_certification"].unique()

array(['TV-MA', 'R', 'PG', nan, 'TV-14', 'PG-13', 'TV-PG', 'TV-Y', 'TV-G',
       'TV-Y7', 'G', 'NC-17'], dtype=object)

In [154]:
titles_df["title"].isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
5845    False
5846    False
5847    False
5848    False
5849    False
Name: title, Length: 5850, dtype: bool

In [155]:
titles_df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
5845    False
5846    False
5847    False
5848    False
5849    False
Length: 5850, dtype: bool

In [156]:
#Eliminamos SHOW de la columna 'type' en el dataset de titles_df.

new_type = ["MOVIE"]

titles_df = titles_df[titles_df["type"].isin(new_type) == True]

In [157]:
#Eliminamos las columnas 'type', 'seasons', 'age_certification' e 'imdb_id' del dataset titles_df porque no son relevantes para el objetivo del proyecto.

columns_to_drop = ["type", "age_certification", "seasons", "imdb_id", "tmdb_popularity"]

titles_df = titles_df.drop(columns = columns_to_drop, axis = 1)

In [158]:
titles_df.nunique()

id                      3744
title                   3724
description             3734
release_year              62
runtime                  197
genres                  1105
production_countries     418
imdb_score                74
imdb_votes              2747
tmdb_score               266
dtype: int64

In [159]:
#Eliminamos el único título nulo que existe en el dataset de titles_df.

titles_df = titles_df.dropna(subset = ["title"])

titles_df["title"].isnull().sum()

0

In [160]:
#Descartamos que haya títulos duplicados que tengan el mismo año de lanzamiento.

grouped = titles_df[["title", "release_year"]]

grouped.duplicated().sum()

0

In [161]:
""" Eliminamos las filas que no tienen género:
- Creamos un nuevo dataframe solo con las columnas 'title' y 'genres'.
- Creamos una lista vacía para añadir el id de los registros sin género.
- Quitamos corchetes y comillas de los géneros y creamos una lista nueva que tome cada palabra por separado.
- Si la lista está vacía, la añadimos a la nueva lista.
- Reseteamos el índice.
- Eliminamos las filas sin género del dataframe principal.

"""
grouped_2 = titles_df[["title", "genres"]].reset_index(drop = True)
grouped_2

id_to_erase = []

for name in range(0, len(grouped_2)):
    l = grouped_2.iloc[name]["genres"].replace('[', "").replace("]", "").replace("'", "").split(", ")
    if l[0] == "":
        id_to_erase.append(name)

titles_df.reset_index(drop = True, inplace = True)

titles_df = titles_df.drop(id_to_erase, axis = 0).dropna(subset = "genres")

titles_df = titles_df.reset_index(drop = True)


In [162]:
#Transformamos los datos de 'genres' a una lista.

list_genre = []

for name in range(0, len(titles_df)):
    list_genre.append(titles_df.iloc[name]["genres"].replace('[', "").replace("]", "").replace("'", "").split(", "))

titles_df["genres"] = list_genre

In [163]:
#Transformamos los datos de production_countries a una lista.

list_country = []

for name in range(0, len(titles_df)):
    list_country.append(titles_df.iloc[name]["production_countries"].replace('[', "").replace("]", "").replace("'", "").split(", "))

titles_df["production_countries"] = list_country

In [164]:
#Eliminamos los espacios en blanco al inicio y al final y lo dejamos todo en minuscula en las columnas 'title' y 'description' en el dataset de titles_df.

titles_df["title"] = titles_df["title"].str.lower().str.strip()

titles_df["description"] = titles_df["description"].str.lower().str.strip()

In [165]:
#Comprobamos que todos los anyos están compuestos por 4 dígitos.

id_to_erase = []

for name in range(0, len(titles_df)):
    year = str(titles_df.iloc[name]["release_year"])
    
    if len(year) != 4:
        id_to_erase.append(name)


id_to_erase

[]

In [166]:
credits_df

Unnamed: 0,person_id,id,name,role
0,3748,tm84618,robert de niro,actor
1,14658,tm84618,jodie foster,actor
2,7064,tm84618,albert brooks,actor
3,3739,tm84618,harvey keitel,actor
4,48933,tm84618,cybill shepherd,actor
...,...,...,...,...
63086,736339,tm1059008,adelaida buscato,actor
63087,399499,tm1059008,luz stella luengas,actor
63088,373198,tm1059008,inés prieto,actor
63089,378132,tm1059008,isabel gaona,actor


In [167]:
#Extraemos 1 vez cada id y lo metemos en una lista.

id=credits_df.id.unique()

In [168]:
#Creamos un nuevo data-frame con el id de la película y los nombres de actores y directores.

df_unique_film = pd.DataFrame(index = id, columns = ["names","directors"])

In [169]:
#Rellenamos las columnas de nobres de actores y de directores

for i in df_unique_film.index:
    df_unique_film.loc[i,"names"] = list(credits_df[(credits_df["id"] == i) & (credits_df["role"] == "actor")]["name"])
    df_unique_film.loc[i,"directors"] = list(credits_df[(credits_df["id"] == i) & (credits_df["role"] == "director")]["name"])

In [170]:
df_unique_film

Unnamed: 0,names,directors
tm84618,"[robert de niro, jodie foster, albert brooks, ...",[martin scorsese]
tm154986,"[jon voight, burt reynolds, ned beatty, ronny ...",[john boorman]
tm127384,"[graham chapman, john cleese, eric idle, terry...","[terry jones, terry gilliam]"
tm120801,"[lee marvin, ernest borgnine, charles bronson,...",[robert aldrich]
tm70993,"[graham chapman, john cleese, terry gilliam, e...",[terry jones]
...,...,...
tm1216735,[],[joe penney]
tm985215,"[maymay entrata, edward barber, snooky serna, ...",[barry gonzalez]
tm1097142,"[ahmed hatem, jamila awad, mahmoud al-bezzawy,...",[mohamed bakir]
tm1014599,"[richard mofe-damijo, ego nwosu, keppy ekpenyo...",[seyi babatope]


In [171]:
""" Esta alternativa para crear nuevas columnas con los actores y directores tardaba mucho en ejecutarse, así que se descartó al encontrar la anternativa anterior.
df = credits_df.head(1000)
diccionario_actores = {}
diccionario_directores = {}
for id in df["id"]:
    list_actors = []
    list_director = []
    for i in range(0, len(df)):
        if df.iloc[i]["id"] == id:
            if df.iloc[i]["role"] == "actor":
                list_actors.append(df.iloc[i]["name"])
            elif df.iloc[i]["role"] == "director":
                list_director.append(df.iloc[i]["name"])
    diccionario_actores[id] = list_actors
    diccionario_directores[id] = list_director    
"""

' Esta alternativa para crear nuevas columnas con los actores y directores tardaba mucho en ejecutarse, así que se descartó al encontrar la anternativa anterior.\ndf = credits_df.head(1000)\ndiccionario_actores = {}\ndiccionario_directores = {}\nfor id in df["id"]:\n    list_actors = []\n    list_director = []\n    for i in range(0, len(df)):\n        if df.iloc[i]["id"] == id:\n            if df.iloc[i]["role"] == "actor":\n                list_actors.append(df.iloc[i]["name"])\n            elif df.iloc[i]["role"] == "director":\n                list_director.append(df.iloc[i]["name"])\n    diccionario_actores[id] = list_actors\n    diccionario_directores[id] = list_director    \n'

In [172]:
#Reseteamos el índice del dataset original.

titles_df.set_index('id', inplace = True)

In [173]:
#Unimos el nuevo dataset al original de titles_df para tener toda la información relevante para el objetivo del proyecto en un mismo dataframe.

total_df = pd.merge(titles_df, df_unique_film, how = 'left', left_index = True, right_index = True)

In [174]:
total_df

Unnamed: 0_level_0,title,description,release_year,runtime,genres,production_countries,imdb_score,imdb_votes,tmdb_score,names,directors
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
tm84618,taxi driver,a mentally unstable vietnam war veteran works ...,1976,114,"[drama, crime]",[US],8.2,808582.0,8.179,"[robert de niro, jodie foster, albert brooks, ...",[martin scorsese]
tm154986,deliverance,intent on seeing the cahulawassee river before...,1972,109,"[drama, action, thriller, european]",[US],7.7,107673.0,7.300,"[jon voight, burt reynolds, ned beatty, ronny ...",[john boorman]
tm127384,monty python and the holy grail,"king arthur, accompanied by his squire, recrui...",1975,91,"[fantasy, action, comedy]",[GB],8.2,534486.0,7.811,"[graham chapman, john cleese, eric idle, terry...","[terry jones, terry gilliam]"
tm120801,the dirty dozen,12 american military prisoners in world war ii...,1967,150,"[war, action]","[GB, US]",7.7,72662.0,7.600,"[lee marvin, ernest borgnine, charles bronson,...",[robert aldrich]
tm70993,life of brian,"brian cohen is an average young jewish man, bu...",1979,94,[comedy],[GB],8.0,395024.0,7.800,"[graham chapman, john cleese, terry gilliam, e...",[terry jones]
...,...,...,...,...,...,...,...,...,...,...,...
tm1066324,super monsters: once upon a rhyme,the super monsters rethink exemplary fantasies...,2021,25,"[animation, family]",[],5.6,38.0,6.300,,
tm1097142,my bride,the story follows a young man and woman who go...,2021,93,"[romance, comedy, drama]",[EG],5.0,327.0,5.300,"[ahmed hatem, jamila awad, mahmoud al-bezzawy,...",[mohamed bakir]
tm1014599,fine wine,a beautiful love story that can happen between...,2021,100,"[romance, drama]",[NG],6.8,45.0,,"[richard mofe-damijo, ego nwosu, keppy ekpenyo...",[seyi babatope]
tm898842,c/o kaadhal,a heart warming film that explores the concept...,2021,134,[drama],[],7.7,348.0,,,


In [175]:
#Redondeamos tmdb_score a un dígito.

total_df["tmdb_score"] = total_df["tmdb_score"].round(1)

In [176]:
#Calculamos la media de los dos tipos de valoraciones incluidos en el dataset y creamos una nueva columna con ese valor.

total_df["score"] = total_df[['imdb_score', 'tmdb_score']].mean(axis = 1).round(1)

columns_to_drop = ['imdb_score', 'tmdb_score']

total_df = total_df.drop(columns = columns_to_drop, axis = 1)

In [177]:
total_df.isnull().sum()

title                     0
description               5
release_year              0
runtime                   0
genres                    0
production_countries      0
imdb_votes              297
names                    84
directors                84
score                    40
dtype: int64

In [178]:
#Convertirmos los valores de la columna 'production_countries' en una lista.

paises = total_df["production_countries"].apply(lambda x: ','.join(x)).str.get_dummies(sep = ",")

In [179]:
#Sumamos cada uno de los países para conocer cuáles son los principales mercados de producción de películas para Netflix.

pais_total = paises.sum()

In [180]:
#Tomamos tan solo los 15 principales países.

pais_15 = pais_total.sort_values(ascending = False).head(15)

In [181]:
total_df_not_null = total_df.dropna(subset = "directors")

In [182]:
total_df["genres"]

id
tm84618                           [drama, crime]
tm154986     [drama, action, thriller, european]
tm127384               [fantasy, action, comedy]
tm120801                           [war, action]
tm70993                                 [comedy]
                            ...                 
tm1066324                    [animation, family]
tm1097142               [romance, comedy, drama]
tm1014599                       [romance, drama]
tm898842                                 [drama]
tm1059008                               [comedy]
Name: genres, Length: 3710, dtype: object

In [217]:
total_df_not_null

Unnamed: 0_level_0,title,description,release_year,runtime,genres,production_countries,imdb_votes,names,directors,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tm84618,taxi driver,a mentally unstable vietnam war veteran works ...,1976,114,"[drama, crime]",[US],808582.0,"[robert de niro, jodie foster, albert brooks, ...",[martin scorsese],8.2
tm154986,deliverance,intent on seeing the cahulawassee river before...,1972,109,"[drama, action, thriller, european]",[US],107673.0,"[jon voight, burt reynolds, ned beatty, ronny ...",[john boorman],7.5
tm127384,monty python and the holy grail,"king arthur, accompanied by his squire, recrui...",1975,91,"[fantasy, action, comedy]",[GB],534486.0,"[graham chapman, john cleese, eric idle, terry...","[terry jones, terry gilliam]",8.0
tm120801,the dirty dozen,12 american military prisoners in world war ii...,1967,150,"[war, action]","[GB, US]",72662.0,"[lee marvin, ernest borgnine, charles bronson,...",[robert aldrich],7.6
tm70993,life of brian,"brian cohen is an average young jewish man, bu...",1979,94,[comedy],[GB],395024.0,"[graham chapman, john cleese, terry gilliam, e...",[terry jones],7.9
...,...,...,...,...,...,...,...,...,...,...
tm1165179,kongsi raya,jack - a chinese chef-manager who is in-line t...,2022,102,[comedy],[MY],66.0,"[ai leng ong, chew kin-wah, harith iskander, e...",[teddy chan],7.0
tm985215,princess 'daya'reese,reese is a con artist from manila who dreams o...,2021,115,"[comedy, romance]",[PH],50.0,"[maymay entrata, edward barber, snooky serna, ...",[barry gonzalez],7.1
tm1097142,my bride,the story follows a young man and woman who go...,2021,93,"[romance, comedy, drama]",[EG],327.0,"[ahmed hatem, jamila awad, mahmoud al-bezzawy,...",[mohamed bakir],5.2
tm1014599,fine wine,a beautiful love story that can happen between...,2021,100,"[romance, drama]",[NG],45.0,"[richard mofe-damijo, ego nwosu, keppy ekpenyo...",[seyi babatope],6.8


In [183]:
actors = total_df_not_null["names"].apply(lambda x: ','.join(x)).str.get_dummies(sep = ",")

In [216]:
actors

Unnamed: 0_level_0,jr.,the creator,'jeeva' ravi,'weird al' yankovic,21 savage,2mex,50 cent,50-grand,87gongzhu,9m88,...,şebnem sönmez,şebnem türkan bilgeer,şenay bozoklar,şenay gürler,şevket çoruh,şinasi yurtsever,şükrü özyıldız,štepánka fingerhutová,игорь павлов,麥沛東
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tm84618,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tm154986,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tm127384,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tm120801,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tm70993,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tm1165179,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tm985215,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tm1097142,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tm1014599,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [184]:
dire = total_df_not_null["directors"].apply(lambda x: ','.join(x)).str.get_dummies(sep = ",")

In [224]:
dire_sum = dire.sum()

In [223]:
actor_sum = actors.sum()

In [215]:
actor_sum

 jr.                     6
 the creator             1
'jeeva' ravi             1
'weird al' yankovic      2
21 savage                1
                        ..
şinasi yurtsever         3
şükrü özyıldız           1
štepánka fingerhutová    1
игорь павлов             1
麥沛東                      1
Length: 43582, dtype: int64

In [225]:
dire_sum_25 = dire_sum.sort_values(ascending = False).head(25)

In [226]:
actor_sum_30 = actor_sum.sort_values(ascending = False).head(30)

In [189]:
generos_df = total_df["genres"].apply(lambda x: ','.join(x)).str.get_dummies(sep = ",")

In [190]:
generos_sum = generos_df.sum()

In [191]:
generos_sum

action            718
animation         282
comedy           1571
crime             545
documentation     611
drama            1876
european          344
family            351
fantasy           315
history           145
horror            261
music             191
reality             9
romance           698
scifi             204
sport             113
thriller          825
war               103
western            28
dtype: int64

In [192]:
#En este gráfico circular podemos observar los principales mercados de producción de películas para Netflix.

fig = go.Figure(data = [go.Pie(labels = list(pais_15.index), values = list(pais_15.values))])

fig.show()

In [193]:
#En este gráfico circular podemos observar los principales directores de películas para Netflix.
fig = go.Figure(data = [go.Pie(labels = list(dire_sum_25.index), values = list(dire_sum_25.values))])
fig.show()

In [194]:
#En este gráfico circular podemos observar los actores que en mas películas de Netflix aparecen.

fig = go.Figure(data = [go.Pie(labels = list(actor_sum_30.index), values = list(actor_sum_30.values))])
fig.show()

In [195]:
#En este gráfico circular podemos observar los principales generos de películas para Netflix.

fig = go.Figure(data = [go.Pie(labels = list(generos_sum.index), values = list(generos_sum.values))])
fig.show()

In [196]:
genero_df = pd.merge(generos_df, total_df, how = 'left', left_index = True, right_index = True)

In [197]:
genero_df.select_dtypes([float, int, bool])

Unnamed: 0_level_0,action,animation,comedy,crime,documentation,drama,european,family,fantasy,history,...,romance,scifi,sport,thriller,war,western,release_year,runtime,imdb_votes,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tm84618,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,1976,114,808582.0,8.2
tm154986,1,0,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,1972,109,107673.0,7.5
tm127384,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1975,91,534486.0,8.0
tm120801,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1967,150,72662.0,7.6
tm70993,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1979,94,395024.0,7.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tm1066324,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,2021,25,38.0,5.9
tm1097142,0,0,1,0,0,1,0,0,0,0,...,1,0,0,0,0,0,2021,93,327.0,5.2
tm1014599,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,2021,100,45.0,6.8
tm898842,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,2021,134,348.0,7.7


In [198]:
total_df.sample(20)

Unnamed: 0_level_0,title,description,release_year,runtime,genres,production_countries,imdb_votes,names,directors,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tm215054,chris tucker: live,comedian chris tucker performs live.,2015,92,"[comedy, documentation]",[US],1955.0,[chris tucker],[phil joanou],6.2
tm49865,aamir,a doctor of indian origin returning to mumbai ...,2008,96,"[thriller, crime, drama, action]",[IN],11290.0,"[rajeev khandelwal, amar kaushik, vasan bala, ...",[raj kumar gupta],7.2
tm39557,jaane tu... ya jaane na,two best friends being convinced that they are...,2008,146,"[drama, comedy, romance]",[IN],26894.0,"[imran khan, genelia d'souza, manjari fadnnis,...",[abbas tyrewala],7.1
tm949309,the white tiger,an ambitious indian driver uses his wit and cu...,2021,125,"[drama, crime]","[SG, US, IN]",59441.0,"[adarsh gourav, rajkummar rao, priyanka chopra...",[ramin bahrani],7.0
tm350374,saawan,a handicapped 9-year old boy who lives in a va...,2016,90,"[thriller, drama]",[PK],253.0,"[saleem mairaj, tipu sharif, imran aslam, syed...",[farhan alam],7.4
tm373655,sarvam thaala mayam,peter who tries to find his place in the world...,2019,133,"[drama, music]",[IN],1231.0,"[g. v. prakash kumar, nedumudi venu, aparna ba...",[rajiv menon],7.4
tm314939,kalushi : the story of solomon mahlangu,solomon mahlangu is a mamelodi township school...,2017,106,"[thriller, drama]",[ZA],321.0,"[pearl thusi, marcel van heerden, welile nzuza...",[mandla dube],6.4
tm824398,the platform,"a mysterious place, an indescribable prison, a...",2019,95,"[scifi, thriller, horror, european]",[ES],213699.0,"[iván massagué, antonia san juan, zorion eguil...",[galder gaztelu-urrutia],7.0
tm371188,#friendbutmarried,"pining for his high school crush for years, a ...",2018,102,"[romance, comedy, drama]",[ID],714.0,"[adipati dolken, vanesha prescilla, rendi jhon...",[rako prijanto],6.8
tm948681,below zero,"on a lonely road, a prison transport is brutal...",2021,106,"[action, crime, thriller, horror, drama]",[ES],19201.0,"[javier gutiérrez, karra elejalde, luis callej...",[lluís quílez],6.3


In [199]:
list_genero = ['action', 'animation', 'comedy', 'crime', 'documentation', 'drama',
       'european', 'family', 'fantasy', 'history', 'horror', 'music',
       'reality', 'romance', 'scifi', 'sport', 'thriller', 'war', 'western']

In [200]:
result_df = genero_df[list_genero].multiply(genero_df['score'], axis='index').replace(0.0, np.nan).mean().reset_index()

In [201]:
sum_df = genero_df[list_genero].multiply(genero_df['imdb_votes'], axis='index').replace(0.0, np.nan).sum().reset_index()

In [202]:
sum_df

Unnamed: 0,index,0
0,action,34934577.0
1,animation,3491150.0
2,comedy,26005688.0
3,crime,27711750.0
4,documentation,2961478.0
5,drama,61133748.0
6,european,12058039.0
7,family,5035885.0
8,fantasy,11365248.0
9,history,5911336.0


In [203]:
result_df.rename(columns={0: 'score'})

Unnamed: 0,index,score
0,action,6.227197
1,animation,6.626568
2,comedy,6.233547
3,crime,6.321915
4,documentation,6.887894
5,drama,6.381093
6,european,6.343023
7,family,6.356571
8,fantasy,6.327389
9,history,6.892414


In [204]:
su_resul_df = result_df.merge(sum_df, how='left', left_on='index', right_on='index').rename(columns={'0_x': 'score', '0_y': 'votes'})

In [205]:
su_resul_df["prom_relative"] = su_resul_df['score'] * (su_resul_df['votes'] / sum(su_resul_df['votes']))

In [206]:
su_resul_df['prom_relative'].sum()

6.287809382443964

In [207]:
su_resul_df

Unnamed: 0,index,score,votes,prom_relative
0,action,6.227197,34934577.0,0.756128
1,animation,6.626568,3491150.0,0.080409
2,comedy,6.233547,26005688.0,0.563444
3,crime,6.321915,27711750.0,0.608919
4,documentation,6.887894,2961478.0,0.070899
5,drama,6.381093,61133748.0,1.355886
6,european,6.343023,12058039.0,0.26584
7,family,6.356571,5035885.0,0.111262
8,fantasy,6.327389,11365248.0,0.249949
9,history,6.892414,5911336.0,0.141613


In [208]:
fig = px.scatter(su_resul_df, x="score", y="votes", size="prom_relative", color="index",
           hover_name="index", log_x=True, size_max=60)
# Adjust the layout
fig.update_layout(
    height=800,  # Set the height of the graph
    width=1400   # Set the width of the graph
)
fig.show()

In [209]:
total_df

Unnamed: 0_level_0,title,description,release_year,runtime,genres,production_countries,imdb_votes,names,directors,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tm84618,taxi driver,a mentally unstable vietnam war veteran works ...,1976,114,"[drama, crime]",[US],808582.0,"[robert de niro, jodie foster, albert brooks, ...",[martin scorsese],8.2
tm154986,deliverance,intent on seeing the cahulawassee river before...,1972,109,"[drama, action, thriller, european]",[US],107673.0,"[jon voight, burt reynolds, ned beatty, ronny ...",[john boorman],7.5
tm127384,monty python and the holy grail,"king arthur, accompanied by his squire, recrui...",1975,91,"[fantasy, action, comedy]",[GB],534486.0,"[graham chapman, john cleese, eric idle, terry...","[terry jones, terry gilliam]",8.0
tm120801,the dirty dozen,12 american military prisoners in world war ii...,1967,150,"[war, action]","[GB, US]",72662.0,"[lee marvin, ernest borgnine, charles bronson,...",[robert aldrich],7.6
tm70993,life of brian,"brian cohen is an average young jewish man, bu...",1979,94,[comedy],[GB],395024.0,"[graham chapman, john cleese, terry gilliam, e...",[terry jones],7.9
...,...,...,...,...,...,...,...,...,...,...
tm1066324,super monsters: once upon a rhyme,the super monsters rethink exemplary fantasies...,2021,25,"[animation, family]",[],38.0,,,5.9
tm1097142,my bride,the story follows a young man and woman who go...,2021,93,"[romance, comedy, drama]",[EG],327.0,"[ahmed hatem, jamila awad, mahmoud al-bezzawy,...",[mohamed bakir],5.2
tm1014599,fine wine,a beautiful love story that can happen between...,2021,100,"[romance, drama]",[NG],45.0,"[richard mofe-damijo, ego nwosu, keppy ekpenyo...",[seyi babatope],6.8
tm898842,c/o kaadhal,a heart warming film that explores the concept...,2021,134,[drama],[],348.0,,,7.7


In [210]:
fig = px.scatter(total_df, x="score", y="runtime", color="release_year",
           hover_name="title", log_x=True)
# Adjust the layout
fig.update_layout(
    height=800,  # Set the height of the graph
    width=1800   # Set the width of the graph
)
fig.show()


In [211]:
total_df

Unnamed: 0_level_0,title,description,release_year,runtime,genres,production_countries,imdb_votes,names,directors,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tm84618,taxi driver,a mentally unstable vietnam war veteran works ...,1976,114,"[drama, crime]",[US],808582.0,"[robert de niro, jodie foster, albert brooks, ...",[martin scorsese],8.2
tm154986,deliverance,intent on seeing the cahulawassee river before...,1972,109,"[drama, action, thriller, european]",[US],107673.0,"[jon voight, burt reynolds, ned beatty, ronny ...",[john boorman],7.5
tm127384,monty python and the holy grail,"king arthur, accompanied by his squire, recrui...",1975,91,"[fantasy, action, comedy]",[GB],534486.0,"[graham chapman, john cleese, eric idle, terry...","[terry jones, terry gilliam]",8.0
tm120801,the dirty dozen,12 american military prisoners in world war ii...,1967,150,"[war, action]","[GB, US]",72662.0,"[lee marvin, ernest borgnine, charles bronson,...",[robert aldrich],7.6
tm70993,life of brian,"brian cohen is an average young jewish man, bu...",1979,94,[comedy],[GB],395024.0,"[graham chapman, john cleese, terry gilliam, e...",[terry jones],7.9
...,...,...,...,...,...,...,...,...,...,...
tm1066324,super monsters: once upon a rhyme,the super monsters rethink exemplary fantasies...,2021,25,"[animation, family]",[],38.0,,,5.9
tm1097142,my bride,the story follows a young man and woman who go...,2021,93,"[romance, comedy, drama]",[EG],327.0,"[ahmed hatem, jamila awad, mahmoud al-bezzawy,...",[mohamed bakir],5.2
tm1014599,fine wine,a beautiful love story that can happen between...,2021,100,"[romance, drama]",[NG],45.0,"[richard mofe-damijo, ego nwosu, keppy ekpenyo...",[seyi babatope],6.8
tm898842,c/o kaadhal,a heart warming film that explores the concept...,2021,134,[drama],[],348.0,,,7.7


In [212]:
actor_sum

kareena kapoor khan      25
boman irani              25
shah rukh khan           22
amitabh bachchan         20
paresh rawal             20
priyanka chopra jonas    19
nassar                   18
ajay devgn               18
om puri                  18
nawazuddin siddiqui      18
aamir khan               18
anupam kher              18
murali sharma            17
johnny lever             16
naseeruddin shah         16
fred armisen             16
fred tatasciore          16
rani mukerji             16
akshay kumar             15
snoop dogg               15
john abraham             15
rajkummar rao            14
sharat saxena            14
bayoumi fouad            14
sanjay dutt              14
abhishek bachchan        14
anil kapoor              13
rishi kapoor             13
shahid kapoor            13
sanjay mishra            13
dtype: int64

In [213]:
dire_sum

raúl campos            20
jan suter              19
ryan polito            15
jay karas              14
marcus raboy           14
jay chapman            12
cathy garcia-molina    12
youssef chahine        11
justin g. dyck          8
kunle afolayan          8
troy miller             7
anurag kashyap          7
lance bangs             7
shannon hartman         7
jeff tremaine           6
mani ratnam             6
ashutosh gowariker      6
martin scorsese         6
milan luthria           6
leslie small            6
fernando ayllón         6
zoya akhtar             6
karan johar             6
mae cruz-alviar         6
michael simon           5
dtype: int64

In [219]:
actors

Unnamed: 0_level_0,jr.,the creator,'jeeva' ravi,'weird al' yankovic,21 savage,2mex,50 cent,50-grand,87gongzhu,9m88,...,şebnem sönmez,şebnem türkan bilgeer,şenay bozoklar,şenay gürler,şevket çoruh,şinasi yurtsever,şükrü özyıldız,štepánka fingerhutová,игорь павлов,麥沛東
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tm84618,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tm154986,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tm127384,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tm120801,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tm70993,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tm1165179,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tm985215,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tm1097142,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tm1014599,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [218]:
total_df_not_null

Unnamed: 0_level_0,title,description,release_year,runtime,genres,production_countries,imdb_votes,names,directors,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tm84618,taxi driver,a mentally unstable vietnam war veteran works ...,1976,114,"[drama, crime]",[US],808582.0,"[robert de niro, jodie foster, albert brooks, ...",[martin scorsese],8.2
tm154986,deliverance,intent on seeing the cahulawassee river before...,1972,109,"[drama, action, thriller, european]",[US],107673.0,"[jon voight, burt reynolds, ned beatty, ronny ...",[john boorman],7.5
tm127384,monty python and the holy grail,"king arthur, accompanied by his squire, recrui...",1975,91,"[fantasy, action, comedy]",[GB],534486.0,"[graham chapman, john cleese, eric idle, terry...","[terry jones, terry gilliam]",8.0
tm120801,the dirty dozen,12 american military prisoners in world war ii...,1967,150,"[war, action]","[GB, US]",72662.0,"[lee marvin, ernest borgnine, charles bronson,...",[robert aldrich],7.6
tm70993,life of brian,"brian cohen is an average young jewish man, bu...",1979,94,[comedy],[GB],395024.0,"[graham chapman, john cleese, terry gilliam, e...",[terry jones],7.9
...,...,...,...,...,...,...,...,...,...,...
tm1165179,kongsi raya,jack - a chinese chef-manager who is in-line t...,2022,102,[comedy],[MY],66.0,"[ai leng ong, chew kin-wah, harith iskander, e...",[teddy chan],7.0
tm985215,princess 'daya'reese,reese is a con artist from manila who dreams o...,2021,115,"[comedy, romance]",[PH],50.0,"[maymay entrata, edward barber, snooky serna, ...",[barry gonzalez],7.1
tm1097142,my bride,the story follows a young man and woman who go...,2021,93,"[romance, comedy, drama]",[EG],327.0,"[ahmed hatem, jamila awad, mahmoud al-bezzawy,...",[mohamed bakir],5.2
tm1014599,fine wine,a beautiful love story that can happen between...,2021,100,"[romance, drama]",[NG],45.0,"[richard mofe-damijo, ego nwosu, keppy ekpenyo...",[seyi babatope],6.8


In [242]:
actors_score = actors.merge(total_df_not_null[['score', 'imdb_votes']], how="left", left_index=True, right_index=True)

In [243]:
actors_score

Unnamed: 0_level_0,jr.,the creator,'jeeva' ravi,'weird al' yankovic,21 savage,2mex,50 cent,50-grand,87gongzhu,9m88,...,şenay bozoklar,şenay gürler,şevket çoruh,şinasi yurtsever,şükrü özyıldız,štepánka fingerhutová,игорь павлов,麥沛東,score,imdb_votes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tm84618,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8.2,808582.0
tm154986,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7.5,107673.0
tm127384,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8.0,534486.0
tm120801,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7.6,72662.0
tm70993,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7.9,395024.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tm1165179,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7.0,66.0
tm985215,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7.1,50.0
tm1097142,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5.2,327.0
tm1014599,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6.8,45.0


In [257]:
actor_prome = actors_score[list(actor_sum.index)].multiply(actors_score['score'], axis='index').replace(0, np.nan).mean().reset_index()

In [258]:
actor_prome = actor_prome.sort_values(by=0, ascending=False).rename(columns={0:"score"})


In [240]:
actor_prome_30 = actor_prome.head(30)

In [259]:
actor_vote = actors_score[list(actor_sum.index)].multiply(actors_score['imdb_votes'], axis='index').replace(0, np.nan).sum().reset_index()

In [260]:
ac_resul_df = actor_prome.merge(actor_vote, how='left', left_on='index', right_on='index').rename(columns={0: 'votes'})

ac_resul_df["prom_relative"] = ac_resul_df['score'] * (ac_resul_df['votes'] / sum(ac_resul_df['votes']))

ac_resul_df['prom_relative'].sum()

ac_resul_df = ac_resul_df.sort_values(by='prom_relative', ascending=False)

In [283]:
fig = px.histogram(ac_resul_df.head(50), x="index", y="prom_relative", color="prom_relative", marginal="rug", # can be `box`, `violin`
                         hover_data="prom_relative")
fig.show()

In [265]:
director_score = dire.merge(total_df_not_null[['score', 'imdb_votes']], how="left", left_index=True, right_index=True)

director_prome = director_score[list(dire_sum.index)].multiply(director_score['score'], axis='index').replace(0, np.nan).mean().reset_index()
director_prome = director_prome.sort_values(by=0, ascending=False).rename(columns={0:"score"})

director_vote = director_score[list(dire_sum.index)].multiply(director_score['imdb_votes'], axis='index').replace(0, np.nan).sum().reset_index()

dir_resul_df = director_prome.merge(director_vote, how='left', left_on='index', right_on='index').rename(columns={0: 'votes'})

dir_resul_df["prom_relative"] = dir_resul_df['score'] * (dir_resul_df['votes'] / sum(dir_resul_df['votes']))

dir_resul_df['prom_relative'].sum()



In [275]:
dir_resul_df = dir_resul_df.sort_values(by='prom_relative', ascending=False)

In [266]:
dir_resul_df

Unnamed: 0,index,score,votes,prom_relative
56,christopher nolan,8.10,4582943.0,0.390767
66,martin scorsese,8.00,3619616.0,0.304818
11,robert zemeckis,8.60,2021343.0,0.182990
170,guy ritchie,7.60,2242493.0,0.179404
59,quentin tarantino,8.05,2042806.0,0.173106
...,...,...,...,...
2903,saron sakina,,0.0,
2904,shyamal chaulia,,0.0,
2905,tarzan nasser,,0.0,
2906,tolulope itegboje,,0.0,


In [280]:
fig = px.histogram(dir_resul_df.head(50), x="index", y="prom_relative", color="prom_relative", marginal="rug", # can be `box`, `violin`
                         hover_data="score")
fig.show()