In [1]:
# !pip install fuzzywuzzy

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from collections import Counter
from fuzzywuzzy import process, fuzz
import pickle



# Cargamos los datos

In [3]:
# cargamos el csv donde tenemos todos los títulos que tenemos en Netflix
df_titulos = pd.read_csv("data/netflix_titles.csv", index_col = 0 )
df_titulos.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."


In [4]:
# cargamos el csv donde tenemos todas las producciones de Netflix
df_original = pd.read_csv("data/netflix_originals.csv", index_col = 0)
df_original.head(2)

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,Enter the Anime,Documentary,"August 5, 2019",58,2.5,English/Japanese
1,Dark Forces,Thriller,"August 21, 2020",81,2.6,Spanish


In [5]:
df_titulos.shape

(8807, 12)

In [6]:
df_original.shape

(513, 6)

## Juntamos los dos dataframes

In [7]:
df = df_titulos.merge(df_original, left_on = "title", right_on = "Title",how = "inner")
df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",Dick Johnson Is Dead,Documentary,"October 2, 2020",90,7.5,English
1,s142,Movie,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada","September 1, 2021",2015,R,82 min,Action & Adventure,"When a retired CIA agent is kidnapped, his son...",Extraction,Action,"April 24, 2020",117,6.7,English


In [8]:
df.shape

(513, 18)

In [9]:
df.drop(["release_year", "Title", "Runtime", "show_id", "description" ], axis = 1, inplace = True)
df.head(1)

Unnamed: 0,type,title,director,cast,country,date_added,rating,duration,listed_in,Genre,Premiere,IMDB Score,Language
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",PG-13,90 min,Documentaries,Documentary,"October 2, 2020",7.5,English


Tenemos columnas en mayúsculas y otras en minúsculas, y con espacios. Vamos a intentar homogeneizarlo:

In [10]:
nuevas_columnas = {col: col.strip().replace(" ", "_").lower() for col in df.columns}
nuevas_columnas

{'type': 'type',
 'title': 'title',
 'director': 'director',
 'cast': 'cast',
 'country': 'country',
 'date_added': 'date_added',
 'rating': 'rating',
 'duration': 'duration',
 'listed_in': 'listed_in',
 'Genre': 'genre',
 'Premiere': 'premiere',
 'IMDB Score': 'imdb_score',
 'Language': 'language'}

In [11]:
df.rename(columns = nuevas_columnas, inplace = True)
df.head(1)

Unnamed: 0,type,title,director,cast,country,date_added,rating,duration,listed_in,genre,premiere,imdb_score,language
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",PG-13,90 min,Documentaries,Documentary,"October 2, 2020",7.5,English


# Análisis exploratorio

In [12]:
df.shape

(513, 13)

In [13]:
df.dtypes

type           object
title          object
director       object
cast           object
country        object
date_added     object
rating         object
duration       object
listed_in      object
genre          object
premiere       object
imdb_score    float64
language       object
dtype: object

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 513 entries, 0 to 512
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   type        513 non-null    object 
 1   title       513 non-null    object 
 2   director    491 non-null    object 
 3   cast        422 non-null    object 
 4   country     499 non-null    object 
 5   date_added  513 non-null    object 
 6   rating      513 non-null    object 
 7   duration    276 non-null    object 
 8   listed_in   513 non-null    object 
 9   genre       513 non-null    object 
 10  premiere    513 non-null    object 
 11  imdb_score  513 non-null    float64
 12  language    513 non-null    object 
dtypes: float64(1), object(12)
memory usage: 56.1+ KB


In [15]:
df.duplicated().sum()

0

In [16]:
df.isnull().sum()

type            0
title           0
director       22
cast           91
country        14
date_added      0
rating          0
duration      237
listed_in       0
genre           0
premiere        0
imdb_score      0
language        0
dtype: int64

In [17]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
imdb_score,513.0,6.210916,0.96885,2.5,5.6,6.3,6.9,9.0


In [18]:
df.describe(include = "object").T

Unnamed: 0,count,unique,top,freq
type,513,1,Movie,513
title,513,513,Dick Johnson Is Dead,1
director,491,462,McG,3
cast,422,421,Shawn Mendes,2
country,499,81,United States,274
date_added,513,354,"October 18, 2019",5
rating,513,10,TV-MA,254
duration,276,86,98 min,20
listed_in,513,103,Documentaries,69
genre,513,106,Documentary,132


In [19]:
#df.describe(include = "all").T

In [20]:
df.drop(["type"], axis = 1, inplace = True)
df.head(1)

Unnamed: 0,title,director,cast,country,date_added,rating,duration,listed_in,genre,premiere,imdb_score,language
0,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",PG-13,90 min,Documentaries,Documentary,"October 2, 2020",7.5,English


In [21]:
df_cat = df.select_dtypes(include = "object")
df_cat.head(1)

Unnamed: 0,title,director,cast,country,date_added,rating,duration,listed_in,genre,premiere,language
0,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",PG-13,90 min,Documentaries,Documentary,"October 2, 2020",English


In [22]:
df_num = df.select_dtypes(include = np.number)
df_num.head(2)

Unnamed: 0,imdb_score
0,7.5
1,6.7


In [23]:
for col in df_cat.columns[1:]:
    print(f"La cantidad de valores unicos para la columa {col.upper()} es {len(df[col].unique())}")
    display(pd.DataFrame(df_cat[col].value_counts()))
    print("------")

La cantidad de valores unicos para la columa DIRECTOR es 463


Unnamed: 0,director
McG,3
Amy Poehler,2
Christopher Guest,2
Noah Baumbach,2
Michael Tiddes,2
...,...
Gina Prince-Bythewood,1
Sue Ding,1
Stéphane de Freitas,1
Stefano Mordini,1


------
La cantidad de valores unicos para la columa CAST es 422


Unnamed: 0,cast
Shawn Mendes,2
"Bruce Willis, Kellan Lutz, Gina Carano, D.B. Sweeney, Joshua Mikel, Steve Coulter, Dan Bilzerian, Heather Johansen",1
Loudon Wainwright III,1
"Liam Neeson, James Franco, Tim Blake Nelson, Tom Waits, Zoe Kazan, Brendan Gleeson",1
"Madeline Brewer, Patch Darragh, Melora Walters, Devin Druid, Imani Hakim, Michael Dempsey, Flora Diaz, Samantha Robinson, Jessica Parker Kennedy, Quei Tann",1
...,...
"Marta Etura, Leonardo Sbaraglia, Carlos Librado ""Nene"", Francesc Orella, Imanol Arias, Álvaro Cervantes, Itziar Aizpuru, Benn Northover, Marta Larralde, Alicia Sánchez, Eduardo Rosa, Angel Alkain, Ana Wagener, Paco Tous, Patricia López Arnaiz, Pedro Casablanc",1
"Fulu Mugovhani, Tumi Morake, Bohang Moeko, Yonda Thomas",1
"Nawazuddin Siddiqui, Radhika Apte, Khalid Tyabji, Aditya Srivastava, Padmavati Rao, Shivani Raghuvanshi, Nishant Dahiya, Shweta Tripathi, Gyanendra Tripathi, Shreedhar Dubey, Swanand Kirkire, Riya Shukla, Tigmanshu Dhulia, Ila Arun, Natasha Rastogi",1
Nicolas Anelka,1


------
La cantidad de valores unicos para la columa COUNTRY es 82


Unnamed: 0,country
United States,274
India,35
Italy,13
United Kingdom,13
Spain,12
...,...
"Philippines, United States",1
"United Kingdom, Japan, United States",1
"Spain, United Kingdom",1
"United Kingdom, Hungary, Australia",1


------
La cantidad de valores unicos para la columa DATE_ADDED es 354


Unnamed: 0,date_added
"October 18, 2019",5
"November 1, 2019",5
"October 30, 2020",4
"April 10, 2020",4
"February 5, 2021",4
...,...
"February 12, 2020",1
"February 14, 2020",1
"March 8, 2020",1
"March 13, 2020",1


------
La cantidad de valores unicos para la columa RATING es 10


Unnamed: 0,rating
TV-MA,254
TV-14,94
TV-PG,57
R,47
PG-13,23
TV-G,16
PG,11
TV-Y7,5
TV-Y,5
G,1


------
La cantidad de valores unicos para la columa DURATION es 87


Unnamed: 0,duration
98 min,20
90 min,13
107 min,12
113 min,12
104 min,11
...,...
111 min,1
148 min,1
145 min,1
129 min,1


------
La cantidad de valores unicos para la columa LISTED_IN es 103


Unnamed: 0,listed_in
Documentaries,69
Dramas,22
"Dramas, International Movies",21
"Documentaries, Music & Musicals",20
Comedies,19
...,...
"Independent Movies, International Movies, Thrillers",1
"Comedies, Dramas, LGBTQ Movies",1
Horror Movies,1
"Children & Family Movies, Comedies, Sci-Fi & Fantasy",1


------
La cantidad de valores unicos para la columa GENRE es 106


Unnamed: 0,genre
Documentary,132
Drama,73
Comedy,42
Romantic comedy,35
Thriller,33
...,...
Superhero/Action,1
Dance comedy,1
Animation/Superhero,1
Drama/Horror,1


------
La cantidad de valores unicos para la columa PREMIERE es 350


Unnamed: 0,premiere
"October 2, 2020",5
"November 1, 2019",5
"October 18, 2019",5
"April 10, 2020",4
"December 7, 2018",4
...,...
"January 17, 2020",1
"January 1, 2020",1
"December 26, 2019",1
"December 20, 2019",1


------
La cantidad de valores unicos para la columa LANGUAGE es 37


Unnamed: 0,language
English,352
Hindi,28
Spanish,26
French,18
Italian,14
Indonesian,9
Portuguese,9
English/Spanish,5
Japanese,5
Korean,5


------


In [24]:
df.drop(["listed_in"], axis = 1, inplace = True)
df.head(1)

Unnamed: 0,title,director,cast,country,date_added,rating,duration,genre,premiere,imdb_score,language
0,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",PG-13,90 min,Documentary,"October 2, 2020",7.5,English


In [25]:
df.isnull().sum()[df.isnull().sum() > 0]

director     22
cast         91
country      14
duration    237
dtype: int64

In [26]:
df[["country", "cast", "director"]] = df[["country", "cast", "director"]].fillna("Desconocido")

In [27]:
df.isnull().sum()[df.isnull().sum() > 0]

duration    237
dtype: int64

In [28]:
df.head()

Unnamed: 0,title,director,cast,country,date_added,rating,duration,genre,premiere,imdb_score,language
0,Dick Johnson Is Dead,Kirsten Johnson,Desconocido,United States,"September 25, 2021",PG-13,90 min,Documentary,"October 2, 2020",7.5,English
1,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada","September 1, 2021",R,82 min,Action,"April 24, 2020",6.7,English
2,Prime Time,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",Desconocido,"June 30, 2021",TV-MA,,Thriller,"April 14, 2021",5.7,Polish
3,Blue Miracle,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,"May 27, 2021",TV-PG,,Drama,"May 27, 2021",6.7,English
4,Ghost Lab,Paween Purijitpanya,"Thanapob Leeratanakachorn, Paris Intarakomalya...",Thailand,"May 27, 2021",TV-MA,118 min,Horror,"May 26, 2021",5.2,Thai


In [29]:
df["duration"] = df["duration"].str.split(" ", expand = True).get(0)

In [30]:
df.head()

Unnamed: 0,title,director,cast,country,date_added,rating,duration,genre,premiere,imdb_score,language
0,Dick Johnson Is Dead,Kirsten Johnson,Desconocido,United States,"September 25, 2021",PG-13,90.0,Documentary,"October 2, 2020",7.5,English
1,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada","September 1, 2021",R,82.0,Action,"April 24, 2020",6.7,English
2,Prime Time,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",Desconocido,"June 30, 2021",TV-MA,,Thriller,"April 14, 2021",5.7,Polish
3,Blue Miracle,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,"May 27, 2021",TV-PG,,Drama,"May 27, 2021",6.7,English
4,Ghost Lab,Paween Purijitpanya,"Thanapob Leeratanakachorn, Paris Intarakomalya...",Thailand,"May 27, 2021",TV-MA,118.0,Horror,"May 26, 2021",5.2,Thai


In [31]:
df.dtypes

title          object
director       object
cast           object
country        object
date_added     object
rating         object
duration       object
genre          object
premiere       object
imdb_score    float64
language       object
dtype: object

In [32]:
df["duration2"] = pd.to_numeric(df["duration"])

In [33]:
df.dtypes

title          object
director       object
cast           object
country        object
date_added     object
rating         object
duration       object
genre          object
premiere       object
imdb_score    float64
language       object
duration2     float64
dtype: object

In [34]:
df["duration3"] = df["duration"].astype(float)

In [35]:
df.dtypes

title          object
director       object
cast           object
country        object
date_added     object
rating         object
duration       object
genre          object
premiere       object
imdb_score    float64
language       object
duration2     float64
duration3     float64
dtype: object

In [36]:
df.drop(["duration","duration2" ], axis = 1, inplace = True)
df.head()

Unnamed: 0,title,director,cast,country,date_added,rating,genre,premiere,imdb_score,language,duration3
0,Dick Johnson Is Dead,Kirsten Johnson,Desconocido,United States,"September 25, 2021",PG-13,Documentary,"October 2, 2020",7.5,English,90.0
1,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada","September 1, 2021",R,Action,"April 24, 2020",6.7,English,82.0
2,Prime Time,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",Desconocido,"June 30, 2021",TV-MA,Thriller,"April 14, 2021",5.7,Polish,
3,Blue Miracle,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,"May 27, 2021",TV-PG,Drama,"May 27, 2021",6.7,English,
4,Ghost Lab,Paween Purijitpanya,"Thanapob Leeratanakachorn, Paris Intarakomalya...",Thailand,"May 27, 2021",TV-MA,Horror,"May 26, 2021",5.2,Thai,118.0


In [37]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
imdb_score,513.0,6.210916,0.96885,2.5,5.6,6.3,6.9,9.0
duration3,276.0,93.807971,34.883954,5.0,79.75,101.0,119.0,209.0


In [38]:
df["duration3"] = df["duration3"].fillna(df["duration3"].median())

In [39]:
df.head(10)

Unnamed: 0,title,director,cast,country,date_added,rating,genre,premiere,imdb_score,language,duration3
0,Dick Johnson Is Dead,Kirsten Johnson,Desconocido,United States,"September 25, 2021",PG-13,Documentary,"October 2, 2020",7.5,English,90.0
1,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada","September 1, 2021",R,Action,"April 24, 2020",6.7,English,82.0
2,Prime Time,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",Desconocido,"June 30, 2021",TV-MA,Thriller,"April 14, 2021",5.7,Polish,101.0
3,Blue Miracle,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,"May 27, 2021",TV-PG,Drama,"May 27, 2021",6.7,English,101.0
4,Ghost Lab,Paween Purijitpanya,"Thanapob Leeratanakachorn, Paris Intarakomalya...",Thailand,"May 27, 2021",TV-MA,Horror,"May 26, 2021",5.2,Thai,118.0
5,Baggio: The Divine Ponytail,Letizia Lamartire,"Andrea Arcangeli, Valentina Bellè, Andrea Penn...",Italy,"May 26, 2021",TV-MA,Biopic,"May 26, 2021",6.2,Italian,101.0
6,Nail Bomber: Manhunt,Daniel Vernon,Desconocido,Desconocido,"May 26, 2021",TV-MA,Documentary,"May 26, 2021",6.3,English,73.0
7,Army of the Dead,Zack Snyder,"Dave Bautista, Ella Purnell, Omari Hardwick, G...",United States,"May 21, 2021",R,Zombie/Heist,"May 21, 2021",5.9,English,148.0
8,Sardar Ka Grandson,Kaashvie Nair,"Arjun Kapoor, Neena Gupta, Rakul Preet Singh, ...",India,"May 18, 2021",TV-14,Comedy,"May 18, 2021",4.1,Hindi,140.0
9,Ferry,Cecilia Verheyden,"Frank Lammers, Elise Schaap, Huub Stapel, Moni...","Belgium, Netherlands","May 14, 2021",TV-MA,Crime drama,"May 14, 2021",7.1,Dutch,107.0


In [40]:
df["premiere"] = pd.to_datetime(df["premiere"])
df["date_added"] = pd.to_datetime(df["date_added"])

In [41]:
df.dtypes

title                 object
director              object
cast                  object
country               object
date_added    datetime64[ns]
rating                object
genre                 object
premiere      datetime64[ns]
imdb_score           float64
language              object
duration3            float64
dtype: object

In [42]:
df.head()

Unnamed: 0,title,director,cast,country,date_added,rating,genre,premiere,imdb_score,language,duration3
0,Dick Johnson Is Dead,Kirsten Johnson,Desconocido,United States,2021-09-25,PG-13,Documentary,2020-10-02,7.5,English,90.0
1,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada",2021-09-01,R,Action,2020-04-24,6.7,English,82.0
2,Prime Time,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",Desconocido,2021-06-30,TV-MA,Thriller,2021-04-14,5.7,Polish,101.0
3,Blue Miracle,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,2021-05-27,TV-PG,Drama,2021-05-27,6.7,English,101.0
4,Ghost Lab,Paween Purijitpanya,"Thanapob Leeratanakachorn, Paris Intarakomalya...",Thailand,2021-05-27,TV-MA,Horror,2021-05-26,5.2,Thai,118.0


In [43]:
df["cast"] = df["cast"].str.split(",")

In [44]:
df.head()

Unnamed: 0,title,director,cast,country,date_added,rating,genre,premiere,imdb_score,language,duration3
0,Dick Johnson Is Dead,Kirsten Johnson,[Desconocido],United States,2021-09-25,PG-13,Documentary,2020-10-02,7.5,English,90.0
1,Extraction,Steven C. Miller,"[Bruce Willis, Kellan Lutz, Gina Carano, D....","United States, United Kingdom, Canada",2021-09-01,R,Action,2020-04-24,6.7,English,82.0
2,Prime Time,Jakub Piątek,"[Bartosz Bielenia, Magdalena Popławska, Andr...",Desconocido,2021-06-30,TV-MA,Thriller,2021-04-14,5.7,Polish,101.0
3,Blue Miracle,Julio Quintana,"[Jimmy Gonzales, Dennis Quaid, Anthony Gonza...",United States,2021-05-27,TV-PG,Drama,2021-05-27,6.7,English,101.0
4,Ghost Lab,Paween Purijitpanya,"[Thanapob Leeratanakachorn, Paris Intarakomal...",Thailand,2021-05-27,TV-MA,Horror,2021-05-26,5.2,Thai,118.0


In [45]:
df["cast"][1]

['Bruce Willis',
 ' Kellan Lutz',
 ' Gina Carano',
 ' D.B. Sweeney',
 ' Joshua Mikel',
 ' Steve Coulter',
 ' Dan Bilzerian',
 ' Heather Johansen']

In [46]:
df.shape

(513, 11)

In [47]:
df_actor = df.explode("cast")
df_actor.head(1)

Unnamed: 0,title,director,cast,country,date_added,rating,genre,premiere,imdb_score,language,duration3
0,Dick Johnson Is Dead,Kirsten Johnson,Desconocido,United States,2021-09-25,PG-13,Documentary,2020-10-02,7.5,English,90.0


In [48]:
df_actor = df_actor[["title", "cast"]]
df_actor.head()

Unnamed: 0,title,cast
0,Dick Johnson Is Dead,Desconocido
1,Extraction,Bruce Willis
1,Extraction,Kellan Lutz
1,Extraction,Gina Carano
1,Extraction,D.B. Sweeney


In [49]:
df["director"][16]

'Robert Pulcini, Shari Springer Berman'

In [50]:
df["director"].unique().tolist()[:10]

['Kirsten Johnson',
 'Steven C. Miller',
 'Jakub Piątek',
 'Julio Quintana',
 'Paween Purijitpanya',
 'Letizia Lamartire',
 'Daniel Vernon',
 'Zack Snyder',
 'Kaashvie Nair',
 'Cecilia Verheyden']

In [51]:
df["director"] = df["director"].str.split(",")
df_director = df.explode("director")
#df_director.head(20)

In [52]:
df.head(1)

Unnamed: 0,title,director,cast,country,date_added,rating,genre,premiere,imdb_score,language,duration3
0,Dick Johnson Is Dead,[Kirsten Johnson],[Desconocido],United States,2021-09-25,PG-13,Documentary,2020-10-02,7.5,English,90.0


In [53]:
df_actor.head(1)

Unnamed: 0,title,cast
0,Dick Johnson Is Dead,Desconocido


In [54]:
df_director.head(1)

Unnamed: 0,title,director,cast,country,date_added,rating,genre,premiere,imdb_score,language,duration3
0,Dick Johnson Is Dead,Kirsten Johnson,[Desconocido],United States,2021-09-25,PG-13,Documentary,2020-10-02,7.5,English,90.0


In [55]:
df_actor["cast"].value_counts()

Desconocido           91
Adam Sandler           7
 Maya Rudolph          6
 Andrew Bachelor       5
 Lakeith Stanfield     4
                      ..
 Cree Cicchino         1
 Lucas Jaye            1
 Karla Souza           1
 Enuka Okuma           1
David Sampliner        1
Name: cast, Length: 3005, dtype: int64

In [56]:
df_groupby_actor = df_actor.groupby("cast").count()
df_groupby_actor

Unnamed: 0_level_0,title
cast,Unnamed: 1_level_1
Aakshath Das,1
Aaron Abrams,1
Aaron Douglas,1
Aaron Guy,1
Aaron Stanford,1
...,...
Zion Clark,1
Zoey Deutch,1
Álvaro Cervantes,1
Çağatay Ulusoy,1


In [57]:
df_groupby_actor = df_groupby_actor.sort_values(by = "title", ascending = False)

In [58]:
df_groupby_actor.head(15)

Unnamed: 0_level_0,title
cast,Unnamed: 1_level_1
Desconocido,91
Adam Sandler,7
Maya Rudolph,6
Andrew Bachelor,5
Keegan-Michael Key,4
Robbie Amell,4
Jacki Weaver,4
Rob Schneider,4
Nick Swardson,4
Lakeith Stanfield,4


In [59]:
top_10_actor = df_groupby_actor.iloc[1:11, :]
top_10_actor

Unnamed: 0_level_0,title
cast,Unnamed: 1_level_1
Adam Sandler,7
Maya Rudolph,6
Andrew Bachelor,5
Keegan-Michael Key,4
Robbie Amell,4
Jacki Weaver,4
Rob Schneider,4
Nick Swardson,4
Lakeith Stanfield,4
Zachary Quinto,4


In [60]:
top_10_actor = top_10_actor.index.tolist()
top_10_actor

['Adam Sandler',
 ' Maya Rudolph',
 ' Andrew Bachelor',
 ' Keegan-Michael Key',
 ' Robbie Amell',
 ' Jacki Weaver',
 ' Rob Schneider',
 ' Nick Swardson',
 ' Lakeith Stanfield',
 ' Zachary Quinto']

In [61]:
top_10_actor = [actor.strip() for actor in top_10_actor]
top_10_actor

['Adam Sandler',
 'Maya Rudolph',
 'Andrew Bachelor',
 'Keegan-Michael Key',
 'Robbie Amell',
 'Jacki Weaver',
 'Rob Schneider',
 'Nick Swardson',
 'Lakeith Stanfield',
 'Zachary Quinto']

In [62]:
with open("data/top_ten_actor.pkl", "wb") as actor: 
    pickle.dump(top_10_actor, actor)

In [63]:
df_director.head()

Unnamed: 0,title,director,cast,country,date_added,rating,genre,premiere,imdb_score,language,duration3
0,Dick Johnson Is Dead,Kirsten Johnson,[Desconocido],United States,2021-09-25,PG-13,Documentary,2020-10-02,7.5,English,90.0
1,Extraction,Steven C. Miller,"[Bruce Willis, Kellan Lutz, Gina Carano, D....","United States, United Kingdom, Canada",2021-09-01,R,Action,2020-04-24,6.7,English,82.0
2,Prime Time,Jakub Piątek,"[Bartosz Bielenia, Magdalena Popławska, Andr...",Desconocido,2021-06-30,TV-MA,Thriller,2021-04-14,5.7,Polish,101.0
3,Blue Miracle,Julio Quintana,"[Jimmy Gonzales, Dennis Quaid, Anthony Gonza...",United States,2021-05-27,TV-PG,Drama,2021-05-27,6.7,English,101.0
4,Ghost Lab,Paween Purijitpanya,"[Thanapob Leeratanakachorn, Paris Intarakomal...",Thailand,2021-05-27,TV-MA,Horror,2021-05-26,5.2,Thai,118.0


In [64]:
df["genre"].unique()

array(['Documentary', 'Action', 'Thriller', 'Drama', 'Horror', 'Biopic',
       'Zombie/Heist', 'Comedy', 'Crime drama', 'Psychological thriller',
       'Science fiction thriller', 'Animated musical comedy',
       'Psychological thriller drama', 'Superhero-Comedy',
       'Romantic comedy', 'Christian musical',
       'Hidden-camera prank comedy', 'Comedy-drama',
       'Romantic teen drama', 'Romantic drama', 'Science fiction',
       'Action/Science fiction', 'Animation / Short', 'Superhero',
       'Aftershow / Interview', 'Musical', 'Animation', 'Concert Film',
       'Christmas comedy', 'Stop Motion', 'Family/Christmas musical',
       'Anthology/Dark comedy', 'Mystery', 'Romantic comedy/Holiday',
       'Variety show', 'Animation/Musical/Adventure', 'Romantic thriller',
       'Comedy/Fantasy/Family', 'Horror comedy', 'Action comedy',
       'Family', 'Comedy/Horror', 'Drama/Horror', 'Animation/Superhero',
       'Dance comedy', 'Superhero/Action', 'Romantic teenage drama',
   

In [65]:
df["genre"] = df["genre"].str.lower()

In [66]:
df["genre"].unique()[:10]

array(['documentary', 'action', 'thriller', 'drama', 'horror', 'biopic',
       'zombie/heist', 'comedy', 'crime drama', 'psychological thriller'],
      dtype=object)

In [75]:
df["genre2"] = df["genre"].str.split("/")

In [68]:
df.head()

Unnamed: 0,title,director,cast,country,date_added,rating,genre,premiere,imdb_score,language,duration3
0,Dick Johnson Is Dead,[Kirsten Johnson],[Desconocido],United States,2021-09-25,PG-13,documentary,2020-10-02,7.5,English,90.0
1,Extraction,[Steven C. Miller],"[Bruce Willis, Kellan Lutz, Gina Carano, D....","United States, United Kingdom, Canada",2021-09-01,R,action,2020-04-24,6.7,English,82.0
2,Prime Time,[Jakub Piątek],"[Bartosz Bielenia, Magdalena Popławska, Andr...",Desconocido,2021-06-30,TV-MA,thriller,2021-04-14,5.7,Polish,101.0
3,Blue Miracle,[Julio Quintana],"[Jimmy Gonzales, Dennis Quaid, Anthony Gonza...",United States,2021-05-27,TV-PG,drama,2021-05-27,6.7,English,101.0
4,Ghost Lab,[Paween Purijitpanya],"[Thanapob Leeratanakachorn, Paris Intarakomal...",Thailand,2021-05-27,TV-MA,horror,2021-05-26,5.2,Thai,118.0


In [69]:
df["genre"].value_counts()

documentary            132
drama                   73
comedy                  42
romantic comedy         35
thriller                33
                      ... 
superhero/action         1
dance comedy             1
animation/superhero      1
drama/horror             1
mockumentary             1
Name: genre, Length: 106, dtype: int64

In [82]:
cuenta_generos = Counter(elemento for generos in df["genre2"] for elemento in generos)
#cuenta_generos

In [90]:
diccionario_generos = dict(cuenta_generos.most_common(10))
diccionario_generos

{'documentary': 132,
 'drama': 77,
 'comedy': 50,
 'thriller': 37,
 'romantic comedy': 36,
 'comedy-drama': 15,
 'science fiction': 14,
 'horror': 12,
 'action': 11,
 'crime drama': 11}

In [106]:
def sacar_parecido(col, diccionario):
    maximo = 0
    for key in diccionario:
        comparo = fuzz.ratio(key, col)

        if comparo > maximo:
            resultado = key
            maximo = comparo
            
            
    if maximo > 60:
        return resultado
    else:
        return "otro"
       



In [107]:
df["genre3"] = df.apply(lambda x: sacar_parecido(x["genre"],diccionario_generos ), axis = 1)

In [108]:
df.head()

Unnamed: 0,title,director,cast,country,date_added,rating,genre,premiere,imdb_score,language,duration3,genre2,genre3
0,Dick Johnson Is Dead,[Kirsten Johnson],[Desconocido],United States,2021-09-25,PG-13,documentary,2020-10-02,7.5,English,90.0,[documentary],documentary
1,Extraction,[Steven C. Miller],"[Bruce Willis, Kellan Lutz, Gina Carano, D....","United States, United Kingdom, Canada",2021-09-01,R,action,2020-04-24,6.7,English,82.0,[action],action
2,Prime Time,[Jakub Piątek],"[Bartosz Bielenia, Magdalena Popławska, Andr...",Desconocido,2021-06-30,TV-MA,thriller,2021-04-14,5.7,Polish,101.0,[thriller],thriller
3,Blue Miracle,[Julio Quintana],"[Jimmy Gonzales, Dennis Quaid, Anthony Gonza...",United States,2021-05-27,TV-PG,drama,2021-05-27,6.7,English,101.0,[drama],drama
4,Ghost Lab,[Paween Purijitpanya],"[Thanapob Leeratanakachorn, Paris Intarakomal...",Thailand,2021-05-27,TV-MA,horror,2021-05-26,5.2,Thai,118.0,[horror],horror


In [109]:
df["genre3"].value_counts()

documentary        133
otro                83
drama               75
romantic comedy     57
comedy              50
thriller            47
science fiction     17
comedy-drama        17
action              12
crime drama         12
horror              10
Name: genre3, dtype: int64

In [91]:
fuzz.ratio("documetary", "patata")

25

In [88]:
df.head()

Unnamed: 0,title,director,cast,country,date_added,rating,genre,premiere,imdb_score,language,duration3,genre2
0,Dick Johnson Is Dead,[Kirsten Johnson],[Desconocido],United States,2021-09-25,PG-13,documentary,2020-10-02,7.5,English,90.0,[documentary]
1,Extraction,[Steven C. Miller],"[Bruce Willis, Kellan Lutz, Gina Carano, D....","United States, United Kingdom, Canada",2021-09-01,R,action,2020-04-24,6.7,English,82.0,[action]
2,Prime Time,[Jakub Piątek],"[Bartosz Bielenia, Magdalena Popławska, Andr...",Desconocido,2021-06-30,TV-MA,thriller,2021-04-14,5.7,Polish,101.0,[thriller]
3,Blue Miracle,[Julio Quintana],"[Jimmy Gonzales, Dennis Quaid, Anthony Gonza...",United States,2021-05-27,TV-PG,drama,2021-05-27,6.7,English,101.0,[drama]
4,Ghost Lab,[Paween Purijitpanya],"[Thanapob Leeratanakachorn, Paris Intarakomal...",Thailand,2021-05-27,TV-MA,horror,2021-05-26,5.2,Thai,118.0,[horror]


In [79]:
for generos in df["genre2"]:
    for elemento in generos:
        print(elemento)

documentary
action
thriller
drama
horror
biopic
documentary
zombie
heist
comedy
crime drama
thriller
psychological thriller
science fiction thriller
drama
drama
drama
horror
comedy
drama
animated musical comedy
psychological thriller drama
documentary
drama
superhero-comedy
documentary
drama
romantic comedy
drama
drama
christian musical
hidden-camera prank comedy
comedy-drama
romantic teen drama
documentary
comedy
drama
action
drama
documentary
romantic comedy
thriller
romantic drama
romantic drama
thriller
romantic comedy
comedy
romantic drama
science fiction
documentary
romantic drama
drama
drama
drama
drama
comedy-drama
action
science fiction
documentary
comedy-drama
documentary
drama
drama
documentary
documentary
comedy
animation 
 short
comedy
thriller
superhero
science fiction
drama
aftershow 
 interview
romantic comedy
animation 
 short
documentary
musical
comedy
thriller
comedy
comedy
animation
drama
documentary
drama
drama
concert film
christmas comedy
drama
drama
documentary


In [72]:
df_director_top = df_director["director"].value_counts().reset_index().iloc[1:11, :]
df_director_top

Unnamed: 0,index,director
1,McG,3
2,Blair Simmons,2
3,John Schultz,2
4,Noah Baumbach,2
5,Karan Johar,2
6,Dibakar Banerjee,2
7,Julien Leclercq,2
8,Amy Poehler,2
9,Kyle Newacheck,2
10,Peter Sullivan,2


In [73]:
df_director_top = [dire.strip() for dire in df_director_top["index"].tolist()]
df_director_top

['McG',
 'Blair Simmons',
 'John Schultz',
 'Noah Baumbach',
 'Karan Johar',
 'Dibakar Banerjee',
 'Julien Leclercq',
 'Amy Poehler',
 'Kyle Newacheck',
 'Peter Sullivan']

In [74]:
with open("data/top_ten_director.pkl", "wb") as dire: 
    pickle.dump(df_director_top, dire)

Vemos que para la columna de `type` solo tenemos un tipo, ¿merece la pena entonces mantener esta columna? La verdad es que no, así que vamos a eliminarla

Dentro del análisis exploratorio es importante conocer todas las variables, con las categóricas es importante ver todas las categorías que tenemos para cada una de ellas y cuáles son sus frecuencias. 

Si bien es cierto que hemos sacado todas la categóricas, ¿tiene sentido explorarlas todas? Podríamos pensar que variables como los títulos de cada peli no nos interesa, porque tienen demasiadas categorías. En este caso podremos eliminarlas de nuestro df_cat

Al ver todos los valores únicos podemos ver que las columnas de `lister_in` y `genre` nos dan la misma información, así que procedemos a eliminar una de ellas: 

## Limpieza `director`, `cast`, `country`

Estas columnas eran las que tenían valores nulos

Imagineos que queremos cambiar el orden de las columnas, cambiemos el orden usando el método `reindex`. 

# Limpiamos la columna de `duration` 

# Limpiamos `date_added` y `premiere`

## Clean `cast` y `director`


Aplicamos la misma lógica para la columna de directores

## Limpiamos `genre` 


# Top 10 directores y actores