In [1]:
#!pip install fuzzywuzzy

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from collections import Counter
from fuzzywuzzy import process, fuzz
import pickle



# Cargamos los datos

In [3]:
# cargamos el csv donde tenemos todos los títulos que tenemos en Netflix
df_titulos = pd.read_csv("data/netflix_titles.csv", index_col = 0 )
df_titulos.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."


In [4]:
# cargamos el csv donde tenemos todas las producciones de Netflix
df_original = pd.read_csv("data/netflix_originals.csv", index_col = 0)
df_original.head(2)

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,Enter the Anime,Documentary,"August 5, 2019",58,2.5,English/Japanese
1,Dark Forces,Thriller,"August 21, 2020",81,2.6,Spanish


In [5]:
df_titulos.shape

(8807, 12)

In [6]:
df_original.shape

(513, 6)

# MIO

In [7]:
df = df_titulos.merge(df_original, left_on='title', right_on='Title', how = 'inner')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",Dick Johnson Is Dead,Documentary,"October 2, 2020",90,7.5,English
1,s142,Movie,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada","September 1, 2021",2015,R,82 min,Action & Adventure,"When a retired CIA agent is kidnapped, his son...",Extraction,Action,"April 24, 2020",117,6.7,English
2,s625,Movie,Prime Time,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",,"June 30, 2021",2021,TV-MA,,"Dramas, International Movies, Thrillers","On New Year’s Eve 1999, an armed man enters a ...",Prime Time,Thriller,"April 14, 2021",91,5.7,Polish
3,s835,Movie,Blue Miracle,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,"May 27, 2021",2021,TV-PG,,"Children & Family Movies, Dramas, Faith & Spir...","To save their cash-strapped orphanage, a guard...",Blue Miracle,Drama,"May 27, 2021",95,6.7,English
4,s837,Movie,Ghost Lab,Paween Purijitpanya,"Thanapob Leeratanakachorn, Paris Intarakomalya...",Thailand,"May 27, 2021",2021,TV-MA,118 min,"Horror Movies, International Movies, Thrillers","After witnessing a haunting in their hospital,...",Ghost Lab,Horror,"May 26, 2021",117,5.2,Thai


In [8]:
##me cargo las columnas que no me gustan
df.drop(['release_year', 'Title', 'Runtime', 'show_id', 'description'], axis = 1, inplace=True)
df.head(1)

Unnamed: 0,type,title,director,cast,country,date_added,rating,duration,listed_in,Genre,Premiere,IMDB Score,Language
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",PG-13,90 min,Documentaries,Documentary,"October 2, 2020",7.5,English


In [9]:
## voy a renombrar  mis columnas
nuevas_columnas = {col: col.strip().replace(' ', '_').lower() for col in df.columns}
nuevas_columnas

{'type': 'type',
 'title': 'title',
 'director': 'director',
 'cast': 'cast',
 'country': 'country',
 'date_added': 'date_added',
 'rating': 'rating',
 'duration': 'duration',
 'listed_in': 'listed_in',
 'Genre': 'genre',
 'Premiere': 'premiere',
 'IMDB Score': 'imdb_score',
 'Language': 'language'}

In [10]:
df.rename(columns=nuevas_columnas, inplace=True)
df.head(1)

Unnamed: 0,type,title,director,cast,country,date_added,rating,duration,listed_in,genre,premiere,imdb_score,language
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",PG-13,90 min,Documentaries,Documentary,"October 2, 2020",7.5,English


## análisis exploratorio

In [11]:
df.shape

(513, 13)

In [12]:
df.dtypes

type           object
title          object
director       object
cast           object
country        object
date_added     object
rating         object
duration       object
listed_in      object
genre          object
premiere       object
imdb_score    float64
language       object
dtype: object

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 513 entries, 0 to 512
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   type        513 non-null    object 
 1   title       513 non-null    object 
 2   director    491 non-null    object 
 3   cast        422 non-null    object 
 4   country     499 non-null    object 
 5   date_added  513 non-null    object 
 6   rating      513 non-null    object 
 7   duration    276 non-null    object 
 8   listed_in   513 non-null    object 
 9   genre       513 non-null    object 
 10  premiere    513 non-null    object 
 11  imdb_score  513 non-null    float64
 12  language    513 non-null    object 
dtypes: float64(1), object(12)
memory usage: 56.1+ KB


In [14]:
df.duplicated(subset = 'language').sum() ##es un resultado lógio en realidad

476

In [15]:
df.isnull().sum() ## tengo muchas categóricas como nulos, las llenaré con desconocidos

type            0
title           0
director       22
cast           91
country        14
date_added      0
rating          0
duration      237
listed_in       0
genre           0
premiere        0
imdb_score      0
language        0
dtype: int64

In [16]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
imdb_score,513.0,6.210916,0.96885,2.5,5.6,6.3,6.9,9.0


In [17]:
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
type,513,1,Movie,513
title,513,513,Dick Johnson Is Dead,1
director,491,462,McG,3
cast,422,421,Shawn Mendes,2
country,499,81,United States,274
date_added,513,354,"October 18, 2019",5
rating,513,10,TV-MA,254
duration,276,86,98 min,20
listed_in,513,103,Documentaries,69
genre,513,106,Documentary,132


In [18]:
df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
type,513.0,1.0,Movie,513.0,,,,,,,
title,513.0,513.0,Dick Johnson Is Dead,1.0,,,,,,,
director,491.0,462.0,McG,3.0,,,,,,,
cast,422.0,421.0,Shawn Mendes,2.0,,,,,,,
country,499.0,81.0,United States,274.0,,,,,,,
date_added,513.0,354.0,"October 18, 2019",5.0,,,,,,,
rating,513.0,10.0,TV-MA,254.0,,,,,,,
duration,276.0,86.0,98 min,20.0,,,,,,,
listed_in,513.0,103.0,Documentaries,69.0,,,,,,,
genre,513.0,106.0,Documentary,132.0,,,,,,,


In [19]:
## me cargo la columna de type, porque tiene solo un registro
df.drop(['type'], axis=1, inplace=True)
df.head(1)

Unnamed: 0,title,director,cast,country,date_added,rating,duration,listed_in,genre,premiere,imdb_score,language
0,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",PG-13,90 min,Documentaries,Documentary,"October 2, 2020",7.5,English


In [20]:
## vamos a ver que categorías tienen los datos únicos
df_cat = df.select_dtypes(include='object')
df_cat.head(1)

Unnamed: 0,title,director,cast,country,date_added,rating,duration,listed_in,genre,premiere,language
0,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",PG-13,90 min,Documentaries,Documentary,"October 2, 2020",English


In [21]:
## puedo tb seleccionar las de tipo numérico
df_num = df.select_dtypes(include=np.number)
df_num.head(1)

Unnamed: 0,imdb_score
0,7.5


In [22]:
for col in df_cat.columns[1:]:
    print(f"la cantidad de valores únicos de la columna es {col.upper()} es {len(df[col].unique())}")
    display(pd.DataFrame(df_cat[col].value_counts()))
    print("----")
    break

la cantidad de valores únicos de la columna es DIRECTOR es 463


Unnamed: 0,director
McG,3
Amy Poehler,2
Christopher Guest,2
Noah Baumbach,2
Michael Tiddes,2
...,...
Gina Prince-Bythewood,1
Sue Ding,1
Stéphane de Freitas,1
Stefano Mordini,1


----


mirando el dataframe, la columna de cast habrá que separarla para ver los actores únicos -- lo mismo pasa con la comlumna de países, existen las coproducciones

In [23]:
df.drop(['listed_in'], axis = 1, inplace=True)
df.head(1)

Unnamed: 0,title,director,cast,country,date_added,rating,duration,genre,premiere,imdb_score,language
0,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",PG-13,90 min,Documentary,"October 2, 2020",7.5,English


In [24]:
df.isnull().sum()[df.isnull().sum() > 0]

director     22
cast         91
country      14
duration    237
dtype: int64

In [25]:
## ojo, si quiero rellenar varias columnas tengo que asignarlas, no hacer el inplace
df[['director', 'country', 'cast']] = df[['director', 'country', 'cast']].fillna('Desconocido')

In [26]:
df.isnull().sum()[df.isnull().sum() > 0]

duration    237
dtype: int64

In [27]:
df.head()

Unnamed: 0,title,director,cast,country,date_added,rating,duration,genre,premiere,imdb_score,language
0,Dick Johnson Is Dead,Kirsten Johnson,Desconocido,United States,"September 25, 2021",PG-13,90 min,Documentary,"October 2, 2020",7.5,English
1,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada","September 1, 2021",R,82 min,Action,"April 24, 2020",6.7,English
2,Prime Time,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",Desconocido,"June 30, 2021",TV-MA,,Thriller,"April 14, 2021",5.7,Polish
3,Blue Miracle,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,"May 27, 2021",TV-PG,,Drama,"May 27, 2021",6.7,English
4,Ghost Lab,Paween Purijitpanya,"Thanapob Leeratanakachorn, Paris Intarakomalya...",Thailand,"May 27, 2021",TV-MA,118 min,Horror,"May 26, 2021",5.2,Thai


In [28]:
## limpio ahora la columna de duration
df['duration'] = df['duration'].str.split(' ', expand=True).get(0)
df.head()

Unnamed: 0,title,director,cast,country,date_added,rating,duration,genre,premiere,imdb_score,language
0,Dick Johnson Is Dead,Kirsten Johnson,Desconocido,United States,"September 25, 2021",PG-13,90.0,Documentary,"October 2, 2020",7.5,English
1,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada","September 1, 2021",R,82.0,Action,"April 24, 2020",6.7,English
2,Prime Time,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",Desconocido,"June 30, 2021",TV-MA,,Thriller,"April 14, 2021",5.7,Polish
3,Blue Miracle,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,"May 27, 2021",TV-PG,,Drama,"May 27, 2021",6.7,English
4,Ghost Lab,Paween Purijitpanya,"Thanapob Leeratanakachorn, Paris Intarakomalya...",Thailand,"May 27, 2021",TV-MA,118.0,Horror,"May 26, 2021",5.2,Thai


In [29]:
df['duration2'] = pd.to_numeric(df['duration'])
df.head(1)

Unnamed: 0,title,director,cast,country,date_added,rating,duration,genre,premiere,imdb_score,language,duration2
0,Dick Johnson Is Dead,Kirsten Johnson,Desconocido,United States,"September 25, 2021",PG-13,90,Documentary,"October 2, 2020",7.5,English,90.0


In [30]:
df['duration3'] = df['duration'].astype(int, errors='ignore') ## va como el orto, mejor el pd.to_numeric

In [31]:
df.dtypes

title          object
director       object
cast           object
country        object
date_added     object
rating         object
duration       object
genre          object
premiere       object
imdb_score    float64
language       object
duration2     float64
duration3      object
dtype: object

In [32]:
df['duration'] = pd.to_numeric(df['duration'])
df.drop(['duration2', 'duration3'], axis = 1, inplace=True)

In [33]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,276.0,93.807971,34.883954,5.0,79.75,101.0,119.0,209.0
imdb_score,513.0,6.210916,0.96885,2.5,5.6,6.3,6.9,9.0


In [34]:
df[df['duration'] == min(df['duration'])]

Unnamed: 0,title,director,cast,country,date_added,rating,duration,genre,premiere,imdb_score,language
206,Sol Levante,Akira Saitoh,Desconocido,Japan,"April 2, 2020",TV-14,5.0,Anime / Short,"April 2, 2020",4.7,English


In [35]:
df[df['duration'].isnull()]['genre'].value_counts()

Documentary                    47
Drama                          33
Comedy                         22
Romantic comedy                21
Thriller                       15
                               ..
Sports film                     1
Adventure-romance               1
Coming-of-age comedy-drama      1
Animation / Science Fiction     1
Comedy / Musical                1
Name: genre, Length: 66, dtype: int64

In [36]:
df['duration'] = df['duration'].fillna(df['duration'].median())
df.head(1)

Unnamed: 0,title,director,cast,country,date_added,rating,duration,genre,premiere,imdb_score,language
0,Dick Johnson Is Dead,Kirsten Johnson,Desconocido,United States,"September 25, 2021",PG-13,90.0,Documentary,"October 2, 2020",7.5,English


In [37]:
## cambio el formato de la fecha
df['premiere'] = pd.to_datetime(df['premiere'])
df['date_added'] = pd.to_datetime(df['date_added'])
df.dtypes

title                 object
director              object
cast                  object
country               object
date_added    datetime64[ns]
rating                object
duration             float64
genre                 object
premiere      datetime64[ns]
imdb_score           float64
language              object
dtype: object

In [38]:
df.head(1)

Unnamed: 0,title,director,cast,country,date_added,rating,duration,genre,premiere,imdb_score,language
0,Dick Johnson Is Dead,Kirsten Johnson,Desconocido,United States,2021-09-25,PG-13,90.0,Documentary,2020-10-02,7.5,English


In [39]:
## vamos a limpiar la columna del cast -- creandonos un dataframe aparte
df['cast'] = df['cast'].str.split(',')
df['cast'][1]

['Bruce Willis',
 ' Kellan Lutz',
 ' Gina Carano',
 ' D.B. Sweeney',
 ' Joshua Mikel',
 ' Steve Coulter',
 ' Dan Bilzerian',
 ' Heather Johansen']

In [40]:
df_actor = df.explode('cast')
df_actor.shape

(3444, 11)

In [41]:
df_actor.head(5)

Unnamed: 0,title,director,cast,country,date_added,rating,duration,genre,premiere,imdb_score,language
0,Dick Johnson Is Dead,Kirsten Johnson,Desconocido,United States,2021-09-25,PG-13,90.0,Documentary,2020-10-02,7.5,English
1,Extraction,Steven C. Miller,Bruce Willis,"United States, United Kingdom, Canada",2021-09-01,R,82.0,Action,2020-04-24,6.7,English
1,Extraction,Steven C. Miller,Kellan Lutz,"United States, United Kingdom, Canada",2021-09-01,R,82.0,Action,2020-04-24,6.7,English
1,Extraction,Steven C. Miller,Gina Carano,"United States, United Kingdom, Canada",2021-09-01,R,82.0,Action,2020-04-24,6.7,English
1,Extraction,Steven C. Miller,D.B. Sweeney,"United States, United Kingdom, Canada",2021-09-01,R,82.0,Action,2020-04-24,6.7,English


In [42]:
df_actor = df_actor[['title', 'cast']] 
df_actor.head()

Unnamed: 0,title,cast
0,Dick Johnson Is Dead,Desconocido
1,Extraction,Bruce Willis
1,Extraction,Kellan Lutz
1,Extraction,Gina Carano
1,Extraction,D.B. Sweeney


In [43]:
df['director'].unique().tolist()[:10]

['Kirsten Johnson',
 'Steven C. Miller',
 'Jakub Piątek',
 'Julio Quintana',
 'Paween Purijitpanya',
 'Letizia Lamartire',
 'Daniel Vernon',
 'Zack Snyder',
 'Kaashvie Nair',
 'Cecilia Verheyden']

In [44]:
df['director'] = df['director'].str.split(',')
df_director = df.explode('director')
df_director = df_director[['title', 'director']]
df_director.head(20)

Unnamed: 0,title,director
0,Dick Johnson Is Dead,Kirsten Johnson
1,Extraction,Steven C. Miller
2,Prime Time,Jakub Piątek
3,Blue Miracle,Julio Quintana
4,Ghost Lab,Paween Purijitpanya
5,Baggio: The Divine Ponytail,Letizia Lamartire
6,Nail Bomber: Manhunt,Daniel Vernon
7,Army of the Dead,Zack Snyder
8,Sardar Ka Grandson,Kaashvie Nair
9,Ferry,Cecilia Verheyden


los 10 actores que más salen en películas de netflix

In [45]:
df_groupby_actr = df_actor.groupby(['cast']).count().sort_values(by='title', ascending=False)
df_groupby_actr.head(10)

Unnamed: 0_level_0,title
cast,Unnamed: 1_level_1
Desconocido,91
Adam Sandler,7
Maya Rudolph,6
Andrew Bachelor,5
Keegan-Michael Key,4
Robbie Amell,4
Jacki Weaver,4
Rob Schneider,4
Nick Swardson,4
Lakeith Stanfield,4


In [46]:
top_10_actor = df_groupby_actr.iloc[1:11, :]#.reset_index()
top_10_actor = top_10_actor.index.tolist()

In [47]:
top_10_actor = [actor.strip() for actor in top_10_actor]
top_10_actor

['Adam Sandler',
 'Maya Rudolph',
 'Andrew Bachelor',
 'Keegan-Michael Key',
 'Robbie Amell',
 'Jacki Weaver',
 'Rob Schneider',
 'Nick Swardson',
 'Lakeith Stanfield',
 'Zachary Quinto']

In [48]:
## otra forma de hacerlo
df_actor['cast'].value_counts()

Desconocido           91
Adam Sandler           7
 Maya Rudolph          6
 Andrew Bachelor       5
 Lakeith Stanfield     4
                      ..
 Cree Cicchino         1
 Lucas Jaye            1
 Karla Souza           1
 Enuka Okuma           1
David Sampliner        1
Name: cast, Length: 3005, dtype: int64

voy a guardar actores

In [49]:
## crea el archivo en formato pickle, para que luego lo tengamos disponible para otras veces
with open('data/top_ten_actor.pickle', 'wb') as actor:
    pickle.dump(top_10_actor, actor)

In [50]:
df_director.head()

Unnamed: 0,title,director
0,Dick Johnson Is Dead,Kirsten Johnson
1,Extraction,Steven C. Miller
2,Prime Time,Jakub Piątek
3,Blue Miracle,Julio Quintana
4,Ghost Lab,Paween Purijitpanya


In [51]:
df_director_top = df_director['director'].value_counts().reset_index().iloc[1:11]
df_director_top

Unnamed: 0,index,director
1,McG,3
2,Blair Simmons,2
3,John Schultz,2
4,Noah Baumbach,2
5,Karan Johar,2
6,Dibakar Banerjee,2
7,Julien Leclercq,2
8,Amy Poehler,2
9,Kyle Newacheck,2
10,Peter Sullivan,2


In [52]:
top_10_director = [dire.strip() for dire in df_director_top['index'].tolist()]
top_10_director

['McG',
 'Blair Simmons',
 'John Schultz',
 'Noah Baumbach',
 'Karan Johar',
 'Dibakar Banerjee',
 'Julien Leclercq',
 'Amy Poehler',
 'Kyle Newacheck',
 'Peter Sullivan']

In [53]:
df['genre'].unique()[:10]

array(['Documentary', 'Action', 'Thriller', 'Drama', 'Horror', 'Biopic',
       'Zombie/Heist', 'Comedy', 'Crime drama', 'Psychological thriller'],
      dtype=object)

In [54]:
df['genre'] = df['genre'].str.lower()

In [55]:
df['genre'].unique()[:10]

array(['documentary', 'action', 'thriller', 'drama', 'horror', 'biopic',
       'zombie/heist', 'comedy', 'crime drama', 'psychological thriller'],
      dtype=object)

In [56]:
df['genre2'] = df['genre'].str.split('/')

In [57]:
df.head()

Unnamed: 0,title,director,cast,country,date_added,rating,duration,genre,premiere,imdb_score,language,genre2
0,Dick Johnson Is Dead,[Kirsten Johnson],[Desconocido],United States,2021-09-25,PG-13,90.0,documentary,2020-10-02,7.5,English,[documentary]
1,Extraction,[Steven C. Miller],"[Bruce Willis, Kellan Lutz, Gina Carano, D....","United States, United Kingdom, Canada",2021-09-01,R,82.0,action,2020-04-24,6.7,English,[action]
2,Prime Time,[Jakub Piątek],"[Bartosz Bielenia, Magdalena Popławska, Andr...",Desconocido,2021-06-30,TV-MA,101.0,thriller,2021-04-14,5.7,Polish,[thriller]
3,Blue Miracle,[Julio Quintana],"[Jimmy Gonzales, Dennis Quaid, Anthony Gonza...",United States,2021-05-27,TV-PG,101.0,drama,2021-05-27,6.7,English,[drama]
4,Ghost Lab,[Paween Purijitpanya],"[Thanapob Leeratanakachorn, Paris Intarakomal...",Thailand,2021-05-27,TV-MA,118.0,horror,2021-05-26,5.2,Thai,[horror]


In [58]:
## pa ver cuantos géneros hay, lo vamos a hacer con counter
## el counter necesita strings, por lo que tenemos que iterar por las listas que tengo
cuenta_generos = Counter(elementos.strip() for generos in df['genre2'] for elementos in generos)
cuenta_generos

Counter({'documentary': 132,
         'action': 11,
         'thriller': 37,
         'drama': 77,
         'horror': 12,
         'biopic': 8,
         'zombie': 1,
         'heist': 2,
         'comedy': 53,
         'crime drama': 11,
         'psychological thriller': 3,
         'science fiction thriller': 1,
         'animated musical comedy': 1,
         'psychological thriller drama': 1,
         'superhero-comedy': 1,
         'romantic comedy': 36,
         'christian musical': 1,
         'hidden-camera prank comedy': 1,
         'comedy-drama': 15,
         'romantic teen drama': 1,
         'romantic drama': 5,
         'science fiction': 15,
         'animation': 15,
         'short': 5,
         'superhero': 4,
         'aftershow': 4,
         'interview': 4,
         'musical': 6,
         'concert film': 1,
         'christmas comedy': 1,
         'stop motion': 1,
         'family': 4,
         'christmas musical': 1,
         'anthology': 1,
         'dark comedy': 

In [59]:
cuenta_generos.most_common(10)

[('documentary', 132),
 ('drama', 77),
 ('comedy', 53),
 ('thriller', 37),
 ('romantic comedy', 36),
 ('comedy-drama', 15),
 ('science fiction', 15),
 ('animation', 15),
 ('horror', 12),
 ('action', 11)]

vamos a ver si hay cosas en común entre los strings de géneros

In [60]:
fuzz.ratio('documentary', 'romantic comedy')

38

In [61]:
# (most common, mis generos))
fuzz.ratio('documentary', 'patata')

24

In [64]:
dict_generos = dict(cuenta_generos.most_common(10))
dict_generos

{'documentary': 132,
 'drama': 77,
 'comedy': 53,
 'thriller': 37,
 'romantic comedy': 36,
 'comedy-drama': 15,
 'science fiction': 15,
 'animation': 15,
 'horror': 12,
 'action': 11}

In [80]:
def sacar_parecido(col, diccionario_generos):   
    maximo = 0
    for key in diccionario_generos.keys():
    #print(key)
        #print(genero)
        comparo = fuzz.ratio(key, col)
        if comparo > maximo:
            maximo = comparo
            resultado = key
    if maximo > 60:
        return resultado
    else:
        return 'otro'

In [81]:
df['genero'] = df.apply(lambda x: sacar_parecido(x['genre'], dict_generos), axis=1)
df['genero'].value_counts()

documentary        133
otro                79
drama               75
romantic comedy     57
comedy              50
thriller            47
comedy-drama        28
science fiction     17
horror              10
animation           10
action               7
Name: genero, dtype: int64

# ORIGINAL
## Juntamos los dos dataframes

In [None]:
# juntamos los dos dataframes
df = df_titulos.merge(df_original, left_on= "title", right_on="Title", how="inner")
df.head(2)

In [None]:

# Dropping the columns that are not needed for the analysis.
df.drop(['release_year', 'Runtime', 'description','Title', "show_id"], axis = 1, inplace=True)
df.head(2)

Tenemos columnas en mayúsculas y otras en minúsculas, y con espacios. Vamos a intentar homogeneizarlo:

In [None]:
columnas_nuevas = {col : col.replace(" ", "_").lower() for col in df.columns}
print(columnas_nuevas)

In [None]:
df.rename(columns = columnas_nuevas, inplace = True)
df.head(2)

# Análisis exploratorio

In [None]:
# Número de filas  y columnas del dataframe
df.shape

In [None]:
# Información básica de cada una de las columnas del df
df.info()

In [None]:
# Contamos el número de filas duplicadas del dataframe
df.duplicated().sum()

In [None]:
# Contamos el número de nulos del dataframe
df.isnull().sum()

In [None]:
# Principales estadísticos de las columnas numéricas
df.describe().T

In [None]:
# Principales estadísticos de las columnas categóricas
df.describe(include = "object").T

Vemos que para la columna de `type` solo tenemos un tipo, ¿merece la pena entonces mantener esta columna? La verdad es que no, así que vamos a eliminarla

In [None]:
df.drop(["type"], axis = 1, inplace = True)
df.head(2)

Dentro del análisis exploratorio es importante conocer todas las variables, con las categóricas es importante ver todas las categorías que tenemos para cada una de ellas y cuáles son sus frecuencias. 

In [None]:
# creemos un dataframe solo con las variables categóricas

df_cat = df.select_dtypes(include = "object")
df_cat.head()

Si bien es cierto que hemos sacado todas la categóricas, ¿tiene sentido explorarlas todas? Podríamos pensar que variables como los títulos de cada peli no nos interesa, porque tienen demasiadas categorías. En este caso podremos eliminarlas de nuestro df_cat

In [None]:
df_cat.drop(["title"], axis = 1, inplace = True)
df_cat.head()

In [None]:
for columna in df_cat.columns:
    print(f"la cantidad de valores únicos para la columna {columna.upper()} son {len(df_cat[columna].unique())} y estos valores son")
    (display(pd.DataFrame(df_cat[columna].value_counts())))
    print("---------------------------------------------")

Al ver todos los valores únicos podemos ver que las columnas de `lister_in` y `genre` nos dan la misma información, así que procedemos a eliminar una de ellas: 

In [None]:
df.drop(["listed_in"], axis = 1, inplace = True)
df.head(2)

## Limpieza `director`, `cast`, `country`

Estas columnas eran las que tenían valores nulos

In [None]:
df.isnull().sum()[df.isnull().sum() > 0]

In [None]:
# eliminamos los valores nulos de las columnas "director", "cast", "country" por Unknown
df[["director", "cast", "country"]]= df[["director", "cast", "country"]].fillna("Unknown")

In [None]:
df.head(2)

Imagineos que queremos cambiar el orden de las columnas, cambiemos el orden usando el método `reindex`. 

In [None]:
# creamos una lista con el nuevo orden que queremos
new_order = ['title', 'director', 'cast', 'country', 'language', 'rating', 'genre',
       'premiere', 'date_added', 'duration', 'imdb_score' ]

# aplicamos el método reindex
df = df.reindex(columns=new_order)
df.head(2)

# Limpiamos la columna de `duration` 

In [None]:
# lo primero que hacemos es ver los valores únicos

df["duration"].unique()

In [None]:
# como todos los valores tienen la misma estructura, lo único que tenemos que hacer es quitarle el min. 

df["duration"] = df["duration"].str.split(" ", expand = True).get(0)
df.head()

In [None]:
# esta columna era de tipo object, veamos ahora si es de tipo numérico
df.dtypes

In [None]:
# la columna "duration" sigue siendo de tipo object, pero esto no tiene sentido, convirtamosla a numérica
df['duration'] = pd.to_numeric(df['duration'], errors='coerce')

In [None]:
# confirmamos la conversion de los valores
df.dtypes

In [None]:
# para los nulos de duración, los reemplazaremos por la mediana
df[df["duration"].isnull()].head()



In [None]:
# en este caso sólo tenemos valores nulos en "Movies", por lo que necesitamos calcular la duración mediana para este tipo de información
median_movies = df["duration"].median()

# Sustituir los valores nulos de la columna "duration" por el valor medio de la columna "duración" para el tipo "Movie".
df["duration"] = df["duration"].fillna(median_movies)



In [None]:
# check the null values again
df.isnull().sum()

# Limpiamos `date_added` y `premiere`

In [None]:
df.head(2)

In [None]:
# convertimos al fecha a datetime
df["premiere"] = pd.to_datetime(df[ "premiere"])
df["date_added"] = pd.to_datetime(df["date_added"])

In [None]:
# confirmamos la transformación
df.dtypes

In [None]:
df.head(2)

## Clean `cast` y `director`


In [None]:
df_act_dire = df.copy()

In [None]:
df["cast"].unique().tolist()[1]

In [None]:
# lo primero que tenemos que hacer es separar cada ector y actriz

df_act_dire["cast"] = df_act_dire["cast"].str.split(",")
df_act_dire.head(2)

In [None]:
# separamos cada actor en una fila nueva
df_actores = df_act_dire.explode("cast")
df_actores.head()

Aplicamos la misma lógica para la columna de directores

In [None]:
df["director"].unique().tolist()[16]

In [None]:
df_act_dire["director"] = df_act_dire["director"].str.split(",")
df_act_dire.head(2)

In [None]:
# separamos cada actor en una fila nueva
df_directores = df_act_dire.explode("director")
df_directores.head(3)

## Limpiamos `genre` 


In [None]:
# vemos los valores únicos

print(df["genre"].unique().tolist())

In [None]:
# para facilitar la limpieza lo primero qye hacemos es poner todos los géneros en minúscula

df["genre"] = df["genre"].str.lower()

In [None]:
# comprobamos que están en minúsculas

print(df["genre"].unique().tolist())

In [None]:
df['genre2'] = df['genre'].str.split("/")
df.head(2)


In [None]:
# usando la librería Counter, calculamos los géneros más comunes.
cuenta_generos = Counter(genres for genres in df['genre'])
print(f"Hay {len(cuenta_generos)} generos diferentes.")

In [None]:
comunes = dict(cuenta_generos.most_common(10))
comunes

In [None]:
fuzz.ratio('romantic comedy', "documentary")

In [None]:
def generos_(col, generos_comunes):
    maximo = 0
    for key in generos_comunes.keys():
        parecido = fuzz.ratio(col, key)
        if parecido > maximo:
            maximo = parecido
            genero = key
            
    # nos aseguramos de que los géneros se parezcan en al menos un 90%        
    if maximo > 50:
        return genero
    else:
        return "Other"
            

In [None]:
df["genre2"] = df.apply(lambda x: generos_(x["genre"], comunes), axis = 1)

In [None]:
df["genre2"].value_counts()

# Top 10 directores y actores

In [None]:
df_actores.head()

In [None]:
top_actores = df_actores["cast"].value_counts().reset_index()
top_actores.head()

In [None]:
lista_top_actores = top_actores.iloc[1:11,:]["index"].tolist()
lista_top_actores = [actor.strip() for actor in lista_top_actores]
print(lista_top_actores)

In [None]:
df_directores.head(2)

In [None]:
top_directores = df_directores["director"].value_counts().reset_index()
top_directores.head()

In [None]:
lista_top_directores = top_directores.iloc[1:11,:]["index"].tolist()
lista_top_directores = [director.strip() for director in lista_top_directores]
print(lista_top_directores)

In [None]:
# guardamos estos resultados para usarlos mañana
with open ('../data/pickle/top10_directors.pickle', "wb") as dire:
    pickle.dump(lista_top_directores, dire)
    
with open ('../data/pickle/top10_actors.pickle', "wb") as acto:
    pickle.dump(lista_top_actores, acto)