# Datasets 2019
Para cada año tenemos 3 tablas a inspeccionar, limpiar y unir:
- `title_basics`: pública y descargada de IMDb
- `title_ratings`: pública y descargada de IMDb
- `movies`: propia, escrapeada de la web de IMDb para todas las películas del año correspondiente

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
anno = 2019

In [3]:
# Tabla descargada de IMDb

title_basics = pd.read_csv('../data/imdb/title_basics.tsv', sep='\t', na_values='\\N')
print(title_basics.shape)
title_basics.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


(8084314, 9)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1.0,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5.0,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892.0,,4.0,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0.0,1892.0,,12.0,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1.0,"Comedy,Short"


In [4]:
# Tabla descargada de IMDb

title_ratings = pd.read_csv('../data/imdb/title_ratings.tsv', sep='\t')
print(title_ratings.shape)
title_ratings.head()

(1171920, 3)


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1722
1,tt0000002,6.0,211
2,tt0000003,6.5,1484
3,tt0000004,6.1,124
4,tt0000005,6.2,2286


In [5]:
# Películas de escrapeadas de la web IMDb

movies = pd.read_csv('../data/web_imdb/movies_df_'+ str(anno) + '.csv', sep=';', na_values=["[]", "['']", "['', '']", "['', '', '']"])
print(movies.shape)
movies.head()

(17181, 20)


Unnamed: 0,imdb_id,title,original_title,year,certificate,duration,directors,writers,stars,genres,countries,companies,imdb_rating,metascore,popularity,awards,budget,gross_us_canada,opening_us_canada,gross_world
0,tt0011216,La fête espagnole,,2019.0,,,['Germaine Dulac'],['Louis Delluc'],"['Ève Francis', 'Gabriel Gabrio', 'Jean Toulout']",['Drama'],['France'],['Les Films Louis Nalpas'],6.9,,,,,,,
1,tt0011801,Tötet nicht mehr,,2019.0,,,['Lupu Pick'],"['Gerhard Lamprecht', 'Lupu Pick']","['Lupu Pick', 'Edith Posca', 'Johannes Riemann']","['Action', 'Crime']",['Germany'],['Rex-Film GmbH'],,,,,,,,
2,tt0116991,Mariette in Ecstasy,,2019.0,PG-13,1h 41min,['John Bailey'],['Ron Hansen'],"[""Geraldine O'Rawe"", 'Eva Marie Saint', 'Alex ...",['Drama'],['United States'],"['Price Entertainment', 'Savoy Pictures']",7.6,,,,,,,
3,tt0170155,A Letter from Beirut,,2019.0,,,['Jocelyn Saab'],,,['Documentary'],['France'],['France Régions 3 (FR3)'],6.8,,,,,,,
4,tt0195933,Mysteries,,2019.0,,,['Gregory J. Markopoulos'],,,,['United States'],,7.0,,,,,,,


In [6]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17181 entries, 0 to 17180
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   imdb_id            17181 non-null  object 
 1   title              17168 non-null  object 
 2   original_title     775 non-null    object 
 3   year               17134 non-null  float64
 4   certificate        2358 non-null   object 
 5   duration           2314 non-null   object 
 6   directors          16481 non-null  object 
 7   writers            15096 non-null  object 
 8   stars              11126 non-null  object 
 9   genres             16417 non-null  object 
 10  countries          16624 non-null  object 
 11  companies          12533 non-null  object 
 12  imdb_rating        9413 non-null   float64
 13  metascore          814 non-null    float64
 14  popularity         262 non-null    object 
 15  awards             4463 non-null   object 
 16  budget             466

---
## Número de películas

En primer lugar verificar si hay una discrepancia entre el número de películas obtenidas de title_basics.tsv de IMDb
y el número de películas resultantes del scrapping.

In [7]:
# Cargar las películas de la base de datos de IMDb en una lista

imdb_ids = title_basics[(title_basics.titleType=='movie') & (title_basics.startYear==anno)]
imdb_ids = list(imdb_ids['tconst'])
len(imdb_ids)

17181

In [8]:
scrap_ids = list(movies['imdb_id'])
len(scrap_ids)

17181

---
## 1. Bd de IMDb: `title_basics`

### Variable `runtimeMinutes`  

In [9]:
title_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8084314 entries, 0 to 8084313
Data columns (total 9 columns):
 #   Column          Dtype  
---  ------          -----  
 0   tconst          object 
 1   titleType       object 
 2   primaryTitle    object 
 3   originalTitle   object 
 4   isAdult         float64
 5   startYear       float64
 6   endYear         float64
 7   runtimeMinutes  object 
 8   genres          object 
dtypes: float64(3), object(6)
memory usage: 555.1+ MB


#### Convertir `runtimeMinutes` en float
La siguiente función convierte los valores en float si se puede de manera natural, y si no se puede los imprime y convierte a nulo.

In [10]:
def to_float(n):
    try:
        n = float(n)
        return n
    except:
        print(n)
        return np.nan

In [11]:
title_basics.runtimeMinutes = title_basics.runtimeMinutes.apply(to_float)
title_basics.runtimeMinutes

Reality-TV
Documentary
Talk-Show
Game-Show
Reality-TV
Animation,Comedy,Family
Reality-TV


0           1.0
1           5.0
2           4.0
3          12.0
4           1.0
           ... 
8084309     NaN
8084310     NaN
8084311     NaN
8084312    27.0
8084313    10.0
Name: runtimeMinutes, Length: 8084314, dtype: float64

#### Eliminar de `title_basics` los registros con outliers en `runtimeMinutes`

In [12]:
# Buscar outliers mayores de 1.000 minutos y menores 15
# El resltado es un listado de películas fake, cortos o compendios de pelis.
# Eliminar todos estos registros del scrapping

title_basics = title_basics[(title_basics.titleType=='movie') & (title_basics.startYear==anno)]

title_basics = title_basics[~((title_basics.runtimeMinutes<15) | (title_basics.runtimeMinutes>500))]
title_basics.shape

(17171, 9)

In [13]:
# Una vez eliminados los outliers estos son los estadísticos:

title_basics.runtimeMinutes.describe().round(2)

count    13397.00
mean        91.67
std         26.53
min         25.00
25%         75.00
50%         90.00
75%        104.00
max        480.00
Name: runtimeMinutes, dtype: float64

---
## 2. Bd de IMDb: `title_ratings`

In [14]:
title_ratings = pd.read_csv('../data/imdb/title_ratings.tsv', sep='\t')
print(title_ratings.shape)
title_ratings.head()

(1171920, 3)


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1722
1,tt0000002,6.0,211
2,tt0000003,6.5,1484
3,tt0000004,6.1,124
4,tt0000005,6.2,2286



---
## 3. Bd propia escrapeada del portal IMDb: `movies`

## `year`
La base de datos de IMDb descargada tiene algunas discrepancias en cuanto al año de las películas, no coindice el año en la base de datos descargada de IMDb con el año del portal web, en el caso del año 2019 ocurre en 12 registros. Existe algún caso en el que tampoco coincide el año de la película en el portal Metacritic. Hay varias formas de afrontar estas discrepancias, en este caso, y por simplificación, utilizaré el año de la base de datos de IMDb.
Hay también algunos registros en la base de datos con valores nulos para el año y demás columnas, y serán eliminados (47 registors en total).

In [15]:
# Películas escrapeadas con año diferente
len(movies[(movies['year']!=anno) & movies['year'].notnull()])

12

In [16]:
movies[(movies['year']!=anno) & movies['year'].notnull()]

Unnamed: 0,imdb_id,title,original_title,year,certificate,duration,directors,writers,stars,genres,countries,companies,imdb_rating,metascore,popularity,awards,budget,gross_us_canada,opening_us_canada,gross_world
2294,tt10461238,Scorpio,,2020.0,,,['Julian Noble'],['Julian Noble'],"['Cecilia Camarena', 'Margarita Chavarría', 'A...",['Drama'],['Mexico'],,,,,,,,,
4524,tt10929218,Black,,2018.0,,,['David J. Buchanan'],['David J. Buchanan'],"['Toussaint Morrison', 'Geoff Briley', 'Malick...",['Drama'],['United States'],,1.7,,,Awards\n2 nominations,,,,
8439,tt12538404,Los versos salvados,,2021.0,,,['Gabriel Szollosy'],"['Celina Galeano', 'Fernanda Galeano']",,['Documentary'],,,,,,,,,,
10213,tt3465026,Palm Swings,,2017.0,Unrated,1h 35min,['Sean Hoessli'],['Amanda Lockhart'],"['Tia Carrere', 'Jason Lewis', 'Diane Farr']","['Comedy', 'Drama', 'Romance']",['United States'],['Code Blue Pictures'],4.4,,,,,,,
10541,tt4762486,Waging Change,,2020.0,,,['Abby Ginzberg'],"['Mark Bittman', 'Jane Fonda', 'Saru Jayaraman']",,['Documentary'],['United States'],['Social Action Media'],4.2,,,,,,,
10814,tt5565254,Wetlands,,2017.0,TV-MA,1h 38min,['Emanuele Della Valle'],['Emanuele Della Valle'],"['Adewale Akinnuoye-Agbaje', 'Heather Graham',...","['Crime', 'Drama', 'Thriller']",['United States'],['Wetlands Productions'],4.3,37.0,,,"$20,000,000",,,
11393,tt6535838,Woodstock or Bust,,2018.0,Not Rated,1h 30min,['Leslie Bloom'],"['Judi Blaze', 'Leslie Bloom']","['Willow Shields', 'Meg DeLacy', 'Teddy Van Ee']",['Drama'],['United States'],['Big Kid Films'],4.2,,,Awards\n8 wins & 3 nominations,,,,
11462,tt6666116,Summertime Dropouts,,2022.0,,,['Jhene Chase'],"['Jhene Chase', 'Hannah Maslinski', 'Hamid Tor...","['Quinton Aaron', 'Serena Laurel', 'Josh Richa...","['Comedy', 'Family']",['United States'],['Winter State Entertainment'],,,,,,,,
12080,tt7391064,Petersburg,,2021.0,,,['Mary Sue Connolly'],"['Kelly Beazley', 'Kevin Bowman', 'Brandi Brai...",,['Documentary'],['United States'],,,,,Awards\n5 nominations,,,,
13195,tt8291224,Uri: The Surgical Strike,,2018.0,Not Rated,2h 18min,['Aditya Dhar'],['Aditya Dhar'],"['Vicky Kaushal', 'Paresh Rawal', 'Mohit Raina']","['Action', 'Drama', 'War']",['India'],"['Bulb Chamka', 'RSVP']",8.2,,,Awards\n17 wins & 24 nominations,"₹450,000,000","$4,186,168","$631,146","$39,346,577"


In [17]:
title_basics[title_basics['tconst']=='tt10461238']

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
1231149,tt10461238,movie,Scorpio,Scorpio,0.0,2019.0,,127.0,Drama


#### Eliminar del df escrapeado las películas de años posteriores a 2019
(son pelis futuras que se han colado)

In [18]:
movies = movies[~(movies['year']>2019)]
len(movies)

17174

#### Eliminar del df escrapeado las películas sin año
(son pelis sin información o url en el portal)

In [19]:
# Películas escrapeadas con año null, no tienen casi información, serán eliminadas
movies = movies[movies['year'].notnull()]
len(movies)

17127

## `imdb_rating`
Esta variable será sustituida por la de la base de datos `title_rating` descargada de IMDb

In [20]:
movies[['imdb_rating']].describe().round(2)

Unnamed: 0,imdb_rating
count,9410.0
mean,6.17
std,1.49
min,1.0
25%,5.3
50%,6.3
75%,7.2
max,10.0


## `metascore`
Comparing to: https://www.metacritic.com/feature/best-movies-released-in-2019

Hay pequeñas discrepancias en el listado de películas, debido a las fechas de estreno. Metacritic considera la fecha de estreno en USA. IMDb la fecha de estreno mundial. Pero sí que coinciden las puntuaciones y la mayoría de películas asignadas a 2019.

In [21]:
movies[['imdb_id', 'title', 'original_title', 'imdb_rating', 'metascore', 'year']][movies.metascore>=82].sort_values('metascore', ascending=False)

Unnamed: 0,imdb_id,title,original_title,imdb_rating,metascore,year
11518,tt6751668,Parásitos,Gisaengchung,8.6,96.0,2019.0
14696,tt9067182,Rocks,,7.5,96.0,2019.0
13709,tt8613070,Retrato de una mujer en llamas,Portrait de la jeune fille en feu,8.1,95.0,2019.0
3522,tt10706602,Collective,Colectiv,8.2,95.0,2019.0
8861,tt1302006,El irlandés,The Irishman,7.8,94.0,2019.0
...,...,...,...,...,...,...
10761,tt5363618,Sound of Metal,,7.8,82.0,2019.0
10370,tt4169146,Give Me Liberty,,6.7,82.0,2019.0
10126,tt2762506,Bacurau,,7.4,82.0,2019.0
9005,tt13236440,Lost Course,,8.2,82.0,2019.0


In [22]:
np.sort(movies.metascore.dropna().unique())

array([ 8., 10., 12., 13., 16., 17., 18., 19., 21., 22., 24., 25., 26.,
       27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39.,
       40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., 51., 52.,
       53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
       66., 67., 68., 69., 70., 71., 72., 73., 74., 75., 76., 77., 78.,
       79., 80., 81., 82., 83., 84., 85., 86., 87., 88., 89., 90., 91.,
       94., 95., 96.])

In [23]:
movies.metascore.describe()

count    813.000000
mean      60.928659
std       16.190593
min        8.000000
25%       51.000000
50%       63.000000
75%       73.000000
max       96.000000
Name: metascore, dtype: float64

## `popularity`
Valor que vamos a descartar porque es muy volátil, cambia continuamente. Así, no guarda relación temporal con la taquilla que es el objeto de estudio.

In [24]:
# Convertirlo a número
movies['popularity'] = movies.popularity.str.replace(',', '').astype(float)

In [25]:
movies.popularity.describe()

count     261.000000
mean     2172.670498
std      1417.215463
min        23.000000
25%       920.000000
50%      2047.000000
75%      3329.000000
max      4946.000000
Name: popularity, dtype: float64

In [26]:
movies[movies.title=='Five Feet Apart']

Unnamed: 0,imdb_id,title,original_title,year,certificate,duration,directors,writers,stars,genres,countries,companies,imdb_rating,metascore,popularity,awards,budget,gross_us_canada,opening_us_canada,gross_world


## `directors`, `writers`, `stars`, `countries`, `companies`
Convertir a: comma separated values

In [27]:
movies['directors'] = movies['directors'].str.replace('[', '', regex=False) \
                                         .str.replace(']', '', regex=False) \
                                         .str.replace("'", "", regex=False) \
                                         .str.replace(", ", ",", regex=False)

In [28]:
movies['writers'] = movies['writers'].str.replace('[', '', regex=False) \
                                     .str.replace(']', '', regex=False) \
                                     .str.replace("'", "", regex=False) \
                                     .str.replace(", ", ",", regex=False)

In [29]:
movies['stars'] = movies['writers'].str.replace('[', '', regex=False) \
                                   .str.replace(']', '', regex=False) \
                                   .str.replace("'", "", regex=False) \
                                   .str.replace(", ", ",", regex=False)

In [30]:
movies['countries'] = movies['countries'].str.replace('[', '', regex=False) \
                                         .str.replace(']', '', regex=False) \
                                         .str.replace("'", "", regex=False) \
                                         .str.replace(", ", ",", regex=False)

In [31]:
movies['companies'] = movies['companies'].str.replace('[', '', regex=False) \
                                         .str.replace(']', '', regex=False) \
                                         .str.replace("'", "", regex=False) \
                                         .str.replace(", ", ",", regex=False)

## `opening_us_canada`, `gross_us_canada`, `gross_world`
Convertir estos valores a números, eliminando el signo del dólar

In [32]:
def remove_dolar(amount):
    if str(amount) == 'nan':
        return np.nan
    else:        
        return float(amount.replace(',', '')[1:]) 

In [33]:
movies['gross_us_canada'] = movies['gross_us_canada'].apply(remove_dolar)
movies['gross_us_canada']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
17176   NaN
17177   NaN
17178   NaN
17179   NaN
17180   NaN
Name: gross_us_canada, Length: 17127, dtype: float64

In [34]:
movies['opening_us_canada'] = movies['opening_us_canada'].apply(remove_dolar)
movies['opening_us_canada']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
17176   NaN
17177   NaN
17178   NaN
17179   NaN
17180   NaN
Name: opening_us_canada, Length: 17127, dtype: float64

In [35]:
movies['gross_world'] = movies['gross_world'].apply(remove_dolar)
movies['gross_world']

0              NaN
1              NaN
2              NaN
3              NaN
4              NaN
           ...    
17176    4540085.0
17177          NaN
17178          NaN
17179    4408165.0
17180          NaN
Name: gross_world, Length: 17127, dtype: float64

## `budget`
### Normalización `budget` a dólares
Son necesarios varios procesos para tener un valor válido de la variable `budget`:
- Separar, con regex, el texto que corresponde a la moneda y el que corresponde a la cantidad (ej. '€195,000'). Estos dos textos serán dos nuevas columnas `budgetCurrency` y `budgetAmount`
- Crear un diccionario con la correspondencia del código usado para las monedas en IMDb y el código ISO
- Crear un archivo externo que contenga las tasas de cambio para cada moneda y año
- Crear una nueva columna `budgetInDolars` con el valor ya normalizado a dólares

In [36]:
import re

def clean_amount_currency(raw_amount):
    
    if str(raw_amount) == 'nan' :
        return (np.nan, np.nan)
    
    # Diccionario con correspondencia del símbolo de moneda de IMDb y el ISO de monedas
    # Monedas de 2019 sin tasas de cambio: 'LVL'
    currency_to_iso = {'$':'USA', 'A$':'AUS', 'ZAR':'ZAF', '₹':'IND', '€':'EU27_2020',
                       'RUR':'RUS', 'R$':'BRA', 'CN¥':'CHN', 'TRL':'TUR', 'CA$':'CAN',
                       'NPR':'NPR', '£':'GBR', '¥':'JPN', 'HUF':'HUN', 'HRK':'HRV',
                       'NOK':'NOR', 'MNT':'MNT', 'CHF':'CHE', 'MX$':'MEX', 'UAH':'UAH',
                       'HK$':'HKG', 'PHP':'PHP', 'PLN':'POL', 'EGP':'EGP' , 'IRR':'IRR',
                       'NZ$':'NZL', 'AED':'AED', 'DKK':'DNK', 'NGN':'NGN', 'SEK':'SWE',
                       'BDT':'BDT', 'MYR':'MYR', 'CZK':'CZE', 'ARS':'ARG', 'PKR':'PKR',
                       'GEL':'GEL', 'NT$':'TWD', '₪':'ISR', '₫':'VND', '₩':'KOR', 'THB':'THB',
                       'RON':'ROU', 'PTE':'PRT', 'TTD':'TTD', 'MVR':'MVR', 'JOD':'JOD',
                       'ISK':'ISL', 'COP':'COL', 'LKR':'LKR', 'BSD':'BSD', 'BGL':'BGR',
                       'KZT':'KZT', 'KES':'KES', 'TZS':'TZS', 'DOP':'DOP', 'TJS':'TJS',
                       'TND':'TND', 'MUR':'MUR', 'SGD':'SGD', 'MOP':'MOP', 'IDR':'IDN',
                       'ETB':'ETB', 'CUP':'CUP', 'BYR':'BYR', 'MMK':'MMK', 'NAD':'NAD',
                       'PEN':'PER', 'MKD':'MKD', 'UGX':'UGX', 'BAM':'BAM', 'ALL':'ALL',
                       'KGS':'KGS', 'GHC':'GHS'
                      }
    
    raw_amount = raw_amount.replace(',', '')
    
    for item in re.finditer(r"^(\D+)(\d+)$", raw_amount):
        try:
            currency = currency_to_iso[item.group(1).strip()]
        except:
            currency = str(np.nan)
        amount = float(item.group(2))

    return(currency, amount)



# Dividir la info de budget en dos nuevas columnas (cantidad y moneda)
movies['budgetCurrency'], movies['budgetAmount'] = zip(*movies['budget'].map(clean_amount_currency))
movies[['title', 'budget', 'budgetAmount', 'budgetCurrency']].dropna()

# importar archivo csv de tasas de cambio
exchange_rates=pd.read_csv('../data/OECD/DP_LIVE_16072021155836489.csv')

# Seleccionar las tasas de cambio para el año del dataset
exchange_rates_anno = exchange_rates[exchange_rates.TIME == anno]

# Merge tasas de cambio con df movies, usando como primary key el código ISO de la moneda
movies = pd.merge(movies, exchange_rates_anno[['LOCATION','Value']], how='left', left_on='budgetCurrency', right_on='LOCATION')
movies['budgetInDollars'] = movies['budgetAmount'] / movies['Value']

movies.head()

Unnamed: 0,imdb_id,title,original_title,year,certificate,duration,directors,writers,stars,genres,...,awards,budget,gross_us_canada,opening_us_canada,gross_world,budgetCurrency,budgetAmount,LOCATION,Value,budgetInDollars
0,tt0011216,La fête espagnole,,2019.0,,,Germaine Dulac,Louis Delluc,Louis Delluc,['Drama'],...,,,,,,,,,,
1,tt0011801,Tötet nicht mehr,,2019.0,,,Lupu Pick,"Gerhard Lamprecht,Lupu Pick","Gerhard Lamprecht,Lupu Pick","['Action', 'Crime']",...,,,,,,,,,,
2,tt0116991,Mariette in Ecstasy,,2019.0,PG-13,1h 41min,John Bailey,Ron Hansen,Ron Hansen,['Drama'],...,,,,,,,,,,
3,tt0170155,A Letter from Beirut,,2019.0,,,Jocelyn Saab,,,['Documentary'],...,,,,,,,,,,
4,tt0195933,Mysteries,,2019.0,,,Gregory J. Markopoulos,,,,...,,,,,,,,,,


---
# Merge de tablas


In [37]:
imdb_merged = pd.merge(title_basics[(title_basics.titleType=='movie') & (title_basics.startYear==anno)] , title_ratings, on='tconst', how='left')
imdb_merged

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0011216,movie,Spanish Fiesta,La fête espagnole,0.0,2019.0,,67.0,Drama,6.9,21.0
1,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0.0,2019.0,,,"Action,Crime",,
2,tt0116991,movie,Mariette in Ecstasy,Mariette in Ecstasy,0.0,2019.0,,101.0,Drama,7.6,47.0
3,tt0170155,movie,A Letter from Beirut,A Letter from Beirut,0.0,2019.0,,50.0,Documentary,6.8,9.0
4,tt0195933,movie,Mysteries,Mysteries,0.0,2019.0,,,,7.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...
17166,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0.0,2019.0,,97.0,"Comedy,Drama,Fantasy",7.3,6.0
17167,tt9916160,movie,Drømmeland,Drømmeland,0.0,2019.0,,72.0,Documentary,6.3,44.0
17168,tt9916170,movie,The Rehearsal,O Ensaio,0.0,2019.0,,51.0,Drama,7.2,5.0
17169,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,0.0,2019.0,,,"Adventure,History,War",3.8,12.0


### Renombrar y seleccionar columnas de ambos datasets

In [38]:
imdb_merged.rename(columns={'primaryTitle': 'englishTitle',
                            'tconst': 'imdbId',
                            'averageRating': 'ratingImdb'
                          }, inplace=True)

In [39]:
imdb_merged.columns

Index(['imdbId', 'titleType', 'englishTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres', 'ratingImdb',
       'numVotes'],
      dtype='object')

In [40]:
movies.rename(columns={'imdb_id': 'imdbId',
                       'title': 'spanishTitle',
                       'budget': 'budgetString',
                       'budgetInDollars': 'budget',
                       'gross_us_canada': 'grossUsCanada',
                       'opening_us_canada': 'openingUsCanada',
                       'gross_world': 'grossWorld',
                      }, inplace=True)

In [41]:
movies.columns

Index(['imdbId', 'spanishTitle', 'original_title', 'year', 'certificate',
       'duration', 'directors', 'writers', 'stars', 'genres', 'countries',
       'companies', 'imdb_rating', 'metascore', 'popularity', 'awards',
       'budgetString', 'grossUsCanada', 'openingUsCanada', 'grossWorld',
       'budgetCurrency', 'budgetAmount', 'LOCATION', 'Value', 'budget'],
      dtype='object')

In [42]:
# Selección de columnas de imdb
imdb_merged = imdb_merged[['imdbId', 'englishTitle', 'originalTitle', 'isAdult', 'runtimeMinutes', 'genres', 'ratingImdb', 'numVotes']]

In [43]:
# Selección de columnas del escrapeado
movies = movies[['imdbId', 'spanishTitle', 'year', 'certificate',
                'directors', 'writers', 'stars', 'countries', 'companies',
                'metascore', 'awards', 'budget', 'grossUsCanada',
                'openingUsCanada', 'grossWorld']]

### Merge final para este año

In [44]:
movies_merged = pd.merge(imdb_merged, movies, on='imdbId', how='inner')
movies_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17117 entries, 0 to 17116
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   imdbId           17117 non-null  object 
 1   englishTitle     17117 non-null  object 
 2   originalTitle    17117 non-null  object 
 3   isAdult          17117 non-null  float64
 4   runtimeMinutes   13390 non-null  float64
 5   genres           16363 non-null  object 
 6   ratingImdb       9383 non-null   float64
 7   numVotes         9383 non-null   float64
 8   spanishTitle     17117 non-null  object 
 9   year             17117 non-null  float64
 10  certificate      2357 non-null   object 
 11  directors        16455 non-null  object 
 12  writers          15070 non-null  object 
 13  stars            15070 non-null  object 
 14  countries        16600 non-null  object 
 15  companies        12518 non-null  object 
 16  metascore        813 non-null    float64
 17  awards      

In [45]:
# Reordenar columnas

movies_merged = movies_merged[['imdbId', 'year', 'spanishTitle', 'originalTitle',
                               'englishTitle', 'ratingImdb', 'numVotes', 'metascore',
                               'isAdult', 'certificate', 'runtimeMinutes', 'genres', 'directors',
                               'writers', 'stars', 'countries', 'companies', 'awards',
                               'budget', 'grossUsCanada', 'openingUsCanada', 'grossWorld']]

In [46]:
movies_merged

Unnamed: 0,imdbId,year,spanishTitle,originalTitle,englishTitle,ratingImdb,numVotes,metascore,isAdult,certificate,...,directors,writers,stars,countries,companies,awards,budget,grossUsCanada,openingUsCanada,grossWorld
0,tt0011216,2019.0,La fête espagnole,La fête espagnole,Spanish Fiesta,6.9,21.0,,0.0,,...,Germaine Dulac,Louis Delluc,Louis Delluc,France,Les Films Louis Nalpas,,,,,
1,tt0011801,2019.0,Tötet nicht mehr,Tötet nicht mehr,Tötet nicht mehr,,,,0.0,,...,Lupu Pick,"Gerhard Lamprecht,Lupu Pick","Gerhard Lamprecht,Lupu Pick",Germany,Rex-Film GmbH,,,,,
2,tt0116991,2019.0,Mariette in Ecstasy,Mariette in Ecstasy,Mariette in Ecstasy,7.6,47.0,,0.0,PG-13,...,John Bailey,Ron Hansen,Ron Hansen,United States,"Price Entertainment,Savoy Pictures",,,,,
3,tt0170155,2019.0,A Letter from Beirut,A Letter from Beirut,A Letter from Beirut,6.8,9.0,,0.0,,...,Jocelyn Saab,,,France,France Régions 3 (FR3),,,,,
4,tt0195933,2019.0,Mysteries,Mysteries,Mysteries,7.0,6.0,,0.0,,...,Gregory J. Markopoulos,,,United States,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17112,tt9915872,2019.0,My Girlfriend is a Wizard,My Girlfriend is a Wizard,The Last White Witch,7.3,6.0,,0.0,,...,Hideki Kiyota,Ryuho Okawa,Ryuho Okawa,Japan,,,,,,4540085.0
17113,tt9916160,2019.0,Drømmeland,Drømmeland,Drømmeland,6.3,44.0,,0.0,,...,Joost van der Wiel,Nils Leidal,Nils Leidal,Netherlands,Conijn Film,,2.182976e+05,,,
17114,tt9916170,2019.0,O Ensaio,O Ensaio,The Rehearsal,7.2,5.0,,0.0,,...,Tamar Guimaraes,"Tamar Guimaraes,Lillah Halla,Melissa de Raaf","Tamar Guimaraes,Lillah Halla,Melissa de Raaf","Brazil,Denmark",,,,,,
17115,tt9916428,2019.0,Hong xing zhao yao Zhong guo,Hong xing zhao yao Zhong guo,The Secret of China,3.8,12.0,,0.0,,...,Jixing Wang,"Kenan Heppe,Wang Peng Kai,Valery Gadreau","Kenan Heppe,Wang Peng Kai,Valery Gadreau",China,"Emei Film Group,Greenland Holding Group,Propag...",Awards\n2 nominations,1.000000e+07,,,4408165.0


## Feature engineering. Columnas `profit` y `roi`
Los estadísticos están distorsionados por los valores nulos que se eliminarán después

In [47]:
movies_merged['profit'] = movies_merged.grossWorld - movies_merged.budget
movies_merged['profit'].describe()

count    5.100000e+02
mean     4.194192e+07
std      1.826747e+08
min     -1.580311e+08
25%     -2.509620e+06
50%     -4.426929e+05
75%      4.797220e+06
max      2.441501e+09
Name: profit, dtype: float64

In [48]:
movies_merged['roi'] = (movies_merged.grossWorld - movies_merged.budget) / movies_merged.budget
movies_merged['roi'].describe()

count       510.000000
mean       1703.928583
std       18051.958010
min          -0.999848
25%          -0.963720
50%          -0.667550
75%           1.187220
max      241889.208040
Name: roi, dtype: float64

---
# Inspección inicial de correlaciones
## Registros no nulos con los que hacer las correlaciones de valoraciones y recaudación
Valoraciones: imdb_rating, metascore

Recaudación: budget, gross_world


In [49]:
print('Registros con ratingImdb:', len(movies_merged[movies_merged.ratingImdb.notnull()]))
print('Registros con metascore:', len(movies_merged[movies_merged.metascore.notnull()]))

print('Registros con budget:', len(movies_merged[movies_merged.budget.notnull()]))
print('Registros con grossUsCanada:', len(movies_merged[movies_merged.grossUsCanada.notnull()]))
print('Registros con openingUsCanada:', len(movies_merged[movies_merged.openingUsCanada.notnull()]))
print('Registros con grossWorld:', len(movies_merged[movies_merged.grossWorld.notnull()]))

print('\nRegistros con las rating, metascore, budget, grossWorld:',len(movies_merged[movies_merged.budget.notnull() & movies_merged.grossWorld.notnull() & movies_merged.ratingImdb.notnull() & movies_merged.metascore.notnull()]))

Registros con ratingImdb: 9383
Registros con metascore: 813
Registros con budget: 4654
Registros con grossUsCanada: 545
Registros con openingUsCanada: 473
Registros con grossWorld: 2972

Registros con las rating, metascore, budget, grossWorld: 187


In [50]:
movies_merged = movies_merged[movies_merged.budget.notnull() & movies_merged.grossWorld.notnull() & movies_merged.ratingImdb.notnull() & movies_merged.metascore.notnull()]
movies_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 187 entries, 8 to 16845
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   imdbId           187 non-null    object 
 1   year             187 non-null    float64
 2   spanishTitle     187 non-null    object 
 3   originalTitle    187 non-null    object 
 4   englishTitle     187 non-null    object 
 5   ratingImdb       187 non-null    float64
 6   numVotes         187 non-null    float64
 7   metascore        187 non-null    float64
 8   isAdult          187 non-null    float64
 9   certificate      179 non-null    object 
 10  runtimeMinutes   187 non-null    float64
 11  genres           187 non-null    object 
 12  directors        187 non-null    object 
 13  writers          187 non-null    object 
 14  stars            187 non-null    object 
 15  countries        187 non-null    object 
 16  companies        187 non-null    object 
 17  awards        

## Correlaciones de valoraciones y recaudación

In [51]:
movies_merged.describe().round(2)

Unnamed: 0,year,ratingImdb,numVotes,metascore,isAdult,runtimeMinutes,budget,grossUsCanada,openingUsCanada,grossWorld,profit,roi
count,187.0,187.0,187.0,187.0,187.0,187.0,187.0,150.0,148.0,187.0,187.0,187.0
mean,2019.0,6.38,87106.81,55.74,0.0,110.97,42097510.0,72224630.0,23007990.0,155578300.0,113480800.0,2.28
std,0.0,0.94,145116.19,16.39,0.0,19.71,58825610.0,121009100.0,42509340.0,332239900.0,287416500.0,4.23
min,2019.0,2.8,49.0,10.0,0.0,77.0,150000.0,1460.0,452.0,2627.0,-158031100.0,-1.0
25%,2019.0,5.8,8991.5,45.5,0.0,97.0,6000000.0,10261640.0,2845758.0,2947860.0,-2751165.0,-0.63
50%,2019.0,6.4,32256.0,55.0,0.0,108.0,18000000.0,32754880.0,9618864.0,43347020.0,18386560.0,0.88
75%,2019.0,6.9,89721.5,67.0,0.0,122.0,48000000.0,72099920.0,24808510.0,147227000.0,92645010.0,3.49
max,2019.0,8.6,1023267.0,96.0,0.0,209.0,356000000.0,858373000.0,357115000.0,2797501000.0,2441501000.0,21.71


In [52]:
method = 'pearson'

print(anno, 'Corr Método:', method)

movies_merged[['ratingImdb', 'metascore', 'budget', 'grossWorld', 'profit', 'roi']].corr(method = method)

2019 Corr Método: pearson


Unnamed: 0,ratingImdb,metascore,budget,grossWorld,profit,roi
ratingImdb,1.0,0.641476,0.197698,0.304915,0.312004,0.292859
metascore,0.641476,1.0,0.092117,0.174454,0.182807,0.15386
budget,0.197698,0.092117,1.0,0.799099,0.719051,0.070679
grossWorld,0.304915,0.174454,0.799099,1.0,0.992401,0.372343
profit,0.312004,0.182807,0.719051,0.992401,1.0,0.415945
roi,0.292859,0.15386,0.070679,0.372343,0.415945,1.0


In [53]:
method = 'kendall'

print(anno, 'Corr Método:', method)

movies_merged[['ratingImdb', 'metascore', 'budget', 'grossWorld', 'profit', 'roi']].corr(method = method)

2019 Corr Método: kendall


Unnamed: 0,ratingImdb,metascore,budget,grossWorld,profit,roi
ratingImdb,1.0,0.48368,0.130944,0.210958,0.229282,0.186215
metascore,0.48368,1.0,0.003558,0.080311,0.135746,0.077528
budget,0.130944,0.003558,1.0,0.585153,0.305473,0.217104
grossWorld,0.210958,0.080311,0.585153,1.0,0.721925,0.634064
profit,0.229282,0.135746,0.305473,0.721925,1.0,0.634064
roi,0.186215,0.077528,0.217104,0.634064,0.634064,1.0


In [54]:
method = 'spearman'

print(anno, 'Corr Método:', method)

movies_merged[['ratingImdb', 'metascore', 'budget', 'grossWorld', 'profit', 'roi']].corr(method = method)

2019 Corr Método: spearman


Unnamed: 0,ratingImdb,metascore,budget,grossWorld,profit,roi
ratingImdb,1.0,0.648677,0.191879,0.303084,0.32692,0.266564
metascore,0.648677,1.0,0.003105,0.112071,0.197657,0.108703
budget,0.191879,0.003105,1.0,0.771022,0.457596,0.32404
grossWorld,0.303084,0.112071,0.771022,1.0,0.881336,0.814515
profit,0.32692,0.197657,0.457596,0.881336,1.0,0.84881
roi,0.266564,0.108703,0.32404,0.814515,0.84881,1.0


In [55]:
movies_merged[['ratingImdb', 'metascore', 'budget', 'grossUsCanada', 'openingUsCanada', 'grossWorld', 'profit', 'roi']].corr()

Unnamed: 0,ratingImdb,metascore,budget,grossUsCanada,openingUsCanada,grossWorld,profit,roi
ratingImdb,1.0,0.641476,0.197698,0.323907,0.246191,0.304915,0.312004,0.292859
metascore,0.641476,1.0,0.092117,0.234007,0.162839,0.174454,0.182807,0.15386
budget,0.197698,0.092117,1.0,0.782792,0.784216,0.799099,0.719051,0.070679
grossUsCanada,0.323907,0.234007,0.782792,1.0,0.965799,0.963722,0.955422,0.294598
openingUsCanada,0.246191,0.162839,0.784216,0.965799,1.0,0.945441,0.933937,0.243365
grossWorld,0.304915,0.174454,0.799099,0.963722,0.945441,1.0,0.992401,0.372343
profit,0.312004,0.182807,0.719051,0.955422,0.933937,0.992401,1.0,0.415945
roi,0.292859,0.15386,0.070679,0.294598,0.243365,0.372343,0.415945,1.0


---
# Guardado dataset a pickle

In [56]:
import pickle

with open('../data/web_imdb_clean/movies_2019.pickle', 'wb') as f:
    pickle.dump(movies_merged, f)