
<img src="img/viu_logo.png" width="200">

## 01MIAR - Procesamiento de Datos

<img src="img/This-is-fine.jpg" width="300">

*Ivan Fuertes*

*Benjamin Arroquia Cuadros*

In [2]:
import numpy as np
import pandas as pd
import zipfile as zp # para descomprimir archivos zip
import urllib.request # para descargar de URL
import os

### Uniendo datasets con 'join' y 'merge'
- merge() == join()
 - 'join' utiliza por defecto los índices para unir
- Utilizando el parámetro 'on'
 - Si las columnas difieren, 'left_on' y 'right_on'
- Diagrama de Venn para unión de tablas:

<img src="img/venn_diagram.png" width="500">

### Combinar varios datasets 
- En base a un elemento en común (índice)
- MovieLens 'UserId'

In [4]:

# descargar MovieLens dataset
# url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'  
local_zip = os.path.join("data", "ml-1m.zip")
# urllib.request.urlretrieve(url, local_zip)
# descomprimiendo archivo zip
with zp.ZipFile(local_zip, 'r') as zipp: 
    print('Extracting all files...') 
    zipp.extractall(os.path.join("res")) # destino
    print('Done!') 

Extracting all files...
Done!


In [5]:
ruta_users = os.path.join("res", "ml-1m", "users.dat")
ruta_ratings = os.path.join("res", "ml-1m", "ratings.dat")
ruta_movies = os.path.join("res", "ml-1m", "movies.dat")

users_dataset = pd.read_csv(ruta_users, sep='::', index_col=0,
    header=None, names=['UserID','Gender','Age','Occupation','Zip-code'], engine='python', encoding="ISO-8859-1")

ratings_dataset = pd.read_csv(ruta_ratings, sep='::', index_col=0, 
    header=None, names=['UserID','MovieID','Rating','Timestamp'], engine='python', encoding="ISO-8859-1")

movies_dataset = pd.read_csv(ruta_movies, sep='::', index_col=0, 
    header=None, names=['MovieID','Title','Genre'], engine='python', encoding="ISO-8859-1")

In [6]:
display(users_dataset.sample(5))

Unnamed: 0_level_0,Gender,Age,Occupation,Zip-code
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5358,M,35,7,10625
3834,M,18,2,2322
5187,M,18,16,6510
3982,M,56,13,95929
326,M,50,11,25302


In [7]:
display(ratings_dataset.sample(5))

Unnamed: 0_level_0,MovieID,Rating,Timestamp
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4218,588,4,965317289
5198,141,2,961687691
3118,3198,3,969408454
5011,44,4,962637031
839,1288,4,980554888


In [8]:
movies_dataset

Unnamed: 0_level_0,Title,Genre
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy
...,...,...
3948,Meet the Parents (2000),Comedy
3949,Requiem for a Dream (2000),Drama
3950,Tigerland (2000),Drama
3951,Two Family House (2000),Drama


In [17]:
movies_dataset.loc[movies_dataset.index == 3536] # devuelve un dataFrame pandas.core.frame.DataFrame
# se trabajan mas con DataFrame

Unnamed: 0_level_0,Title,Genre
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
3536,Keeping the Faith (2000),Comedy|Romance


In [15]:
movies_dataset.loc[3536]
# type(movies_dataset.loc[3536]) # Devuelve un tipo series pandas.core.series.Series

Title    Keeping the Faith (2000)
Genre              Comedy|Romance
Name: 3536, dtype: object

In [None]:
# 
# movies_dataset.loc[movies_dataset.index == 3536, :]

# movies_dataset[movies_dataset.index == 3536]
# type(movies_dataset.loc[movies_dataset.index == 3536, :])
# movies_dataset.loc[movies_dataset.index == 3536, :]


In [9]:
display(users_dataset.sample(5))
display(ratings_dataset.sample(5))

Unnamed: 0_level_0,Gender,Age,Occupation,Zip-code
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3818,M,45,17,32837
3780,M,1,0,46979
1085,M,25,7,34695
5525,F,1,10,55311
1772,M,56,2,90680


Unnamed: 0_level_0,MovieID,Rating,Timestamp
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1757,2028,5,974708443
1061,596,5,974950038
2102,852,3,974654205
5791,1571,3,958091132
2248,1994,3,974595895


In [18]:
users_dataset.shape, ratings_dataset.shape # tamaño, numero de registros

((6040, 4), (1000209, 3))

In [19]:
# combinando users y ratings, ¿Cómo?. right, queremos que a la derecha este ratings_dataset
combined_dataset = users_dataset.merge(right=ratings_dataset, on='UserID', how='inner') # parametro 'on' define la columna pivote
display(combined_dataset.head(5))
print(len(combined_dataset))

Unnamed: 0_level_0,Gender,Age,Occupation,Zip-code,MovieID,Rating,Timestamp
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,F,1,10,48067,1193,5,978300760
1,F,1,10,48067,661,3,978302109
1,F,1,10,48067,914,3,978301968
1,F,1,10,48067,3408,4,978300275
1,F,1,10,48067,2355,5,978824291


1000209


In [20]:
combined_dataset = users_dataset.merge(right=ratings_dataset, left_index=True, right_index=True, how='inner') # parametro 'on' define la columna pivote
display(combined_dataset.head(5))
print(len(combined_dataset))

Unnamed: 0_level_0,Gender,Age,Occupation,Zip-code,MovieID,Rating,Timestamp
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,F,1,10,48067,1193,5,978300760
1,F,1,10,48067,661,3,978302109
1,F,1,10,48067,914,3,978301968
1,F,1,10,48067,3408,4,978300275
1,F,1,10,48067,2355,5,978824291


1000209


In [21]:
# combinando movies y el resto
all_dataset = combined_dataset.merge(movies_dataset, on='MovieID', how='inner')
display(all_dataset.head(5))

Unnamed: 0,Gender,Age,Occupation,Zip-code,MovieID,Rating,Timestamp,Title,Genre
0,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


In [22]:
all_dataset["Age"].count() # si sale el total de los registros significa que no ay nulos

1000209

### Concatenate
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html

## Pivot
- Representar los datos en función a varios parámetros, agregando
```python
pivot_table(<lista de valores>, index=<agregador primario>, columns=<agregador secundario>)
```
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.pivot_table.html
- https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html

In [23]:
# display(all_dataset.pivot_table('Rating', index='Gender', columns='Age'))
# display(all_dataset.pivot_table('Rating', index='Gender', columns='Age', aggfunc='count'))
display(all_dataset.pivot_table('Rating', index='Age', columns='Gender', aggfunc=['count', max,np.mean]))
# pivot_table similar a group by pero con mas caracteristicas
# dame por edad la media de cada campo

Unnamed: 0_level_0,count,count,max,max,mean,mean
Gender,F,M,F,M,F,M
Age,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,8827,18384,5,5,3.616291,3.517461
18,45427,138109,5,5,3.453145,3.525476
25,91340,304216,5,5,3.6067,3.52678
35,49473,149530,5,5,3.659653,3.604434
45,24110,59523,5,5,3.663044,3.627942
50,18064,54426,5,5,3.79711,3.687098
56,9199,29581,5,5,3.915534,3.720327


## Agrupaciones
- agg -> funciones estadísticas de agregación
- Series.unique() -> valores únicos
- pd.value_counts -> ocurrencias

## Manipulación de strings
```python
split(): separar en bloques en función de un carácter
replace(): reemplazar un carácter por otro
index(): encontrar la posición de un carácter
```

In [24]:
# Ejemplo con MovieLens: Genre
## 1: obtener todos los géneros por separado
## 2: crear un dataset de géneros
## 3: por película, marcar género por separado
## 4: unir con dataset original
display(movies_dataset.head(3))

Unnamed: 0_level_0,Title,Genre
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance


In [25]:
all_genres = movies_dataset['Genre'].apply(lambda x : x.split('|')) # obtengo todos los generos de una columna

# print(all_genres)
# print([genre for x in all_genres for genre in x])
# genres = pd.unique([genre for movie in all_genres for genre in movie])
# la suma aplica a al dato tipo lista
genres = pd.unique(all_genres.sum())
print(genres)

['Animation' "Children's" 'Comedy' 'Adventure' 'Fantasy' 'Romance' 'Drama'
 'Action' 'Crime' 'Thriller' 'Horror' 'Sci-Fi' 'Documentary' 'War'
 'Musical' 'Mystery' 'Film-Noir' 'Western']


In [26]:
# crear tabla con columnas por género
zeros = np.zeros( (len(movies_dataset), len(genres)) )
genres_frame = pd.DataFrame(zeros, columns=genres)
display(genres_frame.head(3))

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Vamos a calcular un One Hot Encoding

In [29]:
%%time
# Otra forma de hacerlo con loc
for i, genre in enumerate(movies_dataset['Genre']):
    genres_frame.loc[i, genre.split('|')] = 1 # coloca 1 en las columas de generos si en la col genero se encuentra el genero separado por |
movies_split_genre = movies_dataset.join(genres_frame) # trae los nombres de las peliculas

CPU times: user 2.48 s, sys: 527 µs, total: 2.48 s
Wall time: 2.48 s


In [30]:
movies_split_genre

Unnamed: 0_level_0,Title,Genre,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,Toy Story (1995),Animation|Children's|Comedy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Father of the Bride Part II (1995),Comedy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,Meet the Parents (2000),Comedy,,,,,,,,,,,,,,,,,,
3949,Requiem for a Dream (2000),Drama,,,,,,,,,,,,,,,,,,
3950,Tigerland (2000),Drama,,,,,,,,,,,,,,,,,,
3951,Two Family House (2000),Drama,,,,,,,,,,,,,,,,,,


In [31]:
%%time
columns_genres = genres_frame.columns # lista de generos (columnas)
# para cada película, marcar género con 1
for i, genre in enumerate(movies_dataset['Genre']):
    inds = columns_genres.get_indexer(genre.split('|')) # retorna los indices correspondientes a los generos de cada pelicula
    genres_frame.iloc[i,inds] = 1 # localiza las columnas del genero correspondiente, marca con 1
# display(genres_frame.head(5))
movies_split_genre = movies_dataset.join(genres_frame)

CPU times: user 873 ms, sys: 0 ns, total: 873 ms
Wall time: 872 ms


In [32]:
%%time
movies_dataset["genero_ls"] = movies_dataset['Genre'].apply(lambda x : x.split('|'))
df_unstacked = movies_dataset.loc[:, ["genero_ls"]]\
                            .assign(gen=1)\
                            .explode("genero_ls")\
                            .reset_index(drop=False)\
                            .set_index(keys=["MovieID", "genero_ls"])\
                            .unstack("genero_ls", fill_value=0)\
                            .droplevel(0, axis=1)

pd.concat([movies_dataset.loc[:, ["Title"]], df_unstacked], axis=1).head()
df_unstacked
movies_dataset.loc[:, ["Title"]]

CPU times: user 22.2 ms, sys: 0 ns, total: 22.2 ms
Wall time: 28.2 ms


Unnamed: 0_level_0,Title
MovieID,Unnamed: 1_level_1
1,Toy Story (1995)
2,Jumanji (1995)
3,Grumpier Old Men (1995)
4,Waiting to Exhale (1995)
5,Father of the Bride Part II (1995)
...,...
3948,Meet the Parents (2000)
3949,Requiem for a Dream (2000)
3950,Tigerland (2000)
3951,Two Family House (2000)


In [34]:
# O utilizar ScikitLearn
# pip install -U scikit-learn
from sklearn.preprocessing import MultiLabelBinarizer

In [35]:
%%timeit
movies_dataset["genero_ls"] = movies_dataset['Genre'].apply(lambda x : x.split('|'))
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(movies_dataset["genero_ls"])
movies_dataset.join(pd.DataFrame(X, columns=mlb.classes_))

6.5 ms ± 343 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [36]:
display(movies_split_genre.head(5))

Unnamed: 0_level_0,Title,Genre,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,Toy Story (1995),Animation|Children's|Comedy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Father of the Bride Part II (1995),Comedy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Replace e index para extraer el año de la película

In [37]:
movies_dataset = pd.read_csv(ruta_movies, sep='::', index_col=0, # quito el año
    header=None, names=['MovieID','Title','Genre'], engine='python', encoding="ISO-8859-1")
display(movies_dataset.sample(5))

Unnamed: 0_level_0,Title,Genre
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
1119,Drunks (1997),Drama
3551,Marathon Man (1976),Thriller
510,Poetic Justice (1993),Drama
1517,Austin Powers: International Man of Mystery (1...,Comedy
746,Force of Evil (1948),Film-Noir


In [38]:
display(movies_dataset.head(2))

Unnamed: 0_level_0,Title,Genre
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy


In [39]:
# extraer el año de la columna Title
def split_year(title):
    index = title.index('(')
    return title[index:].replace('(','').replace(')','')
    
# crear nueva columna Year
movies_dataset['Year'] = movies_dataset['Title'].apply(split_year)
display(movies_dataset.sample(2))
movies_dataset["Year"].unique()

Unnamed: 0_level_0,Title,Genre,Year
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2618,"Castle, The (1997)",Comedy,1997
1034,Freeway (1996),Crime,1996


array(['1995', 'Yao a yao yao dao waipo qiao 1995', 'Se7en 1995', '1994',
       'The Postman 1994', 'Le Confessionnal 1995', '1996',
       'Gazon maudit 1995', 'Badkonake Sefid  1995', 'Antonia 1995',
       'Haine, La 1995', 'Keiner liebt mich 1994', '1976', '1993', '1992',
       "Uomo delle stelle, L' 1995", 'Saimt el Qusur 1994', 'Cienie 1988',
       '1967', 'Parapluies de Cherbourg, Les 1964',
       'Utomlyonnye solntsem 1994', 'Pred dozhdot 1994', '1977',
       'Como agua para chocolate 1992', 'Mi vida loca 1993',
       'a.k.a. Leon: The Professional 1994', 'La Reine Margot 1994',
       'Pret-A-Porter 1994', 'Fresa y chocolate 1993', 'Huozhe 1994',
       'Die Macht der Bilder 1993', '1965', 'Ai no corrida 1976',
       'Nuits fauves, Les 1992', '1982', '1962', 'Bulletproof Heart 1994',
       'Kådisbellan  1993', 'Io speriamo che me la cavo  1993',
       'Café au Lait 1993', 'Caro Diario 1994', 'De eso no se habla 1993',
       '1990', '1991', '1989', '1937', '1940', '19

In [40]:
movies_dataset = pd.read_csv(ruta_movies, sep='::', index_col=0, 
    header=None, names=['MovieID','Title','Genre'], engine='python', encoding="ISO-8859-1")
display(movies_dataset.sample(5))
# eliminar el año de la columna Title
def remove_year(title):
    index = title.index('(')
    return title[:index-1].strip()

movies_dataset['Title'] = movies_dataset['Title'].apply(remove_year)
display(movies_dataset.head(2))


Unnamed: 0_level_0,Title,Genre
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
303,"Quick and the Dead, The (1995)",Action|Adventure|Western
1132,Manon of the Spring (Manon des sources) (1986),Drama
244,Gumby: The Movie (1995),Animation|Children's
2028,Saving Private Ryan (1998),Action|Drama|War
3261,Singles (1992),Comedy|Drama|Romance


Unnamed: 0_level_0,Title,Genre
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story,Animation|Children's|Comedy
2,Jumanji,Adventure|Children's|Fantasy


#### Expresiones regulares
https://docs.python.org/3/library/re.html

- import re

In [41]:
users_dataset.sample(10)

Unnamed: 0_level_0,Gender,Age,Occupation,Zip-code
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1947,M,25,7,94041
3398,M,18,20,30033
2642,M,25,0,33073
2812,M,25,17,60607
5848,M,50,20,20009
3846,M,35,0,33884
56,M,35,20,60440
4843,M,56,1,4032
4603,F,35,2,33619
1798,M,45,11,22204


In [42]:
# ¿Cómo localizar que 'Zip-code' tiene un formato erróneo?
users_dataset.sample(5)

# users_dataset['Zip-code'].str.match('^[0-9]{5}$')
# capturo los valores enteros que empiezan en el string y se repiten 5 veces
display(users_dataset[users_dataset['Zip-code'].str.match('^\d{5}$') == False]) # 

# ^\d{5}$
# ^ = start of the string
# \d = decimal string
# {5} = 5 repeticiones de decimales
# $ = end of string

Unnamed: 0_level_0,Gender,Age,Occupation,Zip-code
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
161,M,45,16,98107-2117
233,F,45,20,37919-4204
293,M,56,1,55337-4056
458,M,50,16,55405-2546
506,M,25,16,55103-1006
...,...,...,...,...
5682,M,18,0,23455-4959
5904,F,45,12,954025
5925,F,25,0,90035-4444
5967,M,50,16,73069-5429


In [43]:
movies_dataset = pd.read_csv(ruta_movies, sep='::', index_col=0, 
    header=None, names=['MovieID','Title','Genre'], engine='python', encoding="ISO-8859-1")
display(movies_dataset.head(2))

Unnamed: 0_level_0,Title,Genre
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy


In [75]:
# ¿Cómo extraer el año con regex en el formato adecuado?
display(movies_dataset['Title'].str.extract('(\d{4})'))
# capturo los años que tienen 4 digitos
# (\d{4})
# (= busca apertura parentesis
# \d = decimal string
# {4} = 4 repeticiones de decimales
# ) = cierre de parentesis

Unnamed: 0_level_0,0
MovieID,Unnamed: 1_level_1
1,1995
2,1995
3,1995
4,1995
5,1995
...,...
3948,2000
3949,2000
3950,2000
3951,2000


## Operaciones con colecciones
```python
reduce: aplicar una operación y retornar un valor
map: aplicar  una operación y retornar una secuencia
filter: retorna una secuencia con elementos que cumplen una condición
```


## Reduce
- Aplicar una operación matemática a cada uno de los elementos de una colección
- Diferente de 'apply()' porque retorna un valor numérico
- Ejemplo: Detección de géneros en años específicos

https://docs.python.org/3/library/functools.html

In [45]:
from functools import reduce # necesario para reduce

lista = [1, 3, 5, 7, 9]
print(reduce(lambda x,y: x + y, lista))

25


In [46]:
movies_1975 = movies_split_genre[ movies_split_genre['Title'].str.contains('1975') ]
movies_1975.head(3)

Unnamed: 0_level_0,Title,Genre,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
716,Switchblade Sisters (1975),Crime,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1007,"Apple Dumpling Gang, The (1975)",Children's|Comedy|Western,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1009,Escape to Witch Mountain (1975),Adventure|Children's|Fantasy,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [47]:
any_drama = reduce(lambda x,y : bool(x) | bool(y),movies_1975['Drama']) # hay algún drama en 1975
print(any_drama)

all_comedy = reduce(lambda x,y : bool(x) & bool(y),movies_1975['Comedy']) # son todas las películas de 1975 comedias?
print(all_comedy)

True
False


In [48]:
print(movies_1975['Drama'].any()) # Comprueba si hay algún valor que puede cumplir  
print(movies_1975['Comedy'].all()) # Comprueba si todos los valores son True

True
False


In [49]:
# Observar el tipo de dato antes para ver si es posible aplicar las funciones
print(movies_1975.dtypes)
print(movies_1975['Comedy'].unique())

Title           object
Genre           object
Animation      float64
Children's     float64
Comedy         float64
Adventure      float64
Fantasy        float64
Romance        float64
Drama          float64
Action         float64
Crime          float64
Thriller       float64
Horror         float64
Sci-Fi         float64
Documentary    float64
War            float64
Musical        float64
Mystery        float64
Film-Noir      float64
Western        float64
dtype: object
[1. 0.]


## Filter
- retorna una secuencia con elementos que cumplen una condición
- Ejemplo: obtener las películas de 1975 que contienen 'The' en el título

In [51]:
filtro = filter(lambda x : 'The' in x, movies_1975['Title'])
list(filtro)
# ¿Están todos los títulos con "The"? si tiene mayúsculas o no...

['Apple Dumpling Gang, The (1975)',
 'Man Who Would Be King, The (1975)',
 'Stepford Wives, The (1975)',
 'Rocky Horror Picture Show, The (1975)',
 'McCullochs, The (1975)',
 'Mirror, The (Zerkalo) (1975)']

In [52]:
filtro = filter(lambda x : 'the' in x, movies_1975['Title'].str.lower())
list(filtro)

['apple dumpling gang, the (1975)',
 "one flew over the cuckoo's nest (1975)",
 'man who would be king, the (1975)',
 'stepford wives, the (1975)',
 'rocky horror picture show, the (1975)',
 'three days of the condor (1975)',
 'brother, can you spare a dime? (1975)',
 'mccullochs, the (1975)',
 'mirror, the (zerkalo) (1975)']

## Map
- aplicar  una operación y retornar una secuencia
- Cambiar el valor integral de la columna 'Comedy' por bool

In [53]:
mapa = map(lambda x : bool(x), movies_split_genre['Comedy'])
movies_split_genre.loc[:,'Comedy'] = list(mapa)
display(movies_split_genre.head(4))

Unnamed: 0_level_0,Title,Genre,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,Toy Story (1995),Animation|Children's|Comedy,0.0,1.0,False,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,0.0,True,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,True,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Transformación de variables (calidad de datos)
- Valores no definidos
- Valores duplicados
- Discretización (valores categóricos)

In [54]:
matrix = pd.DataFrame(np.random.randint(10,size=(5,10)))
matrix[matrix < 2] = np.nan
display(matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,5,7.0,7,3.0,7.0,,8,5.0,5.0,7.0
1,2,,8,,6.0,7.0,9,7.0,,
2,3,6.0,7,,2.0,8.0,9,5.0,8.0,3.0
3,8,,5,,4.0,7.0,3,9.0,2.0,
4,3,5.0,3,3.0,,,6,,6.0,6.0


In [55]:
# nulos por columna
print(matrix.isnull().sum(axis=0))
display(matrix.isna().sum())

0    0
1    2
2    0
3    3
4    1
5    2
6    0
7    1
8    1
9    2
dtype: int64


0    0
1    2
2    0
3    3
4    1
5    2
6    0
7    1
8    1
9    2
dtype: int64

In [56]:
# Cantidad valores nulos
print(matrix.isnull().sum(axis=1).sum())

12


In [57]:
# numero de no nulos por fila
print(matrix.count(axis=1))

0    9
1    6
2    9
3    7
4    7
dtype: int64


In [58]:
# Número de nulos por fila
print(matrix.shape[1] - matrix.count(axis=1))

0    1
1    4
2    1
3    3
4    3
dtype: int64


In [59]:
# Representación de las filas en las que una determinada columna tiene nulos
display(matrix[matrix[1].isnull()])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1,2,,8,,6.0,7.0,9,7.0,,
3,8,,5,,4.0,7.0,3,9.0,2.0,


In [60]:
# Conteo de valores que aparecen en el dataset
valores = [3, 9]
# Identificación de valores de dominio que se encuentran en un listado
display(matrix[matrix[6].isin(valores)])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1,2,,8,,6.0,7.0,9,7.0,,
2,3,6.0,7,,2.0,8.0,9,5.0,8.0,3.0
3,8,,5,,4.0,7.0,3,9.0,2.0,


In [61]:
display(matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,5,7.0,7,3.0,7.0,,8,5.0,5.0,7.0
1,2,,8,,6.0,7.0,9,7.0,,
2,3,6.0,7,,2.0,8.0,9,5.0,8.0,3.0
3,8,,5,,4.0,7.0,3,9.0,2.0,
4,3,5.0,3,3.0,,,6,,6.0,6.0


In [62]:
## Tratamiento de valores nulos
# eliminar
display(matrix.dropna(axis=1))

Unnamed: 0,0,2,6
0,5,7,8
1,2,8,9
2,3,7,9
3,8,5,3
4,3,3,6


In [63]:
# eliminar si no hay un número de valores no NaN
display(matrix)
display(matrix.dropna(thresh=8))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,5,7.0,7,3.0,7.0,,8,5.0,5.0,7.0
1,2,,8,,6.0,7.0,9,7.0,,
2,3,6.0,7,,2.0,8.0,9,5.0,8.0,3.0
3,8,,5,,4.0,7.0,3,9.0,2.0,
4,3,5.0,3,3.0,,,6,,6.0,6.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,5,7.0,7,3.0,7.0,,8,5.0,5.0,7.0
2,3,6.0,7,,2.0,8.0,9,5.0,8.0,3.0


In [64]:
# sustituir por un valor fijo
display(matrix.fillna(-1))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,5,7.0,7,3.0,7.0,-1.0,8,5.0,5.0,7.0
1,2,-1.0,8,-1.0,6.0,7.0,9,7.0,-1.0,-1.0
2,3,6.0,7,-1.0,2.0,8.0,9,5.0,8.0,3.0
3,8,-1.0,5,-1.0,4.0,7.0,3,9.0,2.0,-1.0
4,3,5.0,3,3.0,-1.0,-1.0,6,-1.0,6.0,6.0


In [65]:
# sustituir por valor dinámico (copia)
display(matrix)
display(matrix.fillna(method='ffill')) # bfill y ffill

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,5,7.0,7,3.0,7.0,,8,5.0,5.0,7.0
1,2,,8,,6.0,7.0,9,7.0,,
2,3,6.0,7,,2.0,8.0,9,5.0,8.0,3.0
3,8,,5,,4.0,7.0,3,9.0,2.0,
4,3,5.0,3,3.0,,,6,,6.0,6.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,5,7.0,7,3.0,7.0,,8,5.0,5.0,7.0
1,2,7.0,8,3.0,6.0,7.0,9,7.0,5.0,7.0
2,3,6.0,7,3.0,2.0,8.0,9,5.0,8.0,3.0
3,8,6.0,5,3.0,4.0,7.0,3,9.0,2.0,3.0
4,3,5.0,3,3.0,4.0,7.0,6,9.0,6.0,6.0


In [66]:
# sustituir por valor dinámico (interpolación)
display(matrix)
display(matrix.interpolate())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,5,7.0,7,3.0,7.0,,8,5.0,5.0,7.0
1,2,,8,,6.0,7.0,9,7.0,,
2,3,6.0,7,,2.0,8.0,9,5.0,8.0,3.0
3,8,,5,,4.0,7.0,3,9.0,2.0,
4,3,5.0,3,3.0,,,6,,6.0,6.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,5,7.0,7,3.0,7.0,,8,5.0,5.0,7.0
1,2,6.5,8,3.0,6.0,7.0,9,7.0,6.5,5.0
2,3,6.0,7,3.0,2.0,8.0,9,5.0,8.0,3.0
3,8,5.5,5,3.0,4.0,7.0,3,9.0,2.0,4.5
4,3,5.0,3,3.0,4.0,7.0,6,9.0,6.0,6.0


#### Tratar valores duplicados

In [67]:
serie = pd.Series(['a','b','c','a','c','a','g'])
print(serie.duplicated())

0    False
1    False
2    False
3     True
4     True
5     True
6    False
dtype: bool


In [68]:
df = all_dataset
display(df.head(3))

# Eliminación de los duplicados en una columna definida
df2 = df.drop_duplicates(subset="Gender", keep='last', inplace=False)
display(df2)

Unnamed: 0,Gender,Age,Occupation,Zip-code,MovieID,Rating,Timestamp,Title,Genre
0,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama


Unnamed: 0,Gender,Age,Occupation,Zip-code,MovieID,Rating,Timestamp,Title,Genre
1000207,F,18,20,55410,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western
1000208,M,25,1,35401,2909,4,957273353,"Five Wives, Three Secretaries and Me (1998)",Documentary


#### Discretización (valores categóricos)
- Tras Series y DataFrame, objeto para categorías: Categorical
```python
categorias = pd.cut(<valores>, <bins>) 
```

In [69]:
# especificar los bloques
bins = [0,18,35,65,99]
edades = [16,25,18,71,44,100,12]
categorias = pd.cut(edades,bins)
print(categorias)

[(0.0, 18.0], (18.0, 35.0], (0.0, 18.0], (65.0, 99.0], (35.0, 65.0], NaN, (0.0, 18.0]]
Categories (4, interval[int64, right]): [(0, 18] < (18, 35] < (35, 65] < (65, 99]]


In [70]:
categorias.value_counts()

(0, 18]     3
(18, 35]    1
(35, 65]    1
(65, 99]    1
dtype: int64

In [71]:
# especificar el número de bloques
bins = 5
edades = [0,6,8,16,25,18,71,44,100]
categorias = pd.cut(edades,bins) # rangos idénticos (similar distancia de rangos)
print(categorias)
print(categorias.value_counts())

[(-0.1, 20.0], (-0.1, 20.0], (-0.1, 20.0], (-0.1, 20.0], (20.0, 40.0], (-0.1, 20.0], (60.0, 80.0], (40.0, 60.0], (80.0, 100.0]]
Categories (5, interval[float64, right]): [(-0.1, 20.0] < (20.0, 40.0] < (40.0, 60.0] < (60.0, 80.0] < (80.0, 100.0]]
(-0.1, 20.0]     5
(20.0, 40.0]     1
(40.0, 60.0]     1
(60.0, 80.0]     1
(80.0, 100.0]    1
dtype: int64


In [72]:
bins = 5
edades = [1,6,8,16,25,18,71,44,100]
categorias = pd.qcut(edades,bins) # rangos homogéneos (similar número de valores)
print(categorias)
print(categorias.value_counts())

[(0.999, 7.2], (0.999, 7.2], (7.2, 16.4], (7.2, 16.4], (23.6, 54.8], (16.4, 23.6], (54.8, 100.0], (23.6, 54.8], (54.8, 100.0]]
Categories (5, interval[float64, right]): [(0.999, 7.2] < (7.2, 16.4] < (16.4, 23.6] < (23.6, 54.8] < (54.8, 100.0]]
(0.999, 7.2]     2
(7.2, 16.4]      2
(16.4, 23.6]     1
(23.6, 54.8]     2
(54.8, 100.0]    2
dtype: int64


## Visualización

In [73]:
movies_dataset.assign(tip_rate=tips["tip"] / tips["total_bill"]) #TODO: columna de asignación al vuelo

NameError: name 'tips' is not defined