
<img src="img/viu_logo.png" width="200">

## 01MIAR - Procesamiento de Datos

![logo](img/python_logo.png)

*Ivan Fuertes*

In [1]:
import numpy as np
import pandas as pd

### Uniendo datasets con 'join' y 'merge'
- merge() == join()
 - 'join' utiliza por defecto los índices para unir
- Utilizando el parámetro 'on'
 - Si las columnas difieren, 'left_on' y 'right_on'
 
 https://i.stack.imgur.com/hMKKt.jpg

### Combinar varios datasets 
- En base a un elemento en común (índice)
- MovieLens 'UserId'

In [2]:
import zipfile as zp # para descomprimir archivos zip
import urllib.request # para descargar de URL
import os

# descargar MovieLens dataset
url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'  
local_zip = os.path.join("res", "ml-1m.zip")
urllib.request.urlretrieve(url, local_zip)
# descomprimiendo archivo zip
with zp.ZipFile(local_zip, 'r') as zipp: 
    print('Extracting all files...') 
    zipp.extractall(os.path.join("res")) # destino
    print('Done!') 

Extracting all files...
Done!


In [3]:
ruta_users = os.path.join("res", "ml-1m", "users.dat")
ruta_ratings = os.path.join("res", "ml-1m", "ratings.dat")
ruta_movies = os.path.join("res", "ml-1m", "movies.dat")

users_dataset = pd.read_csv(ruta_users, sep='::', index_col=0,
    header=None, names=['UserID','Gender','Age','Occupation','Zip-code'], engine='python', encoding="ISO-8859-1")

ratings_dataset = pd.read_csv(ruta_ratings, sep='::', index_col=0, 
    header=None, names=['UserID','MovieID','Rating','Timestamp'], engine='python', encoding="ISO-8859-1")

movies_dataset = pd.read_csv(ruta_movies, sep='::', index_col=0, 
    header=None, names=['MovieID','Title','Genre'], engine='python', encoding="ISO-8859-1")

In [4]:
display(users_dataset.sample(5))

Unnamed: 0_level_0,Gender,Age,Occupation,Zip-code
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2610,M,45,7,48322
2696,M,25,7,24210
885,M,35,0,48105
4300,F,18,4,98125
6040,M,25,6,11106


In [5]:
display(ratings_dataset.sample(5))

Unnamed: 0_level_0,MovieID,Rating,Timestamp
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2358,2762,4,974384387
1101,3503,2,980367762
679,3702,4,975608529
3017,1225,4,972687489
4222,2021,5,965314515


In [6]:
display(movies_dataset.sample(5))

Unnamed: 0_level_0,Title,Genre
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
1215,Army of Darkness (1993),Action|Adventure|Comedy|Horror|Sci-Fi
2183,"Man Who Knew Too Much, The (1956)",Thriller
283,New Jersey Drive (1995),Crime|Drama
3807,Sinbad and the Eye of the Tiger (1977),Action|Adventure
240,Hideaway (1995),Thriller


In [7]:
# combinando users y ratings, ¿Cómo?
combined_dataset = users_dataset.merge(ratings_dataset, on='UserID', how='inner') # parametro 'on' define la columna pivote
display(combined_dataset.head(5))
print(len(combined_dataset))

Unnamed: 0_level_0,Gender,Age,Occupation,Zip-code,MovieID,Rating,Timestamp
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,F,1,10,48067,1193,5,978300760
1,F,1,10,48067,661,3,978302109
1,F,1,10,48067,914,3,978301968
1,F,1,10,48067,3408,4,978300275
1,F,1,10,48067,2355,5,978824291


1000209


In [8]:
# combinando movies y el resto
all_dataset = combined_dataset.merge(movies_dataset, on='MovieID', how='inner')
display(all_dataset.head(5))

Unnamed: 0,Gender,Age,Occupation,Zip-code,MovieID,Rating,Timestamp,Title,Genre
0,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


### Concatenate
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html

## Pivot
- Representar los datos en función a varios parámetros, agregando
```python
pivot_table(<lista de valores>, index=<agregador primario>, columns=<agregador secundario>)
```
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.pivot_table.html
- https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html

In [9]:
# display(all_dataset.pivot_table('Rating', index='Gender', columns='Age'))
# display(all_dataset.pivot_table('Rating', index='Gender', columns='Age', aggfunc='count'))
display(all_dataset.pivot_table('Rating', index='Gender', columns='Age', aggfunc=['count', max,np.mean]))

Unnamed: 0_level_0,count,count,count,count,count,count,count,max,max,max,max,max,max,max,mean,mean,mean,mean,mean,mean,mean
Age,1,18,25,35,45,50,56,1,18,25,...,45,50,56,1,18,25,35,45,50,56
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
F,8827,45427,91340,49473,24110,18064,9199,5,5,5,...,5,5,5,3.616291,3.453145,3.6067,3.659653,3.663044,3.79711,3.915534
M,18384,138109,304216,149530,59523,54426,29581,5,5,5,...,5,5,5,3.517461,3.525476,3.52678,3.604434,3.627942,3.687098,3.720327


## Agrupaciones
- agg -> funciones estadísticas de agregación
- Series.unique() -> valores únicos
- pd.value_counts -> ocurrencias

## Manipulación de strings
```python
split(): separar en bloques en función de un carácter
replace(): reemplazar un carácter por otro
index(): encontrar la posición de un carácter
```

In [10]:
# Ejemplo con MovieLens: Genre
## 1: obtener todos los géneros por separado
## 2: crear un dataset de géneros
## 3: por película, marcar género por separado
## 4: unir con dataset original
display(movies_dataset.head(3))

Unnamed: 0_level_0,Title,Genre
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance


In [11]:
all_genres = movies_dataset['Genre'].apply(lambda x : x.split('|'))

# print(all_genres)
# print([genre for x in all_genres for genre in x])
# genres = pd.unique([genre for movie in all_genres for genre in movie])

genres = pd.unique(all_genres.sum())
print(genres)

['Animation' "Children's" 'Comedy' 'Adventure' 'Fantasy' 'Romance' 'Drama'
 'Action' 'Crime' 'Thriller' 'Horror' 'Sci-Fi' 'Documentary' 'War'
 'Musical' 'Mystery' 'Film-Noir' 'Western']


In [12]:
# crear tabla con columnas por género
zeros = np.zeros( (len(movies_dataset), len(genres)) )
genres_frame = pd.DataFrame(zeros, columns=genres)
display(genres_frame.head(3))

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
columns_genres = genres_frame.columns # lista de generos (columnas)
# para cada película, marcar género con 1
for i, genre in enumerate(movies_dataset['Genre']):
    inds = columns_genres.get_indexer(genre.split('|')) # retorna los indices correspondientes a los generos de cada pelicula
    genres_frame.iloc[i,inds] = 1 # localiza las columnas del genero correspondiente, marca con 1

In [14]:
display(genres_frame.head(5))

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
for i, genre in enumerate(movies_dataset['Genre']):
    genres_frame.loc[i, genre.split('|')] = 1

In [16]:
# unir con dataset original
movies_split_genre = movies_dataset.join(genres_frame)

In [17]:
display(movies_split_genre.head(5))

Unnamed: 0_level_0,Title,Genre,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,Toy Story (1995),Animation|Children's|Comedy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Father of the Bride Part II (1995),Comedy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Replace e index para extraer el año de la película

In [18]:
movies_dataset = pd.read_csv(ruta_movies, sep='::', index_col=0, 
    header=None, names=['MovieID','Title','Genre'], engine='python', encoding="ISO-8859-1")
display(movies_dataset.sample(5))

Unnamed: 0_level_0,Title,Genre
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
3328,Ghost Dog: The Way of the Samurai (1999),Crime|Drama
434,Cliffhanger (1993),Action|Adventure|Crime
590,Dances with Wolves (1990),Adventure|Drama|Western
1635,"Ice Storm, The (1997)",Drama
734,Getting Away With Murder (1996),Comedy


In [19]:
display(movies_dataset.head(2))

Unnamed: 0_level_0,Title,Genre
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy


In [20]:
# extraer el año de la columna Title
def split_year(title):
    index = title.index('(')
    return title[index:].replace('(','').replace(')','')
    
# crear nueva columna Year
movies_dataset['Year'] = movies_dataset['Title'].apply(split_year)
display(movies_dataset.sample(2))

Unnamed: 0_level_0,Title,Genre,Year
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
377,Speed (1994),Action|Romance|Thriller,1994
2756,Wanted: Dead or Alive (1987),Action,1987


In [21]:
# eliminar el año de la columna Title
def remove_year(title):
    index = title.index('(')
    return title[:index-1].strip()

movies_dataset['Title'] = movies_dataset['Title'].apply(remove_year)
display(movies_dataset.head(2))

Unnamed: 0_level_0,Title,Genre,Year
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story,Animation|Children's|Comedy,1995
2,Jumanji,Adventure|Children's|Fantasy,1995


#### Expresiones regulares
https://docs.python.org/3/library/re.html

- import re

In [22]:
# ¿Cómo localizar que 'Zip-code' tiene un formato erróneo?
users_dataset.sample(5)

# users_dataset['Zip-code'].str.match('^[0-9]{5}$')

display(users_dataset[users_dataset['Zip-code'].str.match('^\d{5}$') == False])

# ^\d{5}$
# ^ = start of the string
# \d = decimal string
# {5} = 5 repeticiones de decimales
# $ = end of string

Unnamed: 0_level_0,Gender,Age,Occupation,Zip-code
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
161,M,45,16,98107-2117
233,F,45,20,37919-4204
293,M,56,1,55337-4056
458,M,50,16,55405-2546
506,M,25,16,55103-1006
...,...,...,...,...
5682,M,18,0,23455-4959
5904,F,45,12,954025
5925,F,25,0,90035-4444
5967,M,50,16,73069-5429


In [23]:
movies_dataset = pd.read_csv(ruta_movies, sep='::', index_col=0, 
    header=None, names=['MovieID','Title','Genre'], engine='python', encoding="ISO-8859-1")
display(movies_dataset.head(2))

Unnamed: 0_level_0,Title,Genre
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy


In [24]:
# ¿Cómo extraer el año con regex en el formato adecuado?
display(movies_dataset['Title'].str.extract('(\d{4})'))

# (\d{4})
# (= busca apertura parentesis
# \d = decimal string
# {4} = 4 repeticiones de decimales
# ) = cierre de parentesis

Unnamed: 0_level_0,0
MovieID,Unnamed: 1_level_1
1,1995
2,1995
3,1995
4,1995
5,1995
...,...
3948,2000
3949,2000
3950,2000
3951,2000


## Operaciones con colecciones
```python
reduce: aplicar una operación y retornar un valor
map: aplicar  una operación y retornar una secuencia
filter: retorna una secuencia con elementos que cumplen una condición
```


## Reduce
- Aplicar una operación matemática a cada uno de los elementos de una colección
- Diferente de 'apply()' porque retorna un valor numérico
- Ejemplo: Detección de géneros en años específicos

https://docs.python.org/3/library/functools.html

In [25]:
from functools import reduce # necesario para reduce

lista = [1, 3, 5, 7, 9]
print(reduce(lambda x,y: x + y, lista))

25


In [26]:
movies_1975 = movies_split_genre[ movies_split_genre['Title'].str.contains('1975') ]
movies_1975.head(3)

Unnamed: 0_level_0,Title,Genre,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
716,Switchblade Sisters (1975),Crime,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1007,"Apple Dumpling Gang, The (1975)",Children's|Comedy|Western,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1009,Escape to Witch Mountain (1975),Adventure|Children's|Fantasy,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [27]:
any_drama = reduce(lambda x,y : bool(x) | bool(y),movies_1975['Drama']) # hay algún drama en 1975
print(any_drama)

all_comedy = reduce(lambda x,y : bool(x) & bool(y),movies_1975['Comedy']) # son todas las películas de 1975 comedias?
print(all_comedy)

True
False


In [28]:
print(movies_1975['Drama'].any()) # Comprueba si hay algún valor que puede cumplir  
print(movies_1975['Comedy'].all()) # Comprueba si todos los valores son True

True
False


In [29]:
# Observar el tipo de dato antes para ver si es posible aplicar las funciones
print(movies_1975.dtypes)
print(movies_1975['Comedy'].unique())

Title           object
Genre           object
Animation      float64
Children's     float64
Comedy         float64
Adventure      float64
Fantasy        float64
Romance        float64
Drama          float64
Action         float64
Crime          float64
Thriller       float64
Horror         float64
Sci-Fi         float64
Documentary    float64
War            float64
Musical        float64
Mystery        float64
Film-Noir      float64
Western        float64
dtype: object
[1. 0.]


## Filter
- retorna una secuencia con elementos que cumplen una condición
- Ejemplo: obtener las películas de 1975 que contienen 'The' en el título

In [30]:
filtro = filter(lambda x : 'The' in x, movies_1975['Title'])
list(filtro)
# ¿Están todos los títulos con "The"? si tiene mayúsculas o no...

['Apple Dumpling Gang, The (1975)',
 'Man Who Would Be King, The (1975)',
 'Stepford Wives, The (1975)',
 'Rocky Horror Picture Show, The (1975)',
 'McCullochs, The (1975)',
 'Mirror, The (Zerkalo) (1975)']

In [31]:
filtro = filter(lambda x : 'the' in x, movies_1975['Title'].str.lower())
list(filtro)

['apple dumpling gang, the (1975)',
 "one flew over the cuckoo's nest (1975)",
 'man who would be king, the (1975)',
 'stepford wives, the (1975)',
 'rocky horror picture show, the (1975)',
 'three days of the condor (1975)',
 'brother, can you spare a dime? (1975)',
 'mccullochs, the (1975)',
 'mirror, the (zerkalo) (1975)']

## Map
- aplicar  una operación y retornar una secuencia
- Cambiar el valor integral de la columna 'Comedy' por bool

In [32]:
mapa = map(lambda x : bool(x), movies_split_genre['Comedy'])
movies_split_genre.loc[:,'Comedy'] = list(mapa)
display(movies_split_genre.head(4))

Unnamed: 0_level_0,Title,Genre,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,Toy Story (1995),Animation|Children's|Comedy,0.0,1.0,False,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,0.0,True,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,True,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Transformación de variables (calidad de datos)
- Valores no definidos
- Valores duplicados
- Discretización (valores categóricos)

In [33]:
matrix = pd.DataFrame(np.random.randint(10,size=(5,10)))
matrix[matrix < 2] = np.nan
display(matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,,2,9,,6.0,,9,,8.0,3.0
1,7.0,3,2,7.0,5.0,5.0,2,,,
2,9.0,8,4,9.0,8.0,9.0,5,,7.0,3.0
3,,9,6,3.0,,4.0,5,6.0,2.0,6.0
4,4.0,6,7,5.0,6.0,9.0,7,7.0,3.0,3.0


In [34]:
# nulos por columna
print(matrix.isnull().sum(axis=0))
display(matrix.isna().sum())

0    2
1    0
2    0
3    1
4    1
5    1
6    0
7    3
8    1
9    1
dtype: int64


0    2
1    0
2    0
3    1
4    1
5    1
6    0
7    3
8    1
9    1
dtype: int64

In [35]:
# Cantidad valores nulos
print(matrix.isnull().sum(axis=1).sum())

10


In [36]:
# numero de no nulos por fila
print(matrix.count(axis=1))

0     6
1     7
2     9
3     8
4    10
dtype: int64


In [37]:
# Número de nulos por fila
print(matrix.shape[1] - matrix.count(axis=1))

0    4
1    3
2    1
3    2
4    0
dtype: int64


In [38]:
# Representación de las filas en las que una determinada columna tiene nulos
display(matrix[matrix[1].isnull()])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9


In [39]:
# Conteo de valores que aparecen en el dataset
valores = [3, 9]
# Identificación de valores de dominio que se encuentran en un listado
display(matrix[matrix[6].isin(valores)])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,,2,9,,6.0,,9,,8.0,3.0


In [40]:
display(matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,,2,9,,6.0,,9,,8.0,3.0
1,7.0,3,2,7.0,5.0,5.0,2,,,
2,9.0,8,4,9.0,8.0,9.0,5,,7.0,3.0
3,,9,6,3.0,,4.0,5,6.0,2.0,6.0
4,4.0,6,7,5.0,6.0,9.0,7,7.0,3.0,3.0


In [41]:
## Tratamiento de valores nulos
# eliminar
display(matrix.dropna(axis=1))

Unnamed: 0,1,2,6
0,2,9,9
1,3,2,2
2,8,4,5
3,9,6,5
4,6,7,7


In [42]:
# eliminar si no hay un número de valores no NaN
display(matrix)
display(matrix.dropna(thresh=8))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,,2,9,,6.0,,9,,8.0,3.0
1,7.0,3,2,7.0,5.0,5.0,2,,,
2,9.0,8,4,9.0,8.0,9.0,5,,7.0,3.0
3,,9,6,3.0,,4.0,5,6.0,2.0,6.0
4,4.0,6,7,5.0,6.0,9.0,7,7.0,3.0,3.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2,9.0,8,4,9.0,8.0,9.0,5,,7.0,3.0
3,,9,6,3.0,,4.0,5,6.0,2.0,6.0
4,4.0,6,7,5.0,6.0,9.0,7,7.0,3.0,3.0


In [43]:
# sustituir por un valor fijo
display(matrix.fillna(-1))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.0,2,9,-1.0,6.0,-1.0,9,-1.0,8.0,3.0
1,7.0,3,2,7.0,5.0,5.0,2,-1.0,-1.0,-1.0
2,9.0,8,4,9.0,8.0,9.0,5,-1.0,7.0,3.0
3,-1.0,9,6,3.0,-1.0,4.0,5,6.0,2.0,6.0
4,4.0,6,7,5.0,6.0,9.0,7,7.0,3.0,3.0


In [44]:
# sustituir por valor dinámico (copia)
display(matrix)
display(matrix.fillna(method='ffill')) # bfill y ffill

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,,2,9,,6.0,,9,,8.0,3.0
1,7.0,3,2,7.0,5.0,5.0,2,,,
2,9.0,8,4,9.0,8.0,9.0,5,,7.0,3.0
3,,9,6,3.0,,4.0,5,6.0,2.0,6.0
4,4.0,6,7,5.0,6.0,9.0,7,7.0,3.0,3.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,,2,9,,6.0,,9,,8.0,3.0
1,7.0,3,2,7.0,5.0,5.0,2,,8.0,3.0
2,9.0,8,4,9.0,8.0,9.0,5,,7.0,3.0
3,9.0,9,6,3.0,8.0,4.0,5,6.0,2.0,6.0
4,4.0,6,7,5.0,6.0,9.0,7,7.0,3.0,3.0


In [45]:
# sustituir por valor dinámico (interpolación)
display(matrix)
display(matrix.interpolate())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,,2,9,,6.0,,9,,8.0,3.0
1,7.0,3,2,7.0,5.0,5.0,2,,,
2,9.0,8,4,9.0,8.0,9.0,5,,7.0,3.0
3,,9,6,3.0,,4.0,5,6.0,2.0,6.0
4,4.0,6,7,5.0,6.0,9.0,7,7.0,3.0,3.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,,2,9,,6.0,,9,,8.0,3.0
1,7.0,3,2,7.0,5.0,5.0,2,,7.5,3.0
2,9.0,8,4,9.0,8.0,9.0,5,,7.0,3.0
3,6.5,9,6,3.0,7.0,4.0,5,6.0,2.0,6.0
4,4.0,6,7,5.0,6.0,9.0,7,7.0,3.0,3.0


#### Tratar valores duplicados

In [46]:
serie = pd.Series(['a','b','c','a','c','a','g'])
print(serie.duplicated())

0    False
1    False
2    False
3     True
4     True
5     True
6    False
dtype: bool


In [47]:
df = all_dataset
display(df.head(3))

# Eliminación de los duplicados en una columna definida
df2 = df.drop_duplicates(subset="Gender", keep='last', inplace=False)
display(df2)

Unnamed: 0,Gender,Age,Occupation,Zip-code,MovieID,Rating,Timestamp,Title,Genre
0,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama


Unnamed: 0,Gender,Age,Occupation,Zip-code,MovieID,Rating,Timestamp,Title,Genre
1000207,F,18,20,55410,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western
1000208,M,25,1,35401,2909,4,957273353,"Five Wives, Three Secretaries and Me (1998)",Documentary


#### Discretización (valores categóricos)
- Tras Series y DataFrame, objeto para categorías: Categorical
```python
categorias = pd.cut(<valores>, <bins>) 
```

In [48]:
# especificar los bloques
bins = [0,18,35,65,99]
edades = [16,25,18,71,44,100,12]
categorias = pd.cut(edades,bins)
print(categorias)

[(0.0, 18.0], (18.0, 35.0], (0.0, 18.0], (65.0, 99.0], (35.0, 65.0], NaN, (0.0, 18.0]]
Categories (4, interval[int64, right]): [(0, 18] < (18, 35] < (35, 65] < (65, 99]]


In [49]:
categorias.value_counts()

(0, 18]     3
(18, 35]    1
(35, 65]    1
(65, 99]    1
dtype: int64

In [50]:
# especificar el número de bloques
bins = 5
edades = [0,6,8,16,25,18,71,44,100]
categorias = pd.cut(edades,bins) # rangos idénticos (similar distancia de rangos)
print(categorias)
print(categorias.value_counts())

[(-0.1, 20.0], (-0.1, 20.0], (-0.1, 20.0], (-0.1, 20.0], (20.0, 40.0], (-0.1, 20.0], (60.0, 80.0], (40.0, 60.0], (80.0, 100.0]]
Categories (5, interval[float64, right]): [(-0.1, 20.0] < (20.0, 40.0] < (40.0, 60.0] < (60.0, 80.0] < (80.0, 100.0]]
(-0.1, 20.0]     5
(20.0, 40.0]     1
(40.0, 60.0]     1
(60.0, 80.0]     1
(80.0, 100.0]    1
dtype: int64


In [51]:
bins = 5
edades = [1,6,8,16,25,18,71,44,100]
categorias = pd.qcut(edades,bins) # rangos homogéneos (similar número de valores)
print(categorias)
print(categorias.value_counts())

[(0.999, 7.2], (0.999, 7.2], (7.2, 16.4], (7.2, 16.4], (23.6, 54.8], (16.4, 23.6], (54.8, 100.0], (23.6, 54.8], (54.8, 100.0]]
Categories (5, interval[float64, right]): [(0.999, 7.2] < (7.2, 16.4] < (16.4, 23.6] < (23.6, 54.8] < (54.8, 100.0]]
(0.999, 7.2]     2
(7.2, 16.4]      2
(16.4, 23.6]     1
(23.6, 54.8]     2
(54.8, 100.0]    2
dtype: int64
