# Introducción a Pandas

Aprenderemos a utilizar pandas para el análisis de datos:

* Introducción a Pandas
* Series
* DataFrames
* Valores perdidos

___

In [None]:
!pip install numpy


In [None]:
!pip install pandas

In [None]:
import numpy as np


In [5]:
import pandas as pd

# Series

### Creando Series

Conversión de una lista, Array Numpy o diccionario a Series.

Primero, creamos las variables:

In [7]:
labels = ['a','b','c'] 
print (labels, type(labels))


['a', 'b', 'c'] <class 'list'>


In [8]:

my_list = [10,20,30]
print (my_list, type(my_list))


[10, 20, 30] <class 'list'>


In [9]:

arr = np.array((10,20,30))
print (arr, type(arr))



[10 20 30] <class 'numpy.ndarray'>


In [11]:
d = {'a':10,'b':20,'c':30,'d':45}
print (d, type(d))

{'a': 10, 'b': 20, 'c': 30, 'd': 45} <class 'dict'>


### Usando listas

In [12]:
serie_de_lista = pd.Series(data=my_list)
print(serie_de_lista, type(serie_de_lista))

0    10
1    20
2    30
dtype: int64 <class 'pandas.core.series.Series'>


In [13]:
serie_de_lista[1]

20

In [None]:
# Asignamos etiquetas a la serie
serie_de_lista = pd.Series(data=my_list, index=labels)
print(serie_de_lista, type(serie_de_lista))

In [None]:
serie_de_lista["b"]

Union de dos series

    Los parámetros data e index se pueden introducir por posición

In [14]:
serie_de_lista = pd.Series(my_list, labels)
print(serie_de_lista, type(serie_de_lista))

a    10
b    20
c    30
dtype: int64 <class 'pandas.core.series.Series'>


### Usando Arrays

unido con una lista


In [15]:
serie_de_array = pd.Series(arr, labels)
print(serie_de_array, type(serie_de_array))

a    10
b    20
c    30
dtype: int32 <class 'pandas.core.series.Series'>


### Usando Diccionarios

In [19]:
print("que es d: ", d)
print()
serie_de_diccionario = pd.Series(d)
print(serie_de_diccionario, type(serie_de_diccionario))

que es d:  {'a': 10, 'b': 20, 'c': 30, 'd': 45}

a    10
b    20
c    30
d    45
dtype: int64 <class 'pandas.core.series.Series'>


### Usando índices

La clave para usar Series, es entender sus índices. Pandas puede usar índices en formato numéricos o en formato texto.

In [26]:
ser1 = pd.Series(np.random.rand(4), index = ['EEUU', 'Alemania','Rusia', 'Japón'])


In [27]:
ser2 = pd.Series(np.random.rand(4), index = ['EEUU', 'Alemania','Italia', 'Japón'])


In [28]:
print (ser1)
print ()
print (ser2)

EEUU        0.573548
Alemania    0.644498
Rusia       0.932206
Japón       0.573082
dtype: float64

EEUU        0.560594
Alemania    0.507787
Italia      0.796570
Japón       0.858913
dtype: float64


En los notebooks puedo ver el contenido de un objeto de pandas con el método display.

In [29]:
display(ser1)


EEUU        0.573548
Alemania    0.644498
Rusia       0.932206
Japón       0.573082
dtype: float64

In [30]:
display(ser2)

EEUU        0.560594
Alemania    0.507787
Italia      0.796570
Japón       0.858913
dtype: float64

Las series de Python usan la sintaxis de los diccionarios para acceder a los valores.

Pero ahora, la clave de acceso puede ser una lista de índices en lugar de sólo una clave de diccionario.

In [31]:
print(ser1['EEUU'])


0.5735482743726403


In [32]:
print("Como serie de índices")
print(ser1[['EEUU']])


Como serie de índices
EEUU    0.573548
dtype: float64


In [33]:

print(ser1[['Rusia','EEUU']])


Rusia    0.932206
EEUU     0.573548
dtype: float64


In [34]:

print(ser1[['Rusia','EEUU', 'Japón']])

Rusia    0.932206
EEUU     0.573548
Japón    0.573082
dtype: float64


## Operaciones con series

In [35]:
ser1

EEUU        0.573548
Alemania    0.644498
Rusia       0.932206
Japón       0.573082
dtype: float64

In [36]:
ser2

EEUU        0.560594
Alemania    0.507787
Italia      0.796570
Japón       0.858913
dtype: float64

Las operaciones se realizan en función del índice

In [37]:
suma = ser1 + ser2

In [38]:
suma

Alemania    1.152285
EEUU        1.134142
Italia           NaN
Japón       1.431994
Rusia            NaN
dtype: float64

In [39]:
ser1 * ser2

Alemania    0.327268
EEUU        0.321528
Italia           NaN
Japón       0.492227
Rusia            NaN
dtype: float64

In [41]:
type(ser1 + ser2)

pandas.core.series.Series

# DataFrames

Podemos ver un DataFrame como un conjunto de objetos `Series` unidos.

In [42]:
import pandas as pd
import numpy as np

In [43]:

from numpy.random import randn
np.random.seed(123)

In [44]:
indice = 'A B C D E'.split()


In [45]:
columnas = 'W X Y Z'.split()


In [46]:
print(indice, columnas)

['A', 'B', 'C', 'D', 'E'] ['W', 'X', 'Y', 'Z']


In [85]:
df = pd.DataFrame(randn(5,4),index=indice, columns=columnas)


In [51]:
df

Unnamed: 0,W,X,Y,Z
A,-0.805367,-1.727669,-0.3909,0.573806
B,0.338589,-0.01183,2.392365,0.412912
C,0.978736,2.238143,-1.294085,-1.038788
D,1.743712,-0.798063,0.029683,1.069316
E,0.890706,1.754886,1.495644,1.069393


In [54]:
df1 = pd.DataFrame(randn(5,4),index=columnas, columns=indice)

ValueError: Shape of passed values is (5, 4), indices imply (4, 5)

In [55]:
df1

Unnamed: 0,W,X,Y,Z
A,-0.772709,0.794863,0.314272,-1.326265
B,1.417299,0.807237,0.04549,-0.233092
C,-1.198301,0.199524,0.468439,-0.831155
D,1.162204,-1.097203,-2.1231,1.039727
E,-0.403366,-0.12603,-0.837517,-1.605963


## Selección e indexación


### Selección de columnas

In [56]:
df['W']

A   -0.805367
B    0.338589
C    0.978736
D    1.743712
E    0.890706
Name: W, dtype: float64

In [57]:
type(df)

pandas.core.frame.DataFrame

Cuando extraigo una única columna, o fila, obtengo un objeto `Series`

In [58]:
type(df['W'])

pandas.core.series.Series

Como vemos la columna W es simplemente una Serie

Pero si selecciono dos, sigue devolviendo una tabla, es decir, un DataFrame

In [59]:
df

Unnamed: 0,W,X,Y,Z
A,-0.805367,-1.727669,-0.3909,0.573806
B,0.338589,-0.01183,2.392365,0.412912
C,0.978736,2.238143,-1.294085,-1.038788
D,1.743712,-0.798063,0.029683,1.069316
E,0.890706,1.754886,1.495644,1.069393


In [60]:
# Selección de varias columnas por su nombre
lista_col = ["Z", "W"]
df[lista_col]

Unnamed: 0,Z,W
A,0.573806,-0.805367
B,0.412912,0.338589
C,-1.038788,0.978736
D,1.069316,1.743712
E,1.069393,0.890706


Pandas también permite sintaxis tipo SQL, no obstante no se recomienda su uso.

In [61]:
df.W

A   -0.805367
B    0.338589
C    0.978736
D    1.743712
E    0.890706
Name: W, dtype: float64

### Selección de filas   .loc

In [62]:
# Para selección filas con etiquetas
df.loc['B']

W    0.338589
X   -0.011830
Y    2.392365
Z    0.412912
Name: B, dtype: float64

In [63]:
type(df.loc['B'])

pandas.core.series.Series

### Selección de filas   .iloc

In [64]:
# Para selección filas por índices
df.iloc[3]

W    1.743712
X   -0.798063
Y    0.029683
Z    1.069316
Name: D, dtype: float64

### Selección de un subset de datos

In [71]:
df

Unnamed: 0,W,X,Y,Z
A,-0.805367,-1.727669,-0.3909,0.573806
B,0.338589,-0.01183,2.392365,0.412912
C,0.978736,2.238143,-1.294085,-1.038788
D,1.743712,-0.798063,0.029683,1.069316
E,0.890706,1.754886,1.495644,1.069393


In [65]:
df.loc[['A','C'],['W','Y']]


Unnamed: 0,W,Y
A,-0.805367,-0.3909
C,0.978736,-1.294085


In [68]:
# Selección de 2 filas y todas las columnas.
df.loc[['A','B']]

Unnamed: 0,W,X,Y,Z
A,-0.805367,-1.727669,-0.3909,0.573806
B,0.338589,-0.01183,2.392365,0.412912


In [69]:
df.iloc[:2][:]

Unnamed: 0,W,X,Y,Z
A,-0.805367,-1.727669,-0.3909,0.573806
B,0.338589,-0.01183,2.392365,0.412912


In [72]:
df.iloc[:2,1:]

Unnamed: 0,X,Y,Z
A,-1.727669,-0.3909,0.573806
B,-0.01183,2.392365,0.412912


In [73]:
df.iloc[0:2,:]

Unnamed: 0,W,X,Y,Z
A,-0.805367,-1.727669,-0.3909,0.573806
B,0.338589,-0.01183,2.392365,0.412912


## Creando una nueva columna

In [74]:
df['nueva'] = df['W'] + df['Y']
df

Unnamed: 0,W,X,Y,Z,nueva
A,-0.805367,-1.727669,-0.3909,0.573806,-1.196266
B,0.338589,-0.01183,2.392365,0.412912,2.730954
C,0.978736,2.238143,-1.294085,-1.038788,-0.315349
D,1.743712,-0.798063,0.029683,1.069316,1.773395
E,0.890706,1.754886,1.495644,1.069393,2.386351


In [75]:
df['clase'] = 0
df

Unnamed: 0,W,X,Y,Z,nueva,clase
A,-0.805367,-1.727669,-0.3909,0.573806,-1.196266,0
B,0.338589,-0.01183,2.392365,0.412912,2.730954,0
C,0.978736,2.238143,-1.294085,-1.038788,-0.315349,0
D,1.743712,-0.798063,0.029683,1.069316,1.773395,0
E,0.890706,1.754886,1.495644,1.069393,2.386351,0


## Eliminar columnas

In [76]:
df

Unnamed: 0,W,X,Y,Z,nueva,clase
A,-0.805367,-1.727669,-0.3909,0.573806,-1.196266,0
B,0.338589,-0.01183,2.392365,0.412912,2.730954,0
C,0.978736,2.238143,-1.294085,-1.038788,-0.315349,0
D,1.743712,-0.798063,0.029683,1.069316,1.773395,0
E,0.890706,1.754886,1.495644,1.069393,2.386351,0


In [77]:
df.drop(columns="nueva")

Unnamed: 0,W,X,Y,Z,clase
A,-0.805367,-1.727669,-0.3909,0.573806,0
B,0.338589,-0.01183,2.392365,0.412912,0
C,0.978736,2.238143,-1.294085,-1.038788,0
D,1.743712,-0.798063,0.029683,1.069316,0
E,0.890706,1.754886,1.495644,1.069393,0


In [78]:
df.drop('nueva', axis=1)

Unnamed: 0,W,X,Y,Z,clase
A,-0.805367,-1.727669,-0.3909,0.573806,0
B,0.338589,-0.01183,2.392365,0.412912,0
C,0.978736,2.238143,-1.294085,-1.038788,0
D,1.743712,-0.798063,0.029683,1.069316,0
E,0.890706,1.754886,1.495644,1.069393,0


Pero el dataframe continúa íntegro

In [79]:
df

Unnamed: 0,W,X,Y,Z,nueva,clase
A,-0.805367,-1.727669,-0.3909,0.573806,-1.196266,0
B,0.338589,-0.01183,2.392365,0.412912,2.730954,0
C,0.978736,2.238143,-1.294085,-1.038788,-0.315349,0
D,1.743712,-0.798063,0.029683,1.069316,1.773395,0
E,0.890706,1.754886,1.495644,1.069393,2.386351,0


Hay 2 maneras diferentes de borrar una columna y que el cambio se guarde en el DataFrame original
```
df = df.drop('nombre_columna',axis=1)
```
o bien:
```
df.drop('nombre_columna',axis=1, inplace=True)
```


In [80]:
df

Unnamed: 0,W,X,Y,Z,nueva,clase
A,-0.805367,-1.727669,-0.3909,0.573806,-1.196266,0
B,0.338589,-0.01183,2.392365,0.412912,2.730954,0
C,0.978736,2.238143,-1.294085,-1.038788,-0.315349,0
D,1.743712,-0.798063,0.029683,1.069316,1.773395,0
E,0.890706,1.754886,1.495644,1.069393,2.386351,0


In [81]:
df.drop('nueva',axis=1, inplace=True)
df

Unnamed: 0,W,X,Y,Z,clase
A,-0.805367,-1.727669,-0.3909,0.573806,0
B,0.338589,-0.01183,2.392365,0.412912,0
C,0.978736,2.238143,-1.294085,-1.038788,0
D,1.743712,-0.798063,0.029683,1.069316,0
E,0.890706,1.754886,1.495644,1.069393,0


In [82]:
df = df.drop('clase', axis=1)
df

Unnamed: 0,W,X,Y,Z
A,-0.805367,-1.727669,-0.3909,0.573806
B,0.338589,-0.01183,2.392365,0.412912
C,0.978736,2.238143,-1.294085,-1.038788
D,1.743712,-0.798063,0.029683,1.069316
E,0.890706,1.754886,1.495644,1.069393


Si intento borrar una columna que ya no existe obtengo un error

In [None]:
del df["clase"]

In [None]:
df

## Eliminar filas

In [86]:
df

Unnamed: 0,W,X,Y,Z
A,0.642055,-1.977888,0.712265,2.598304
B,-0.024626,0.034142,0.179549,-1.861976
C,0.426147,-1.60541,-0.42768,1.24287
D,-0.735217,0.501249,1.012739,0.278741
E,-1.370948,-0.332475,1.959411,-2.025046


In [87]:
df.drop('D')

Unnamed: 0,W,X,Y,Z
A,0.642055,-1.977888,0.712265,2.598304
B,-0.024626,0.034142,0.179549,-1.861976
C,0.426147,-1.60541,-0.42768,1.24287
E,-1.370948,-0.332475,1.959411,-2.025046


A menos que lo especifiquemos con inplace, no se elimina nada

In [88]:
df

Unnamed: 0,W,X,Y,Z
A,0.642055,-1.977888,0.712265,2.598304
B,-0.024626,0.034142,0.179549,-1.861976
C,0.426147,-1.60541,-0.42768,1.24287
D,-0.735217,0.501249,1.012739,0.278741
E,-1.370948,-0.332475,1.959411,-2.025046


Hay 2 maneras diferentes de borrar una fila y que el cambio se guarde en el DataFrame original
```
df = df.drop('D',axis=0)
```
o bien:
```
df.drop('D',axis=0, inplace=True)
```





**Eliminar filas**

In [89]:
df.drop('E',axis=0, inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,0.642055,-1.977888,0.712265,2.598304
B,-0.024626,0.034142,0.179549,-1.861976
C,0.426147,-1.60541,-0.42768,1.24287
D,-0.735217,0.501249,1.012739,0.278741


In [90]:
df.drop(index="C", inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,0.642055,-1.977888,0.712265,2.598304
B,-0.024626,0.034142,0.179549,-1.861976
D,-0.735217,0.501249,1.012739,0.278741


## saber el n de filas y n de colmnas usas = shape

In [91]:
df.shape

(3, 4)

Número de filas

In [92]:
df.shape[0]

3

Número de columnas

In [93]:
df.shape[1]

4

## Filtros

### Selección condicional


Una importante característica de Pandas es la selección condicional de manera muy similar a Numpy:

In [94]:
df

Unnamed: 0,W,X,Y,Z
A,0.642055,-1.977888,0.712265,2.598304
B,-0.024626,0.034142,0.179549,-1.861976
D,-0.735217,0.501249,1.012739,0.278741


In [95]:
df>0

Unnamed: 0,W,X,Y,Z
A,True,False,True,True
B,False,True,True,False
D,False,True,True,True


Este dataframe booleano funciona como un filtro

In [96]:
guardable = df[df>0]


In [97]:
guardable

Unnamed: 0,W,X,Y,Z
A,0.642055,,0.712265,2.598304
B,,0.034142,0.179549,
D,,0.501249,1.012739,0.278741


Al guardarse el DataFrame en un csv losvalores Nan quedan como casillas vacías. Al contrario, también, las casillas vacías se convierten en Nan

In [98]:
guardable.to_csv("guardable.csv")

Filtrado de todo el DataFrame utilizando sólo los valores de una columna

In [99]:
print(df['W']>0)


A     True
B    False
D    False
Name: W, dtype: bool


In [100]:
df_filtrado = df[df['W']>0]


In [101]:
df_filtrado

Unnamed: 0,W,X,Y,Z
A,0.642055,-1.977888,0.712265,2.598304


In [102]:
df

Unnamed: 0,W,X,Y,Z
A,0.642055,-1.977888,0.712265,2.598304
B,-0.024626,0.034142,0.179549,-1.861976
D,-0.735217,0.501249,1.012739,0.278741


In [103]:
print(df['W']>0)


A     True
B    False
D    False
Name: W, dtype: bool


In [105]:
df

Unnamed: 0,W,X,Y,Z
A,0.642055,-1.977888,0.712265,2.598304
B,-0.024626,0.034142,0.179549,-1.861976
D,-0.735217,0.501249,1.012739,0.278741


In [104]:
df[df['W']>0]['Y']

A    0.712265
Name: Y, dtype: float64

In [106]:
df[df['W']>0][['Y','X']]

Unnamed: 0,Y,X
A,0.712265,-1.977888


In [111]:
def resaltar_positivos(s):
    return ['background-color: yellow' if s.name in df[df['W'] > 0].index else '' for _ in s]

df[['Y', 'X']].style.apply(resaltar_positivos, axis=1)

Unnamed: 0,Y,X
A,0.712265,-1.977888
B,0.179549,0.034142
D,1.012739,0.501249


Podemos concatenar condiciones con `|` y `&`.  Deberemos encerrar entre paréntesis cada una de las condiciones:

In [107]:
df

Unnamed: 0,W,X,Y,Z
A,0.642055,-1.977888,0.712265,2.598304
B,-0.024626,0.034142,0.179549,-1.861976
D,-0.735217,0.501249,1.012739,0.278741


In [None]:
df[(df['W']>0) & (df['Y'] < 0.9)]
# va buscar los dos condiciones juntas y coloca el resultado en un nuevo dataframe

Unnamed: 0,W,X,Y,Z
A,0.642055,-1.977888,0.712265,2.598304


In [110]:
df[(df['W']>0) | (df['Y'] > 0.5)]

Unnamed: 0,W,X,Y,Z
A,0.642055,-1.977888,0.712265,2.598304
D,-0.735217,0.501249,1.012739,0.278741


### Filtrar fila por condición

In [112]:
df

Unnamed: 0,W,X,Y,Z
A,0.642055,-1.977888,0.712265,2.598304
B,-0.024626,0.034142,0.179549,-1.861976
D,-0.735217,0.501249,1.012739,0.278741


In [113]:
df.loc['A'] > 0

W     True
X    False
Y     True
Z     True
Name: A, dtype: bool

In [114]:
df.loc[:,df.loc['A'] > 0]

Unnamed: 0,W,Y,Z
A,0.642055,0.712265,2.598304
B,-0.024626,0.179549,-1.861976
D,-0.735217,1.012739,0.278741


In [115]:
df.loc['A'][df.loc['A'] > 0]

W    0.642055
Y    0.712265
Z    2.598304
Name: A, dtype: float64

In [116]:
df.iloc[0][df.iloc[0] > 0]

W    0.642055
Y    0.712265
Z    2.598304
Name: A, dtype: float64

## Más sobre índices

In [117]:
df

Unnamed: 0,W,X,Y,Z
A,0.642055,-1.977888,0.712265,2.598304
B,-0.024626,0.034142,0.179549,-1.861976
D,-0.735217,0.501249,1.012739,0.278741


### Reseteamos el índice a una secuencia de 0 a n


In [None]:
df.reset_index()

### creamos un nuevo indice


In [119]:
nuevoindice = 'CA NY WY OR CO'.split()

In [120]:
nuevoindice

['CA', 'NY', 'WY', 'OR', 'CO']

In [121]:
df

Unnamed: 0,W,X,Y,Z
A,0.642055,-1.977888,0.712265,2.598304
B,-0.024626,0.034142,0.179549,-1.861976
D,-0.735217,0.501249,1.012739,0.278741


In [122]:
df = pd.DataFrame(randn(5,4),index=range(5), columns=columnas)
df

Unnamed: 0,W,X,Y,Z
0,-0.275786,-0.552108,0.120747,0.748216
1,1.608691,-0.270232,0.812341,0.49974
2,0.474347,-0.563924,-0.997321,-1.100043
3,-0.756437,0.321687,0.760949,0.323469
4,-0.548955,1.80597,1.518866,-0.354


In [124]:
df['Estados'] = nuevoindice
df

Unnamed: 0,W,X,Y,Z,Estados
0,-0.275786,-0.552108,0.120747,0.748216,CA
1,1.608691,-0.270232,0.812341,0.49974,NY
2,0.474347,-0.563924,-0.997321,-1.100043,WY
3,-0.756437,0.321687,0.760949,0.323469,OR
4,-0.548955,1.80597,1.518866,-0.354,CO


Utilizamos la columna Estados como índice en el dataset

In [125]:
df.set_index('Estados')

Unnamed: 0_level_0,W,X,Y,Z
Estados,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,-0.275786,-0.552108,0.120747,0.748216
NY,1.608691,-0.270232,0.812341,0.49974
WY,0.474347,-0.563924,-0.997321,-1.100043
OR,-0.756437,0.321687,0.760949,0.323469
CO,-0.548955,1.80597,1.518866,-0.354


Tenemos que tener en cuenta que si no usamos el argumento inplace, no se aplican los cambios

In [126]:
df

Unnamed: 0,W,X,Y,Z,Estados
0,-0.275786,-0.552108,0.120747,0.748216,CA
1,1.608691,-0.270232,0.812341,0.49974,NY
2,0.474347,-0.563924,-0.997321,-1.100043,WY
3,-0.756437,0.321687,0.760949,0.323469,OR
4,-0.548955,1.80597,1.518866,-0.354,CO


In [127]:
df.set_index('Estados', inplace=True)
df

Unnamed: 0_level_0,W,X,Y,Z
Estados,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,-0.275786,-0.552108,0.120747,0.748216
NY,1.608691,-0.270232,0.812341,0.49974
WY,0.474347,-0.563924,-0.997321,-1.100043
OR,-0.756437,0.321687,0.760949,0.323469
CO,-0.548955,1.80597,1.518866,-0.354


In [128]:
df.loc["NY"]

W    1.608691
X   -0.270232
Y    0.812341
Z    0.499740
Name: NY, dtype: float64

## Índices múltiples y jerarquía en los índices

### Creamos diferentes 'índices'


In [129]:
outside = ['G1','G1','G1','G2','G2','G3']


In [130]:
inside = [1,2,3,1,2,1]


In [131]:
hier_index = list(zip(outside,inside))


In [132]:
hier_index = pd.MultiIndex.from_tuples(hier_index)

In [134]:
print ("outside= ", outside)
print ("inside= ", inside)
print (hier_index)


outside=  ['G1', 'G1', 'G1', 'G2', 'G2', 'G3']
inside=  [1, 2, 3, 1, 2, 1]
MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G3', 1)],
           )


In [135]:
df = pd.DataFrame(np.random.randn(6,2), index=hier_index, columns=['A','B'])


In [136]:

df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-0.823431,0.130215
G1,2,1.267299,0.332765
G1,3,0.556549,-0.21208
G2,1,0.456271,1.544544
G2,2,-0.239669,0.143308
G3,1,0.253816,0.283725


¿Cómo extraemos los datos en base a este índice doble?

In [137]:
# Haciendo uso de .loc
df.loc['G1']

Unnamed: 0,A,B
1,-0.823431,0.130215
2,1.267299,0.332765
3,0.556549,-0.21208


In [138]:
df.loc['G1'].iloc[0]

A   -0.823431
B    0.130215
Name: 1, dtype: float64

### eliminamos una fila


In [139]:
df.loc['G1'].iloc[1:]

Unnamed: 0,A,B
2,1.267299,0.332765
3,0.556549,-0.21208


Podemos entender G1 y G2 como una columna extra que se usa para el filtrado.
Además a los índices podemos asignarles nombres

In [140]:
df.index.names

FrozenList([None, None])

In [141]:
df.index.names = ['Grupo','Número']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Grupo,Número,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-0.823431,0.130215
G1,2,1.267299,0.332765
G1,3,0.556549,-0.21208
G2,1,0.456271,1.544544
G2,2,-0.239669,0.143308
G3,1,0.253816,0.283725


### Filtrar por el índice de nombre "Número"


In [142]:
df.xs(1,level='Número')

Unnamed: 0_level_0,A,B
Grupo,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-0.823431,0.130215
G2,0.456271,1.544544
G3,0.253816,0.283725


# Valores perdidos

In [143]:
import numpy as np
import pandas as pd

In [150]:
df = pd.DataFrame({'A':[1,2,np.nan],
                  'B':[5,np.nan,np.nan],
                  'C':[1,2,3]})


In [145]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [146]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       2 non-null      float64
 1   B       1 non-null      float64
 2   C       3 non-null      int64  
dtypes: float64(2), int64(1)
memory usage: 200.0 bytes


In [147]:
df["A"].std()

0.7071067811865476

In [148]:
df.describe()

Unnamed: 0,A,B,C
count,2.0,1.0,3.0
mean,1.5,5.0,2.0
std,0.707107,,1.0
min,1.0,5.0,1.0
25%,1.25,5.0,1.5
50%,1.5,5.0,2.0
75%,1.75,5.0,2.5
max,2.0,5.0,3.0


In [151]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [152]:
df.dropna(axis=0)

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [153]:
df.dropna(axis=1)

Unnamed: 0,C
0,1
1,2
2,3


In [154]:
df.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [155]:
df.dropna(thresh=2, axis=1)

Unnamed: 0,A,C
0,1.0,1
1,2.0,2
2,,3


In [156]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [157]:
df.fillna(value='Valor Rellenado', inplace=True)

In [158]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       3 non-null      object
 1   B       3 non-null      object
 2   C       3 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes


In [159]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,Valor Rellenado,2
2,Valor Rellenado,Valor Rellenado,3


In [160]:
df["A"] + df["B"]

TypeError: unsupported operand type(s) for +: 'float' and 'str'

Un caso más elaborado (y habitual), sería el de imputar la media de su columna a los NA

In [162]:
pd.DataFrame(data = np.random.rand(5,5), columns = 'A B C D E'.split())

Unnamed: 0,A,B,C,D,E
0,0.909872,0.128631,0.08178,0.138416,0.399379
1,0.424307,0.562218,0.122244,0.2014,0.811644
2,0.467988,0.807938,0.007426,0.551593,0.931932
3,0.582175,0.206096,0.717758,0.378986,0.668384
4,0.02932,0.6359,0.032198,0.744781,0.472913


In [163]:
df = pd.DataFrame({'A':[1,2,np.nan],
                  'B':[5,np.nan,np.nan],
                  'C':[1,2,3]})


In [165]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [166]:
df["B"].fillna(value=df["C"])

0    5.0
1    2.0
2    3.0
Name: B, dtype: float64

In [167]:
media = df['A'].mean()

In [168]:
media

1.5

In [169]:
df["A"] = df['A'].fillna(value=media)

In [170]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,1.5,,3


In [172]:
df["B"] = df['B'].fillna(value=df['B'].max())

In [173]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,5.0,2
2,1.5,5.0,3


# Conceptos Básicos Complementarios

Contenido:
1. Métodos útiles para explorar datos
2. Operaciones comunes con columnas
3. Indexación avanzada
4. Aplicación de funciones con `apply`
5. Ordenamiento de datos

In [None]:
# Importar las librerías necesarias
import pandas as pd
import numpy as np

In [None]:
# Crear un DataFrame de ejemplo
data = {
    'Nombre': ['Ana', 'Luis', 'María', 'Pedro', 'Sofía'],
    'Edad': [23, 45, 34, 25, 40],
    'Nota': [8.5, 6.0, 7.5, 9.0, 6.5]
}

df = pd.DataFrame(data)
df

## 1. Exploración de Datos



In [None]:
# Primeras filas del DataFrame
df.head()

In [None]:
# Información general del DataFrame
df.info()

In [None]:
# Estadísticas descriptivas
df.describe()

## 2. Operaciones con Columnas



In [None]:
# Crear una nueva columna
df['Nota_ajustada'] = df['Nota'] * 1.1  # Incrementar la nota en un 10%
df

In [None]:
# Eliminar una columna
df.drop('Nota_ajustada', axis=1, inplace=False)

## 3. Indexación Avanzada

In [None]:
# Selección por condición
df[df['Edad'] > 30]  # Filtrar estudiantes mayores de 30 años

In [None]:
# Uso de loc y iloc
df.loc[1:3, ['Nombre', 'Nota']]  # Filtrar por etiquetas
df.iloc[1:3, 1:3]  # Filtrar por posiciones

## 4. Aplicación de Funciones con `apply`

In [None]:
# Aplicar una función a una columna
def clasificar_nota(nota):
    return 'Alta' if nota >= 7 else 'Baja'

df['Clasificación'] = df['Nota'].apply(clasificar_nota)
df

## 5. Ordenamiento de Datos

In [None]:
# Ordenar por una columna
df.sort_values(by='Edad', ascending=True)


In [None]:
# Ordenar por múltiples columnas
df.sort_values(by=['Clasificación', 'Edad'], ascending=[True, False])