# Introducción a Pandas

Aprenderemos a utilizar pandas para el análisis de datos:

* Introducción a Pandas
* Series
* DataFrames
* Valores perdidos

___

In [None]:
"""
!pip install numpy
!pip install pandas
"""



In [None]:
import numpy as np
import pandas as pd

# Series

### Creando Series

Conversión de una lista, Array Numpy o diccionario a Series.

Primero, creamos las variables:

In [None]:
# The same three values stored as a list, a NumPy array and a dict keyed by label
labels = ['a', 'b', 'c']
my_list = [10, 20, 30]
arr = np.array(my_list)
d = dict(zip(labels, my_list))

# Show every object next to its Python type
print(labels, type(labels))
print(my_list, type(my_list))
print(arr, type(arr))
print(d, type(d))

['a', 'b', 'c'] <class 'list'>
[10, 20, 30] <class 'list'>
[10 20 30] <class 'numpy.ndarray'>
{'a': 10, 'b': 20, 'c': 30} <class 'dict'>


### Usando listas

In [None]:
serie_de_lista = pd.Series(data=my_list)
print(serie_de_lista, type(serie_de_lista))

0    10
1    20
2    30
dtype: int64 <class 'pandas.core.series.Series'>


In [None]:
serie_de_lista[1]

np.int64(20)

In [None]:
# Asignamos etiquetas a la serie
serie_de_lista = pd.Series(data=my_list, index=labels)
print(serie_de_lista, type(serie_de_lista))

a    10
b    20
c    30
dtype: int64 <class 'pandas.core.series.Series'>


In [None]:
serie_de_lista["b"]

np.int64(20)

Los parámetros data e index se pueden introducir por posición

In [None]:
serie_de_lista = pd.Series(my_list, labels)
print(serie_de_lista, type(serie_de_lista))

a    10
b    20
c    30
dtype: int64 <class 'pandas.core.series.Series'>


### Usando Arrays

In [None]:
serie_de_array = pd.Series(arr, labels)
print(serie_de_array, type(serie_de_array))

a    10
b    20
c    30
dtype: int64 <class 'pandas.core.series.Series'>


### Usando Diccionarios

In [None]:
serie_de_diccionario = pd.Series(d)
print(serie_de_diccionario, type(serie_de_diccionario))

a    10
b    20
c    30
dtype: int64 <class 'pandas.core.series.Series'>


### Usando índices

La clave para usar Series es entender sus índices. Pandas puede usar índices numéricos o de texto.

In [None]:
# Fix the seed so the random values shown below are the same on every run
# (same seed value the notebook uses later for its DataFrame examples).
np.random.seed(123)

# Two Series that share some index labels ('EEUU', 'Alemania', 'Japón') but not all:
# 'Rusia' only exists in ser1 and 'Italia' only exists in ser2.
ser1 = pd.Series(np.random.rand(4), index=['EEUU', 'Alemania', 'Rusia', 'Japón'])
ser2 = pd.Series(np.random.rand(4), index=['EEUU', 'Alemania', 'Italia', 'Japón'])
print(ser1)
print()
print(ser2)

EEUU        0.711566
Alemania    0.476725
Rusia       0.198772
Japón       0.832871
dtype: float64

EEUU        0.013746
Alemania    0.519628
Italia      0.091854
Japón       0.821003
dtype: float64


En los notebooks puedo ver el contenido de un objeto de pandas con la función `display`.

In [None]:
display(ser1)
display(ser2)

Unnamed: 0,0
EEUU,0.711566
Alemania,0.476725
Rusia,0.198772
Japón,0.832871


Unnamed: 0,0
EEUU,0.013746
Alemania,0.519628
Italia,0.091854
Japón,0.821003


Las Series de Pandas usan la sintaxis de los diccionarios para acceder a los valores.

Pero ahora, la clave de acceso puede ser una lista de índices en lugar de sólo una clave de diccionario.

In [None]:
print(ser1['EEUU'])


0.7115655870770612


In [None]:
print("Como serie de índices")
print(ser1[['EEUU']])


Como serie de índices
EEUU    0.711566
dtype: float64


In [None]:

print(ser1[['Rusia','EEUU']])


Rusia    0.198772
EEUU     0.711566
dtype: float64


In [None]:

print(ser1[['Rusia','EEUU', 'Japón']])

Rusia    0.198772
EEUU     0.711566
Japón    0.832871
dtype: float64


## Operaciones con series

In [None]:
ser1

Unnamed: 0,0
EEUU,0.711566
Alemania,0.476725
Rusia,0.198772
Japón,0.832871


In [None]:
ser2

Unnamed: 0,0
EEUU,0.013746
Alemania,0.519628
Italia,0.091854
Japón,0.821003


Las operaciones se realizan en función del índice

In [None]:
suma = ser1 + ser2

In [None]:
suma

Unnamed: 0,0
Alemania,0.996353
EEUU,0.725312
Italia,
Japón,1.653873
Rusia,


In [None]:
ser1 * ser2

Unnamed: 0,0
Alemania,0.24772
EEUU,0.009781
Italia,
Japón,0.683789
Rusia,


In [None]:
type(ser1 + ser2)

# DataFrames

Podemos ver un DataFrame como un conjunto de objetos `Series` unidos.

In [None]:
import pandas as pd
import numpy as np
from numpy.random import randn
np.random.seed(123)

In [None]:
indice = 'A B C D E'.split()
columnas = 'W X Y Z'.split()
print(indice, columnas)

['A', 'B', 'C', 'D', 'E'] ['W', 'X', 'Y', 'Z']


In [None]:
df = pd.DataFrame(randn(5,4),index=indice, columns=columnas)
df

Unnamed: 0,W,X,Y,Z
A,-1.085631,0.997345,0.282978,-1.506295
B,-0.5786,1.651437,-2.426679,-0.428913
C,1.265936,-0.86674,-0.678886,-0.094709
D,1.49139,-0.638902,-0.443982,-0.434351
E,2.20593,2.186786,1.004054,0.386186


## Selección e indexación


### Selección de columnas

In [None]:
df['W']

Unnamed: 0,W
A,-1.085631
B,-0.5786
C,1.265936
D,1.49139
E,2.20593


In [None]:
type(df)

Cuando extraigo una única columna, o fila, obtengo un objeto `Series`

In [None]:
type(df['W'])

Como vemos la columna W es simplemente una Serie

Pero si selecciono dos, sigue devolviendo una tabla, es decir, un DataFrame

In [None]:
# Selección de varias columnas por su nombre
lista_col = ["Z", "W"]
df[lista_col]

Unnamed: 0,Z,W
A,-1.506295,-1.085631
B,-0.428913,-0.5786
C,-0.094709,1.265936
D,-0.434351,1.49139
E,0.386186,2.20593


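Pandas también permite acceder a una columna como atributo (notación de punto, `df.W`); no obstante, no se recomienda su uso, porque falla si el nombre de la columna contiene espacios o coincide con un atributo o método del DataFrame.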

In [None]:
df.W

Unnamed: 0,W
A,-1.085631
B,-0.5786
C,1.265936
D,1.49139
E,2.20593


### Selección de filas

In [None]:
# Select a single row by its index label with .loc
df.loc['B']

Unnamed: 0,B
W,-0.5786
X,1.651437
Y,-2.426679
Z,-0.428913


In [None]:
type(df.loc['B'])

In [None]:
# Select a single row by its integer position with .iloc
# (position 3 is the fourth row, labelled 'D' in this DataFrame)
df.iloc[3]

Unnamed: 0,D
W,1.49139
X,-0.638902
Y,-0.443982
Z,-0.434351


### Selección de un subset de datos

In [None]:
df.loc[['A','C'],['W','Y']]


Unnamed: 0,W,Y
A,-1.085631,0.282978
C,1.265936,-0.678886


In [None]:
# Selección de 2 filas y todas las columnas.
df.loc[['A','B'],]

Unnamed: 0,W,X,Y,Z
A,-1.085631,0.997345,0.282978,-1.506295
B,-0.5786,1.651437,-2.426679,-0.428913


In [None]:
# First two rows and every column in a single .iloc[rows, columns] call.
# A chained second [:] would slice rows again, not columns.
df.iloc[:2, :]

Unnamed: 0,W,X,Y,Z
A,-1.085631,0.997345,0.282978,-1.506295
B,-0.5786,1.651437,-2.426679,-0.428913


In [None]:
df.iloc[:2,1:]

Unnamed: 0,X,Y,Z
A,0.997345,0.282978,-1.506295
B,1.651437,-2.426679,-0.428913


In [None]:
df.iloc[0:2,:]

Unnamed: 0,W,X,Y,Z
A,-1.085631,0.997345,0.282978,-1.506295
B,-0.5786,1.651437,-2.426679,-0.428913


## Creando una nueva columna

In [None]:
df['nueva'] = df['W'] + df['Y']
df

Unnamed: 0,W,X,Y,Z,nueva
A,-1.085631,0.997345,0.282978,-1.506295,-0.802652
B,-0.5786,1.651437,-2.426679,-0.428913,-3.005279
C,1.265936,-0.86674,-0.678886,-0.094709,0.58705
D,1.49139,-0.638902,-0.443982,-0.434351,1.047408
E,2.20593,2.186786,1.004054,0.386186,3.209984


In [None]:
df['clase'] = 0
df

Unnamed: 0,W,X,Y,Z,nueva,clase
A,-1.085631,0.997345,0.282978,-1.506295,-0.802652,0
B,-0.5786,1.651437,-2.426679,-0.428913,-3.005279,0
C,1.265936,-0.86674,-0.678886,-0.094709,0.58705,0
D,1.49139,-0.638902,-0.443982,-0.434351,1.047408,0
E,2.20593,2.186786,1.004054,0.386186,3.209984,0


## Eliminar columnas

In [None]:
df

Unnamed: 0,W,X,Y,Z,nueva,clase
A,-1.085631,0.997345,0.282978,-1.506295,-0.802652,0
B,-0.5786,1.651437,-2.426679,-0.428913,-3.005279,0
C,1.265936,-0.86674,-0.678886,-0.094709,0.58705,0
D,1.49139,-0.638902,-0.443982,-0.434351,1.047408,0
E,2.20593,2.186786,1.004054,0.386186,3.209984,0


In [None]:
df.drop(columns="nueva")

Unnamed: 0,W,X,Y,Z,clase
A,-1.085631,0.997345,0.282978,-1.506295,0
B,-0.5786,1.651437,-2.426679,-0.428913,0
C,1.265936,-0.86674,-0.678886,-0.094709,0
D,1.49139,-0.638902,-0.443982,-0.434351,0
E,2.20593,2.186786,1.004054,0.386186,0


In [None]:
df.drop('nueva', axis=1)

Unnamed: 0,W,X,Y,Z,clase
A,-1.085631,0.997345,0.282978,-1.506295,0
B,-0.5786,1.651437,-2.426679,-0.428913,0
C,1.265936,-0.86674,-0.678886,-0.094709,0
D,1.49139,-0.638902,-0.443982,-0.434351,0
E,2.20593,2.186786,1.004054,0.386186,0


Pero el dataframe continúa íntegro

In [None]:
df

Unnamed: 0,W,X,Y,Z,nueva,clase
A,-1.085631,0.997345,0.282978,-1.506295,-0.802652,0
B,-0.5786,1.651437,-2.426679,-0.428913,-3.005279,0
C,1.265936,-0.86674,-0.678886,-0.094709,0.58705,0
D,1.49139,-0.638902,-0.443982,-0.434351,1.047408,0
E,2.20593,2.186786,1.004054,0.386186,3.209984,0


Hay 2 maneras diferentes de borrar una columna y que el cambio se guarde en el DataFrame original
```
df = df.drop('nombre_columna',axis=1)
```
o bien:
```
df.drop('nombre_columna',axis=1, inplace=True)
```


In [None]:
df.drop('nueva',axis=1, inplace=True)
df

Unnamed: 0,W,X,Y,Z,clase
A,-1.085631,0.997345,0.282978,-1.506295,0
B,-0.5786,1.651437,-2.426679,-0.428913,0
C,1.265936,-0.86674,-0.678886,-0.094709,0
D,1.49139,-0.638902,-0.443982,-0.434351,0
E,2.20593,2.186786,1.004054,0.386186,0


In [None]:
df = df.drop('clase', axis=1)
df

Unnamed: 0,W,X,Y,Z
A,-1.085631,0.997345,0.282978,-1.506295
B,-0.5786,1.651437,-2.426679,-0.428913
C,1.265936,-0.86674,-0.678886,-0.094709
D,1.49139,-0.638902,-0.443982,-0.434351
E,2.20593,2.186786,1.004054,0.386186


Si intento borrar una columna que ya no existe obtengo un error

In [None]:
# Deleting a column that no longer exists raises KeyError. Catch it and show the
# message so the demonstration does not abort a full "Run All" of the notebook.
try:
    del df["clase"]
except KeyError as error:
    print(f"KeyError: {error}")

KeyError: 'clase'

In [None]:
df

Unnamed: 0,W,X,Y,Z
A,-1.085631,0.997345,0.282978,-1.506295
B,-0.5786,1.651437,-2.426679,-0.428913
C,1.265936,-0.86674,-0.678886,-0.094709
D,1.49139,-0.638902,-0.443982,-0.434351
E,2.20593,2.186786,1.004054,0.386186


## Eliminar filas

In [None]:
df.drop('D')

Unnamed: 0,W,X,Y,Z
A,-1.085631,0.997345,0.282978,-1.506295
B,-0.5786,1.651437,-2.426679,-0.428913
C,1.265936,-0.86674,-0.678886,-0.094709
E,2.20593,2.186786,1.004054,0.386186


A menos que lo especifiquemos con inplace, no se elimina nada

In [None]:
df

Unnamed: 0,W,X,Y,Z
A,-1.085631,0.997345,0.282978,-1.506295
B,-0.5786,1.651437,-2.426679,-0.428913
C,1.265936,-0.86674,-0.678886,-0.094709
D,1.49139,-0.638902,-0.443982,-0.434351
E,2.20593,2.186786,1.004054,0.386186


Hay 2 maneras diferentes de borrar una fila y que el cambio se guarde en el DataFrame original
```
df = df.drop('D',axis=0)
```
o bien:
```
df.drop('D',axis=0, inplace=True)
```





**Eliminar filas**

In [None]:
df.drop('E',axis=0, inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,-1.085631,0.997345,0.282978,-1.506295
B,-0.5786,1.651437,-2.426679,-0.428913
C,1.265936,-0.86674,-0.678886,-0.094709
D,1.49139,-0.638902,-0.443982,-0.434351


In [None]:
df.drop(index="C", inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,-1.085631,0.997345,0.282978,-1.506295
B,-0.5786,1.651437,-2.426679,-0.428913
D,1.49139,-0.638902,-0.443982,-0.434351


In [None]:
df.shape

(3, 4)

Número de filas

In [None]:
df.shape[0]

3

Número de columnas

In [None]:
df.shape[1]

4

## Filtros

### Selección condicional


Una importante característica de Pandas es la selección condicional de manera muy similar a Numpy:

In [None]:
df

Unnamed: 0,W,X,Y,Z
A,-1.085631,0.997345,0.282978,-1.506295
B,-0.5786,1.651437,-2.426679,-0.428913
D,1.49139,-0.638902,-0.443982,-0.434351


In [None]:
df>0

Unnamed: 0,W,X,Y,Z
A,False,True,True,False
B,False,True,False,False
D,True,False,False,False


Este dataframe booleano funciona como un filtro

In [None]:
guardable = df[df>0]
guardable

Unnamed: 0,W,X,Y,Z
A,,0.997345,0.282978,
B,,1.651437,,
D,1.49139,,,


Al guardar el DataFrame en un csv, los valores NaN quedan como casillas vacías. A la inversa, al leer un csv las casillas vacías se convierten en NaN.

In [None]:
guardable.to_csv("guardable.csv")

Filtrado de todo el DataFrame utilizando sólo los valores de una columna

In [None]:
print(df['W']>0)
df_filtrado = df[df['W']>0]
df_filtrado

A    False
B    False
D     True
Name: W, dtype: bool


Unnamed: 0,W,X,Y,Z
D,1.49139,-0.638902,-0.443982,-0.434351


In [None]:
df

Unnamed: 0,W,X,Y,Z
A,-1.085631,0.997345,0.282978,-1.506295
B,-0.5786,1.651437,-2.426679,-0.428913
D,1.49139,-0.638902,-0.443982,-0.434351


In [None]:
# Boolean mask: True for the rows whose 'W' value is positive
print(df['W'] > 0)
# Filter the rows and pick column 'Y' in one .loc call instead of chained indexing
df.loc[df['W'] > 0, 'Y']

A    False
B    False
D     True
Name: W, dtype: bool


Unnamed: 0,Y
D,-0.443982


In [None]:
# Rows with a positive 'W', keeping only columns 'Y' and 'X' (in that order),
# selected with a single .loc call instead of chained indexing
df.loc[df['W'] > 0, ['Y', 'X']]

Unnamed: 0,Y,X
D,-0.443982,-0.638902


Podemos concatenar condiciones con `|` y `&`.  Deberemos encerrar entre paréntesis cada una de las condiciones:

In [None]:
df

Unnamed: 0,W,X,Y,Z
A,-1.085631,0.997345,0.282978,-1.506295
B,-0.5786,1.651437,-2.426679,-0.428913
D,1.49139,-0.638902,-0.443982,-0.434351


In [None]:
df[(df['W']>0) & (df['Y'] < -0.9)]

Unnamed: 0,W,X,Y,Z


In [None]:
df[(df['W']>0) | (df['Y'] > 0.5)]

Unnamed: 0,W,X,Y,Z
D,1.49139,-0.638902,-0.443982,-0.434351


### Filtrar fila por condición

In [None]:
df

Unnamed: 0,W,X,Y,Z
A,-1.085631,0.997345,0.282978,-1.506295
B,-0.5786,1.651437,-2.426679,-0.428913
D,1.49139,-0.638902,-0.443982,-0.434351


In [None]:
df.loc['A'] > 0

Unnamed: 0,A
W,False
X,True
Y,True
Z,False


In [None]:
df.loc[:,df.loc['A'] > 0]

Unnamed: 0,X,Y
A,0.997345,0.282978
B,1.651437,-2.426679
D,-0.638902,-0.443982


In [None]:
df.loc['A'][df.loc['A'] > 0]

Unnamed: 0,A
X,0.997345
Y,0.282978


In [None]:
df.iloc[0][df.iloc[0] > 0]

Unnamed: 0,A
X,0.997345
Y,0.282978


## Más sobre índices

In [None]:
df

Unnamed: 0,W,X,Y,Z
A,-1.085631,0.997345,0.282978,-1.506295
B,-0.5786,1.651437,-2.426679,-0.428913
D,1.49139,-0.638902,-0.443982,-0.434351


In [None]:
# reset_index() returns a new DataFrame with a 0..n-1 integer index; the previous
# index is kept as a regular column named 'index'. `df` itself is not modified.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,-1.085631,0.997345,0.282978,-1.506295
1,B,-0.5786,1.651437,-2.426679,-0.428913
2,D,1.49139,-0.638902,-0.443982,-0.434351


In [None]:
nuevoindice = 'CA NY WY OR CO'.split()

In [None]:
nuevoindice

['CA', 'NY', 'WY', 'OR', 'CO']

In [None]:
df

Unnamed: 0,W,X,Y,Z
A,-1.085631,0.997345,0.282978,-1.506295
B,-0.5786,1.651437,-2.426679,-0.428913
D,1.49139,-0.638902,-0.443982,-0.434351


In [None]:
df = pd.DataFrame(randn(5,4),index=range(5), columns=columnas)
df

Unnamed: 0,W,X,Y,Z
0,0.737369,1.490732,-0.935834,1.175829
1,-1.253881,-0.637752,0.907105,-1.428681
2,-0.140069,-0.861755,-0.255619,-2.798589
3,-1.771533,-0.699877,0.927462,-0.173636
4,0.002846,0.688223,-0.879536,0.283627


In [None]:
df['Estados'] = nuevoindice
df

Unnamed: 0,W,X,Y,Z,Estados
0,0.737369,1.490732,-0.935834,1.175829,CA
1,-1.253881,-0.637752,0.907105,-1.428681,NY
2,-0.140069,-0.861755,-0.255619,-2.798589,WY
3,-1.771533,-0.699877,0.927462,-0.173636,OR
4,0.002846,0.688223,-0.879536,0.283627,CO


Utilizamos la columna Estados como índice en el dataset

In [None]:
df.set_index('Estados')

Unnamed: 0_level_0,W,X,Y,Z
Estados,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,0.737369,1.490732,-0.935834,1.175829
NY,-1.253881,-0.637752,0.907105,-1.428681
WY,-0.140069,-0.861755,-0.255619,-2.798589
OR,-1.771533,-0.699877,0.927462,-0.173636
CO,0.002846,0.688223,-0.879536,0.283627


Tenemos que tener en cuenta que, si no usamos el argumento inplace ni reasignamos el resultado (`df = df.set_index('Estados')`), los cambios no se aplican al DataFrame original

In [None]:
df

Unnamed: 0,W,X,Y,Z,Estados
0,0.737369,1.490732,-0.935834,1.175829,CA
1,-1.253881,-0.637752,0.907105,-1.428681,NY
2,-0.140069,-0.861755,-0.255619,-2.798589,WY
3,-1.771533,-0.699877,0.927462,-0.173636,OR
4,0.002846,0.688223,-0.879536,0.283627,CO


In [None]:
df.set_index('Estados', inplace=True)
df

Unnamed: 0_level_0,W,X,Y,Z
Estados,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,0.737369,1.490732,-0.935834,1.175829
NY,-1.253881,-0.637752,0.907105,-1.428681
WY,-0.140069,-0.861755,-0.255619,-2.798589
OR,-1.771533,-0.699877,0.927462,-0.173636
CO,0.002846,0.688223,-0.879536,0.283627


In [None]:
df.loc["NY"]

Unnamed: 0,NY
W,-1.253881
X,-0.637752
Y,0.907105
Z,-1.428681


## Índices múltiples y jerarquía en los índices

In [None]:
# Build a two-level index: an outer group label and an inner number within each group
outside = ['G1', 'G1', 'G1', 'G2', 'G2', 'G3']
inside = [1, 2, 3, 1, 2, 1]
# One array per level; entries at the same position form one (outer, inner) pair
hier_index = pd.MultiIndex.from_arrays([outside, inside])

In [None]:
print (outside)
print (inside)
print (hier_index)


['G1', 'G1', 'G1', 'G2', 'G2', 'G3']
[1, 2, 3, 1, 2, 1]
MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G3', 1)],
           )


In [None]:
df = pd.DataFrame(np.random.randn(6,2), index=hier_index, columns=['A','B'])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-0.805367,-1.727669
G1,2,-0.3909,0.573806
G1,3,0.338589,-0.01183
G2,1,2.392365,0.412912
G2,2,0.978736,2.238143
G3,1,-1.294085,-1.038788


¿Cómo extraemos los datos en base a este índice doble?

In [None]:
# Haciendo uso de .loc
df.loc['G1']

Unnamed: 0,A,B
1,-0.805367,-1.727669
2,-0.3909,0.573806
3,0.338589,-0.01183


In [None]:
df.loc['G1'].iloc[0]

Unnamed: 0,1
A,-0.805367
B,-1.727669


In [None]:
df.loc['G1'].iloc[1:]

Unnamed: 0,A,B
2,-0.3909,0.573806
3,0.338589,-0.01183


Podemos entender G1, G2 y G3 como los valores de una columna extra que se usa para el filtrado.
Además, a los índices podemos asignarles nombres.

In [None]:
df.index.names

FrozenList([None, None])

In [None]:
df.index.names = ['Grupo','Número']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Grupo,Número,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-0.805367,-1.727669
G1,2,-0.3909,0.573806
G1,3,0.338589,-0.01183
G2,1,2.392365,0.412912
G2,2,0.978736,2.238143
G3,1,-1.294085,-1.038788


In [None]:
# Filtrar por el índice de nombre "Número"
df.xs(1,level='Número')

Unnamed: 0_level_0,A,B
Grupo,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-0.805367,-1.727669
G2,2.392365,0.412912
G3,-1.294085,-1.038788


# Valores perdidos

In [None]:
import numpy as np
import pandas as pd

In [None]:
df = pd.DataFrame({'A':[1,2,np.nan],
                  'B':[5,np.nan,np.nan],
                  'C':[1,2,3]})
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       2 non-null      float64
 1   B       1 non-null      float64
 2   C       3 non-null      int64  
dtypes: float64(2), int64(1)
memory usage: 204.0 bytes


In [None]:
df["A"].std()

0.7071067811865476

In [None]:
df.describe()

Unnamed: 0,A,B,C
count,2.0,1.0,3.0
mean,1.5,5.0,2.0
std,0.707107,,1.0
min,1.0,5.0,1.0
25%,1.25,5.0,1.5
50%,1.5,5.0,2.0
75%,1.75,5.0,2.5
max,2.0,5.0,3.0


In [None]:
df.dropna(axis=0)

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [None]:
df.dropna(axis=1)

Unnamed: 0,C
0,1
1,2
2,3


In [None]:
df.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [None]:
df.dropna(thresh=2, axis=1)

Unnamed: 0,A,C
0,1.0,1
1,2.0,2
2,,3


In [None]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [None]:
df.fillna(value='Valor Rellenado', inplace=True)

  df.fillna(value='Valor Rellenado', inplace=True)


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       3 non-null      object
 1   B       3 non-null      object
 2   C       3 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 204.0+ bytes


In [None]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,Valor Rellenado,2
2,Valor Rellenado,Valor Rellenado,3


In [None]:
# After filling the gaps with a string, columns A and B mix floats and text, so
# adding them raises TypeError. Catch it and show the message so the demonstration
# does not abort a full "Run All" of the notebook.
try:
    df["A"] + df["B"]
except TypeError as error:
    print(f"TypeError: {error}")

TypeError: unsupported operand type(s) for +: 'float' and 'str'

Un caso más elaborado (y habitual), sería el de imputar la media de su columna a los NA

In [None]:
pd.DataFrame(data = np.random.rand(5,5), columns = 'A B C D E'.split())

Unnamed: 0,A,B,C,D,E
0,0.417022,0.681301,0.875457,0.510422,0.669314
1,0.585937,0.624904,0.674689,0.842342,0.083195
2,0.763683,0.243666,0.194223,0.572457,0.095713
3,0.885327,0.627249,0.723416,0.016129,0.594432
4,0.556785,0.15896,0.153071,0.69553,0.318766


In [None]:
df = pd.DataFrame({'A':[1,2,np.nan],
                  'B':[5,np.nan,np.nan],
                  'C':[1,2,3]})
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [None]:
df["B"].fillna(value=df["C"])

Unnamed: 0,B
0,5.0
1,2.0
2,3.0


In [None]:
media = df['A'].mean()

In [None]:
media

np.float64(1.5)

In [None]:
df["A"] = df['A'].fillna(value=media)

In [None]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,1.5,,3


In [None]:
df["B"] = df['B'].fillna(value=df['B'].max())

In [None]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,5.0,2
2,1.5,5.0,3


# Conceptos Básicos Complementarios

Contenido:
1. Métodos útiles para explorar datos
2. Operaciones comunes con columnas
3. Indexación avanzada
4. Aplicación de funciones con `apply`
5. Ordenamiento de datos

In [None]:
# Importar las librerías necesarias
import pandas as pd
import numpy as np

In [None]:
# Sample data set stored column by column: one entry per student in each list
data = dict(
    Nombre=['Ana', 'Luis', 'María', 'Pedro', 'Sofía'],
    Edad=[23, 45, 34, 25, 40],
    Nota=[8.5, 6.0, 7.5, 9.0, 6.5],
)

# Every key of the dict becomes a column of the DataFrame
df = pd.DataFrame(data=data)
df

Unnamed: 0,Nombre,Edad,Nota
0,Ana,23,8.5
1,Luis,45,6.0
2,María,34,7.5
3,Pedro,25,9.0
4,Sofía,40,6.5


## 1. Exploración de Datos



In [None]:
# Primeras filas del DataFrame
df.head()

Unnamed: 0,Nombre,Edad,Nota
0,Ana,23,8.5
1,Luis,45,6.0
2,María,34,7.5
3,Pedro,25,9.0
4,Sofía,40,6.5


In [None]:
# Información general del DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Nombre  5 non-null      object 
 1   Edad    5 non-null      int64  
 2   Nota    5 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 248.0+ bytes


In [None]:
# Estadísticas descriptivas
df.describe()

Unnamed: 0,Edad,Nota
count,5.0,5.0
mean,33.4,7.5
std,9.449868,1.274755
min,23.0,6.0
25%,25.0,6.5
50%,34.0,7.5
75%,40.0,8.5
max,45.0,9.0


## 2. Operaciones con Columnas



In [None]:
# Crear una nueva columna
df['Nota_ajustada'] = df['Nota'] * 1.1  # Incrementar la nota en un 10%
df

Unnamed: 0,Nombre,Edad,Nota,Nota_ajustada
0,Ana,23,8.5,9.35
1,Luis,45,6.0,6.6
2,María,34,7.5,8.25
3,Pedro,25,9.0,9.9
4,Sofía,40,6.5,7.15


In [None]:
# Drop a column. With inplace=False, drop() returns a new DataFrame without the
# column and leaves `df` untouched, so 'Nota_ajustada' is still present in later cells.
df.drop('Nota_ajustada', axis=1, inplace=False)

Unnamed: 0,Nombre,Edad,Nota
0,Ana,23,8.5
1,Luis,45,6.0
2,María,34,7.5
3,Pedro,25,9.0
4,Sofía,40,6.5


## 3. Indexación Avanzada

In [None]:
# Selección por condición
df[df['Edad'] > 30]  # Filtrar estudiantes mayores de 30 años

Unnamed: 0,Nombre,Edad,Nota,Nota_ajustada
1,Luis,45,6.0,6.6
2,María,34,7.5,8.25
4,Sofía,40,6.5,7.15


In [None]:
# Using loc and iloc. Only the last expression of a cell is rendered automatically,
# so display() is called explicitly to show both results.

# .loc slices by label and includes both ends: rows labelled 1, 2 and 3
display(df.loc[1:3, ['Nombre', 'Nota']])

# .iloc slices by position and excludes the end: rows 1-2 and columns 1-2
display(df.iloc[1:3, 1:3])

Unnamed: 0,Edad,Nota
1,45,6.0
2,34,7.5


## 4. Aplicación de Funciones con `apply`

In [None]:
# Aplicar una función a una columna
def clasificar_nota(nota):
    """Label a grade: 'Alta' when it is 7 or higher, 'Baja' otherwise."""
    if nota >= 7:
        return 'Alta'
    return 'Baja'

df['Clasificación'] = df['Nota'].apply(clasificar_nota)
df

Unnamed: 0,Nombre,Edad,Nota,Nota_ajustada,Clasificación
0,Ana,23,8.5,9.35,Alta
1,Luis,45,6.0,6.6,Baja
2,María,34,7.5,8.25,Alta
3,Pedro,25,9.0,9.9,Alta
4,Sofía,40,6.5,7.15,Baja


## 5. Ordenamiento de Datos

In [None]:
# Ordenar por una columna
df.sort_values(by='Edad', ascending=True)


Unnamed: 0,Nombre,Edad,Nota,Nota_ajustada,Clasificación
0,Ana,23,8.5,9.35,Alta
3,Pedro,25,9.0,9.9,Alta
2,María,34,7.5,8.25,Alta
4,Sofía,40,6.5,7.15,Baja
1,Luis,45,6.0,6.6,Baja


In [None]:
# Ordenar por múltiples columnas
df.sort_values(by=['Clasificación', 'Edad'], ascending=[True, False])

Unnamed: 0,Nombre,Edad,Nota,Nota_ajustada,Clasificación
2,María,34,7.5,8.25,Alta
3,Pedro,25,9.0,9.9,Alta
0,Ana,23,8.5,9.35,Alta
1,Luis,45,6.0,6.6,Baja
4,Sofía,40,6.5,7.15,Baja
