# Jugando con Pandas (contenido opcional)

Pandas es una librería que tiene una cantidad enorme de funciones. Aquí solo veremos algunas de las más utilizadas. Como decimos siempre, no hace falta memorizarlas: cada vez que necesitemos hacer algo específico podemos buscar en internet qué función usar y cómo debemos utilizarla.

Los ejemplos de esta sección están basados en la documentación oficial de [Pandas](https://pandas.pydata.org/docs/user_guide/index.html).

In [None]:
import numpy as np
import pandas as pd

## Crear Series y Dataframes

In [None]:
pd.Series([1, 3, 5, np.nan, 6, 8])

In [None]:
pd.DataFrame({'A': [1, 2, 3]})

In [None]:
dates = pd.date_range("20130101", periods=6)

In [None]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))

In [None]:
df

In [None]:
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

In [None]:
df2

## Inspeccionar elementos

In [None]:
df.head()

In [None]:
df.tail(3)

In [None]:
df.index

In [None]:
df.columns

In [None]:
df.to_numpy()

In [None]:
df.describe()

In [None]:
df.T

In [None]:
#ordenar por nombre de columna
df.sort_index(axis=1, ascending=False)

In [None]:
#ordenar por valores de columna
df.sort_values(by="B")

## Seleccionar

In [None]:
df["A"]

In [None]:
df[0:3]

In [None]:
df.loc["20130102":"20130104", ["A", "B"]]

In [None]:
df.iloc[3:5, 0:2]

In [None]:
df.at[dates[0], "A"]

In [None]:
df.iat[1, 1]

In [None]:
df[df["A"] > 0]

In [None]:
df2 = df.copy()
df2["E"] = ["one", "one", "two", "three", "four", "three"]
df2

In [None]:
df2[df2["E"].isin(["two", "four"])]

## Modificar valores

In [None]:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6))
df["F"] = s1

In [None]:
df.at[dates[0], "A"] = 0

In [None]:
df.iat[0, 1] = 0

In [None]:
df.loc[:, "D"] = np.array([5] * len(df))

In [None]:
df

In [None]:
df2 = df.copy()
df2[df2 > 0] = -df2
df2

## Valores faltantes

In [None]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])
df1.loc[dates[0] : dates[1], "E"] = 1
df1

In [None]:
df1.dropna(how="any")

In [None]:
df1.fillna(value=5)

In [None]:
pd.isna(df1)

## Operaciones

In [None]:
df.mean() #eje 0 por columnas

In [None]:
# Row-wise mean: axis=1 collapses the columns, giving one value per row.
df.mean(axis=1)

In [None]:
df.apply(np.cumsum, axis=0)

In [None]:
df.apply(lambda x: x.max() - x.min())

In [None]:
s = pd.Series(np.random.randint(0, 7, size=10))
print(s)
s.value_counts()

In [None]:
s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])
s.str.lower()

## Combinar dataframes

In [None]:
left = pd.DataFrame({"lkey": ["foo", "bar"], "lval": [1, 2]})
right = pd.DataFrame({"rkey": ["foo", "bar"], "rval": [4, 5]})
print(left)
print(right)

In [None]:
new_df = left.join(right)
new_df

In [None]:
right

In [None]:
top = pd.DataFrame({"lkey": ["foo", "bar"], "lval": [1, 2]})
down = pd.DataFrame({"rkey": ["foo", "bar"], "rval": [4, 5]})
pd.concat([top, down], axis=0)

In [None]:
pd.concat([top, down], axis=1)

In [None]:
top = pd.DataFrame({"key": ["foo1", "bar1"], "val": [1, 2]})
down = pd.DataFrame({"key": ["foo2", "bar2"], "val": [4, 5]})
pd.concat([top, down], axis=0)

In [None]:
top = pd.DataFrame({"key": ["foo1", "bar1"], "val": [1, 2]})
down = pd.DataFrame({"key": ["foo2", "bar2"], "val": [4, 5]})
pd.concat([top, down], axis=1)

## Combinar dataframes con una columna en comun

In [None]:
left = pd.DataFrame({"key": ["foo", "bar"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "bar"], "rval": [4, 5]})
print(left)
print(right)

In [None]:
pd.merge(left, right, on="key")

In [None]:
left = pd.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["bar", "bar"], "rval": [4, 5]})
print(left)
print(right)

In [None]:
pd.merge(left, right, on="key")

## Agrupar

In [None]:
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
df

In [None]:
df_group = df.groupby("A")[["C", "D"]]
df_group.sum()

In [None]:
type(df_group)

In [None]:
type(df_group.sum())

In [None]:
# Per-group mean of the numeric columns. numeric_only=True skips the text
# column "B", which would otherwise raise TypeError on pandas >= 2.0.
df.groupby("A").mean(numeric_only=True)

In [None]:
# Largest per-group mean of column "D". This frame only has the columns
# A, B, C and D, so the previous "Country"/"Rate" lookup raised KeyError.
df.groupby("A")["D"].mean().max()

In [None]:
df.groupby("A")["C"].mean().max()

In [None]:
df.groupby(["A", "B"]).sum()

## Reformar dataframes

In [None]:
tuples = list(
    zip(
        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
        ["one", "two", "one", "two", "one", "two", "one", "two"],
    )
)
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"])
df2 = df[:4]
df2

In [None]:
stacked = df2.stack()
stacked

In [None]:
stacked.unstack()

In [None]:
stacked.unstack(1)

In [None]:
stacked.unstack(0)

## Tablas pivot

In [None]:
df = pd.DataFrame(
    {
        "A": ["one", "one", "two", "three"] * 3,
        "B": ["A", "B", "C"] * 4,
        "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2,
        "D": np.random.randn(12),
        "E": np.random.randn(12),
    }
)
df

In [None]:
pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"])

## Atributos categoricos

In [None]:
df = pd.DataFrame(
    {"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]}
)
df["grade"] = df["raw_grade"].astype("category")
df["grade"]

In [None]:
new_categories = ["very good", "good", "very bad"]
df["grade"] = df["grade"].cat.rename_categories(new_categories)
df

In [None]:
df["grade"] = df["grade"].cat.set_categories(
    ["very bad", "bad", "medium", "good", "very good"]
)
df["grade"]

In [None]:
df.sort_values(by="grade") #ordena por la posicion seteada de las categorias 

In [None]:
# observed=False makes groupby list every declared category, including the
# empty ones (size 0). It is passed explicitly because relying on the implicit
# default for categorical groupers is deprecated.
df.groupby("grade", observed=False).size()

## Leer y escribir en archivos

In [None]:
df.to_csv("foo.csv")

In [None]:
pd.read_csv("foo.csv")

In [None]:
# Write the frame to an Excel workbook, then load the same sheet back.
excel_path = "foo.xlsx"
df.to_excel(excel_path, sheet_name="Sheet1")
pd.read_excel(excel_path, sheet_name="Sheet1", index_col=None, na_values=["NA"])

# Ejercicios

## Crear un Dataframe
Crear un dataframe con dos columnas "A" y "B"
La columna A contiene los siguientes nombres: "Alicia", "Beto", "Camila"
La columna B contiene las respectivas edades: 31,42,35

In [None]:
# Escribir aqui la solucion



In [None]:
#@title Solucion {display-mode:"form"}

df = pd.DataFrame({'A': ["Alicia", "Beto", "Camila"], 'B':[31,42,35]})
df

## Agregar una columna
Agregar la columna "C" con los siguientes datos 1.7,1.72,1.67

In [None]:
# Escribir aqui la solucion



In [None]:
#@title Solucion {display-mode:"form"}

df["C"] = [1.7,1.72,1.67]
df

## Cambiar el indice
Cambiar el indice de 0,1,2 por "a", "b", "c"

In [None]:
# Escribir aqui la solucion



In [None]:
#@title Solucion {display-mode:"form"}

df.index = ["a","b","c"]
df

## Cambiar el nombre de las columnas
Cambiar el nombre de las columnas A, B, C por "Nombre", "Edad", "Estatura"

In [None]:
# Escribir aqui la solucion



In [None]:
#@title Solucion {display-mode:"form"}

df.columns = ["Nombre", "Edad", "Estatura"]
df

## Mostrar la columna "Nombre"
Utilizar la funcion loc

In [None]:
# Escribir aqui la solucion



In [None]:
#@title Solucion {display-mode:"form"}

df.loc[:,"Nombre"]

## Mostrar elementos

Mostrar el elemento del indice "b" y columna "Nombre" usando loc

In [None]:
# Escribir aqui la solucion



In [None]:
#@title Solucion {display-mode:"form"}

df.loc["b","Nombre"]

Mostrar el elemento de la fila 1 y la columna 0 usando iloc

In [None]:
# Escribir aqui la solucion



In [None]:
#@title Solucion {display-mode:"form"}

df.iloc[1,0]

Mostrar la fila 1 usando iloc

In [None]:
# Escribir aqui la solucion



In [None]:
#@title Solucion {display-mode:"form"}

df.iloc[1,:]

## Ordenar dataframe
Ordenar el dataframe por la altura de menor a mayor

In [None]:
# Escribir aqui la solucion



In [None]:
#@title Solucion {display-mode:"form"}

df = df.sort_values(by="Estatura")
df

## Agregar datos al dataframe

In [None]:
df2 = pd.DataFrame({'Nombre': ["Diana", "Ernesto", "Francisco"], 'Edad':[26,24,53], 'Estatura':[1.8,1.76,np.nan]})
df2

Agregar los datos del dataframe df2 a df

In [None]:
# Escribir aqui la solucion



In [None]:
#@title Solucion {display-mode:"form"}

# DataFrame.append was removed in pandas 2.0. pd.concat stacks the rows of
# df2 below those of df and keeps the index labels of both frames.
df = pd.concat([df, df2])
df

## Reemplazar valores faltantes
Reemplazar los valores faltantes en la columna Estatura por la media de dicha columna

In [None]:
# Escribir aqui la solucion



In [None]:
#@title Solucion {display-mode:"form"}

# Fill only the "Estatura" column, as the exercise asks. A frame-wide fillna
# would also write the height mean into missing values of any other column.
estatura_mean = np.round(df["Estatura"].mean(), decimals=2)
df["Estatura"] = df["Estatura"].fillna(estatura_mean)
df

## Mostrar usando condicionales en los indices
Mostrar las filas donde la altura es mayor o igual a la media

In [None]:
# Escribir aqui la solucion



In [None]:
#@title Solucion {display-mode:"form"}

df[df.Estatura >= np.round(df.Estatura.mean(),decimals=2)]

# Fin: [Volver al contenido del curso](https://www.freecodingtour.com/cursos/espanol/datascience/datascience.html)