# Dataframes
**Dataframe:** Es una estructura bidimensional donde las columnas son variables y las filas son registros u observaciones

In [261]:
# Librería para la manipulación de dataframes
import pandas as pd 

## Convertir de diccionario a dataframes

- Tener en cuenta que las ``claves`` son las ``columnas`` del dataframe
- Los ``valores`` son los registros o ``filas`` 

In [353]:
# Keys of the dict become the column names; each list supplies that column's row values.
diccionario = dict(
    variable_1=[20, 30, 40, 10, 50],
    variable_2=[2, 4, 5, 6, 7],
)
df_1 = pd.DataFrame(data=diccionario)
df_1

Unnamed: 0,variable_1,variable_2
0,20,2
1,30,4
2,40,5
3,10,6
4,50,7


<div class="alert alert-block alert-warning">
    <b>Nota:</b> Por defecto, al momento de crear un dataframe se genera la indexación que comienza desde 0
</div>

## Convertir de lista a dataframes

In [263]:
# Column labels, in the same order as the values inside each row below.
nombre_columnas = ['ID', 'Precio', 'Cantidad']

# Row-wise data: every inner list is one record (ID, Precio, Cantidad).
lista = [
    ['XC01', 20000, 3],
    ['XC02', 30000, 2],
    ['XC03', 10000, 4],
]

In [355]:
# Build the frame from the row lists, labelling rows 'Propiedad 1', 'Propiedad 2', ...
# (one label per row of `lista`, numbered from 1).
df_2 = pd.DataFrame(
    lista,
    columns=nombre_columnas,
    index=[f'Propiedad {n}' for n in range(1, len(lista) + 1)],
)
df_2

Unnamed: 0,ID,Precio,Cantidad
Propiedad 1,XC01,20000,3
Propiedad 2,XC02,30000,2
Propiedad 3,XC03,10000,4


### Usar la función zip () 

In [265]:
# Column-wise data: one list per future column.
nombre_columnas = ['ID', 'Precio', 'Cantidad']
variable_1 = ['XC01', 'XC02', 'XC03']
variable_2 = [20000, 30000, 10000]
variable_3 = [3, 2, 4]

# zip() walks the three lists in parallel, yielding one (ID, Precio, Cantidad) tuple per row.
lista_completa = [fila for fila in zip(variable_1, variable_2, variable_3)]
lista_completa

[('XC01', 20000, 3), ('XC02', 30000, 2), ('XC03', 10000, 4)]

In [266]:
# Each tuple in lista_completa becomes one row; nombre_columnas labels the columns.
df_zip = pd.DataFrame(
    lista_completa,
    columns=nombre_columnas,
)
df_zip

Unnamed: 0,ID,Precio,Cantidad
0,XC01,20000,3
1,XC02,30000,2
2,XC03,10000,4


## Modificar la indexación

In [267]:
# df_1 before re-indexing: rows are still labelled by the default integer index.
df_1 

Unnamed: 0,variable_1,variable_2
0,20,2
1,30,4
2,40,5
3,10,6
4,50,7


In [268]:
# Relabel the rows as 'Propiedad 1' ... 'Propiedad N' (numbered from 1), one label per row.
df_1.index = [f'Propiedad {n}' for n in range(1, df_1.shape[0] + 1)]

In [269]:
# Check the result: the row labels are now the 'Propiedad N' strings.
df_1

Unnamed: 0,variable_1,variable_2
Propiedad 1,20,2
Propiedad 2,30,4
Propiedad 3,40,5
Propiedad 4,10,6
Propiedad 5,50,7


## Manipulación de columnas
Podemos manipular de diversas formas:
* Mencionando el nombre de la columna ``df['nombre_columna']``
* Usando el atributo ``.columns`` (por ejemplo, ``df.columns[i]``)
* Usando el indexador ``.loc[]`` $\longrightarrow$ cuando llamamos por nombre o etiqueta
* Usando el indexador ``.iloc[]`` $\longrightarrow$ cuando llamamos por su posición

In [270]:
import pandas as pd

# Load the CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('data/diabetes.csv')
df.head() # show the first 5 rows

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [271]:
df[['Age']] # select the whole 'Age' column; passing a list of names keeps the result a DataFrame

Unnamed: 0,Age
0,50
1,31
2,32
3,21
4,33
...,...
763,63
764,27
765,30
766,47


### Uso de columns

In [272]:
df.columns[2] # name of the column at position 2 (positions start at 0)

'BloodPressure'

In [273]:
# df.columns[[0,1,2]] yields the names at positions 0, 1 and 2;
# indexing df with those names selects the three columns.
df[df.columns[[0,1,2]]]

Unnamed: 0,Pregnancies,Glucose,BloodPressure
0,6,148,72
1,1,85,66
2,8,183,64
3,1,89,66
4,0,137,40
...,...,...,...
763,10,101,76
764,2,122,70
765,5,121,72
766,1,126,60


### Uso de Loc

El indexador ``.loc`` tiene la siguiente sintaxis:

```
df.loc[rango de filas, rango de columnas (nombres)]
```

In [274]:
# ':' keeps every row; ['Age'] selects that column by label.
df.loc[:,['Age']]

Unnamed: 0,Age
0,50
1,31
2,32
3,21
4,33
...,...
763,63
764,27
765,30
766,47


In [275]:
# Every row, two columns chosen by label (returned in the order listed).
df.loc[:,['Age','BMI']]

Unnamed: 0,Age,BMI
0,50,33.6
1,31,26.6
2,32,23.3
3,21,28.1
4,33,43.1
...,...,...
763,63,32.9
764,27,36.8
765,30,26.2
766,47,30.1


In [276]:
# Label-based slice 1:5 - with .loc BOTH endpoints are included, so rows 1 through 5 come back.
df.loc[1:5,['Age','BMI']]

Unnamed: 0,Age,BMI
1,31,26.6
2,32,23.3
3,21,28.1
4,33,43.1
5,30,25.6


In [277]:
# Rows labelled 1 to 2 (inclusive), all columns.
df.loc[1:2,:]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [278]:
# A one-element list of labels returns that row as a one-row DataFrame.
df.loc[[20],:]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
20,3,126,88,41,235,39.3,0.704,27,0


In [279]:
# Row labelled 20, restricted to the 'Age' and 'BMI' columns.
df.loc[[20],['Age','BMI']]

Unnamed: 0,Age,BMI
20,27,39.3


### Uso de Iloc

In [280]:
# Position-based selection: every row, column at position 7 ('Age' in this dataset).
df.iloc[:,[7]]

Unnamed: 0,Age
0,50
1,31
2,32
3,21
4,33
...,...
763,63
764,27
765,30
766,47


In [281]:
# Every row, columns at positions 7 and 8 ('Age' and 'Outcome' in this dataset).
df.iloc[:,[7,8]]

Unnamed: 0,Age,Outcome
0,50,1
1,31,0
2,32,1
3,21,0
4,33,1
...,...,...
763,63,0
764,27,0
765,30,0
766,47,1


In [282]:
# Positional slice 1:5 - with .iloc the end is EXCLUDED, so only positions 1 to 4 come back.
df.iloc[1:5,[7,8]]

Unnamed: 0,Age,Outcome
1,31,0
2,32,1
3,21,0
4,33,1


In [283]:
# Row at position 20, columns at positions 7 and 8.
df.iloc[[20],[7,8]]

Unnamed: 0,Age,Outcome
20,27,0


<div class="alert alert-block alert-info">
    <b>zip:</b> Usamos zip para emparejar cada columna de un dataframe con su posición (0, 1, 2, ...)
</div>

In [284]:
# Pair each column name with its 0-based position. The positions come from
# range(), not from df.index: the index holds ROW labels, so it only matches
# column positions while df still has its default 0..n-1 integer index (it
# would yield 'Propiedad 0', ... once the rows are relabelled).
list(zip(range(df.shape[1]), df.columns))

[(0, 'Pregnancies'),
 (1, 'Glucose'),
 (2, 'BloodPressure'),
 (3, 'SkinThickness'),
 (4, 'Insulin'),
 (5, 'BMI'),
 (6, 'DiabetesPedigreeFunction'),
 (7, 'Age'),
 (8, 'Outcome')]

## Manipulación de filas
Podemos manipular de diversas formas:
* Usando el indexador ``.loc[]`` $\longrightarrow$ cuando llamamos por nombre o etiqueta
* Usando el indexador ``.iloc[]`` $\longrightarrow$ cuando llamamos por su posición

In [285]:
# df before re-indexing: rows are labelled with the default integer index.
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [286]:
# Relabel rows as 'Propiedad 0', 'Propiedad 1', ... (numbered from 0, so label N sits at position N).
df.index = [f'Propiedad {pos}' for pos in range(df.shape[0])]

### Uso de Loc

In [287]:
df.loc[['Propiedad 763'],:] # show the row whose label is 'Propiedad 763'

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Propiedad 763,10,101,76,48,180,32.9,0.171,63,0


### Uso de Iloc

In [288]:
df.iloc[[763]] # show the row at position 763 (this is not the last row; rows 764 onwards follow it)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Propiedad 763,10,101,76,48,180,32.9,0.171,63,0


## Extraer elemento

In [289]:
# Scalar row label + scalar column label returns the single cell value.
df.loc['Propiedad 763', 'Glucose']

101

In [290]:
# Same cell addressed by position: row 763, column 1 ('Glucose').
df.iloc[763,1]

101

In [291]:
# With lists for both row and column the result is a 1x1 DataFrame, not a bare value.
df.loc[['Propiedad 762'],['Glucose']]

Unnamed: 0,Glucose
Propiedad 762,89


## Métodos en dataframes

### shape: Tamaño de un dataframe 

In [292]:
# Small example frame used below to illustrate shape and size.
df_1

Unnamed: 0,variable_1,variable_2
Propiedad 1,20,2
Propiedad 2,30,4
Propiedad 3,40,5
Propiedad 4,10,6
Propiedad 5,50,7


El atributo ``.shape`` devuelve una tupla ``(# filas, # columnas)`` con las dimensiones del dataframe

In [293]:
# (number of rows, number of columns)
df_1.shape

(5, 2)

In [294]:
# .shape is a (rows, columns) tuple, so both counts can be unpacked in one step.
filas, columnas = df_1.shape
print('El número de filas es: {} \nEl número de columnas es: {}'.format(filas, columnas))

El número de filas es: 5 
El número de columnas es: 2


In [295]:
df_1.size # attribute (not a method): total number of cells in the table, i.e. rows x columns

10

### copy: Duplicar un dataframe

<div class="alert alert-block alert-danger">
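<b>Advertencia:</b> Asignar un dataframe a otra variable con <code>=</code> no crea una copia: ambas variables apuntan al mismo objeto, por lo que modificar la nueva variable implicaría modificar el dataframe original. Para obtener un duplicado independiente se usa <code>.copy()</code>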
</div>

In [296]:
# Independent duplicate of df (deep copy, which is also the default of .copy()):
# later edits to df_copia leave df untouched.
df_copia = df.copy(deep=True)

In [297]:
# The copy starts out with the same labels and values as df.
df_copia

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Propiedad 0,6,148,72,35,0,33.6,0.627,50,1
Propiedad 1,1,85,66,29,0,26.6,0.351,31,0
Propiedad 2,8,183,64,0,0,23.3,0.672,32,1
Propiedad 3,1,89,66,23,94,28.1,0.167,21,0
Propiedad 4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
Propiedad 763,10,101,76,48,180,32.9,0.171,63,0
Propiedad 764,2,122,70,27,0,36.8,0.340,27,0
Propiedad 765,5,121,72,23,112,26.2,0.245,30,0
Propiedad 766,1,126,60,0,0,30.1,0.349,47,1


### rename (columns = ): Renombrar columnas

In [298]:
# Rename a subset of the columns via an {old_name: new_name} mapping; columns
# not listed keep their current names. rename() returns a new DataFrame, which
# is bound back to df_copia instead of mutating the object with inplace=True.
df_copia = df_copia.rename(columns={'Glucose': 'Glucosa',
                                    'BMI': 'IMC',
                                    'Age': 'Edad',
                                    'Outcome': 'Target',
                                    'DiabetesPedigreeFunction': 'Beta'})
df_copia

Unnamed: 0,Pregnancies,Glucosa,BloodPressure,SkinThickness,Insulin,IMC,Beta,Edad,Target
Propiedad 0,6,148,72,35,0,33.6,0.627,50,1
Propiedad 1,1,85,66,29,0,26.6,0.351,31,0
Propiedad 2,8,183,64,0,0,23.3,0.672,32,1
Propiedad 3,1,89,66,23,94,28.1,0.167,21,0
Propiedad 4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
Propiedad 763,10,101,76,48,180,32.9,0.171,63,0
Propiedad 764,2,122,70,27,0,36.8,0.340,27,0
Propiedad 765,5,121,72,23,112,26.2,0.245,30,0
Propiedad 766,1,126,60,0,0,30.1,0.349,47,1


### rename(index = ): renombrar indexación

In [299]:
# Rename a single row label via an {old_label: new_label} mapping. The renamed
# frame is bound back to df_copia rather than using inplace=True.
df_copia = df_copia.rename(index={'Propiedad 0': 'Laura Ramirez'})
df_copia

Unnamed: 0,Pregnancies,Glucosa,BloodPressure,SkinThickness,Insulin,IMC,Beta,Edad,Target
Laura Ramirez,6,148,72,35,0,33.6,0.627,50,1
Propiedad 1,1,85,66,29,0,26.6,0.351,31,0
Propiedad 2,8,183,64,0,0,23.3,0.672,32,1
Propiedad 3,1,89,66,23,94,28.1,0.167,21,0
Propiedad 4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
Propiedad 763,10,101,76,48,180,32.9,0.171,63,0
Propiedad 764,2,122,70,27,0,36.8,0.340,27,0
Propiedad 765,5,121,72,23,112,26.2,0.245,30,0
Propiedad 766,1,126,60,0,0,30.1,0.349,47,1


### columns

In [300]:
# Replace ALL column names at once. Assignment is positional, so the list must
# contain exactly one name per existing column, in the current column order.
df_copia.columns = [
    'Embarazo',
    'Glucosa',
    'Presión',
    'Grosor_piel',
    'Insulina',
    'IMC',
    'Beta',
    'Edad',
    'Target',
]

In [301]:
# Check the result: every column now carries its new name.
df_copia

Unnamed: 0,Embarazo,Glucosa,Presión,Grosor_piel,Insulina,IMC,Beta,Edad,Target
Laura Ramirez,6,148,72,35,0,33.6,0.627,50,1
Propiedad 1,1,85,66,29,0,26.6,0.351,31,0
Propiedad 2,8,183,64,0,0,23.3,0.672,32,1
Propiedad 3,1,89,66,23,94,28.1,0.167,21,0
Propiedad 4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
Propiedad 763,10,101,76,48,180,32.9,0.171,63,0
Propiedad 764,2,122,70,27,0,36.8,0.340,27,0
Propiedad 765,5,121,72,23,112,26.2,0.245,30,0
Propiedad 766,1,126,60,0,0,30.1,0.349,47,1


### drop(labels = [ ] , axis = 1): Eliminar columnas

In [302]:
# axis=1 makes `labels` refer to column names. drop() returns a new DataFrame
# without those columns; it is bound back to df_copia instead of using inplace=True.
df_copia = df_copia.drop(labels=['Embarazo', 'Presión'], axis=1)
df_copia

Unnamed: 0,Glucosa,Grosor_piel,Insulina,IMC,Beta,Edad,Target
Laura Ramirez,148,35,0,33.6,0.627,50,1
Propiedad 1,85,29,0,26.6,0.351,31,0
Propiedad 2,183,0,0,23.3,0.672,32,1
Propiedad 3,89,23,94,28.1,0.167,21,0
Propiedad 4,137,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...
Propiedad 763,101,48,180,32.9,0.171,63,0
Propiedad 764,122,27,0,36.8,0.340,27,0
Propiedad 765,121,23,112,26.2,0.245,30,0
Propiedad 766,126,0,0,30.1,0.349,47,1


### drop(labels = [ ] , axis = 0): Eliminar filas

In [303]:
# axis=0 makes `labels` refer to row (index) labels. drop() returns a new DataFrame
# without those rows; it is bound back to df_copia instead of using inplace=True.
df_copia = df_copia.drop(labels=['Laura Ramirez', 'Propiedad 4'], axis=0)
df_copia

Unnamed: 0,Glucosa,Grosor_piel,Insulina,IMC,Beta,Edad,Target
Propiedad 1,85,29,0,26.6,0.351,31,0
Propiedad 2,183,0,0,23.3,0.672,32,1
Propiedad 3,89,23,94,28.1,0.167,21,0
Propiedad 5,116,0,0,25.6,0.201,30,0
Propiedad 6,78,32,88,31.0,0.248,26,1
...,...,...,...,...,...,...,...
Propiedad 763,101,48,180,32.9,0.171,63,0
Propiedad 764,122,27,0,36.8,0.340,27,0
Propiedad 765,121,23,112,26.2,0.245,30,0
Propiedad 766,126,0,0,30.1,0.349,47,1


### unique and value_counts

In [313]:
# Back to the original df: it still has its own column names and all of its rows,
# because the renames and drops were applied to df_copia only.
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Propiedad 0,6,148,72,35,0,33.6,0.627,50,1
Propiedad 1,1,85,66,29,0,26.6,0.351,31,0
Propiedad 2,8,183,64,0,0,23.3,0.672,32,1
Propiedad 3,1,89,66,23,94,28.1,0.167,21,0
Propiedad 4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
Propiedad 763,10,101,76,48,180,32.9,0.171,63,0
Propiedad 764,2,122,70,27,0,36.8,0.340,27,0
Propiedad 765,5,121,72,23,112,26.2,0.245,30,0
Propiedad 766,1,126,60,0,0,30.1,0.349,47,1


In [334]:
# Distinct values of 'Outcome', in order of first appearance. .tolist() converts
# the NumPy array to native Python ints; list(...) would keep NumPy scalars,
# which newer NumPy versions display as np.int64(1) instead of 1.
df['Outcome'].unique().tolist()

[1, 0]

In [335]:
# Number of rows for each distinct 'Outcome' value, most frequent first.
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

# Datos Faltantes

In [348]:
import pandas as pd
# NOTE: this rebinds `df`; from here on it holds the contents of
# Model_creditoPersonal.csv instead of the diabetes data used above.
df = pd.read_csv('data/Model_creditoPersonal.csv')
df

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,92697,1,1.9,3,0,0,0,0,1,0
4996,4997,30,4,15,92037,4,0.4,1,85,0,0,0,1,0
4997,4998,63,39,24,93023,2,0.3,3,0,0,0,0,0,0
4998,4999,65,40,49,90034,3,0.5,2,0,0,0,0,1,0


In [349]:
# isnull() flags the missing cells; sum() then counts them per column.
df.isnull().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [350]:
# Summary of the frame: index range, non-null count and dtype per column, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB
