# Numpy

Es un módulo que nos ayudará a trabajar con arrays (similares a las listas), pero más rápidos de procesar

In [1]:
import numpy as np

## Crear un array a partir de una lista

In [2]:
a = np.array([1,2,3,4])
print(a)

[1 2 3 4]


In [3]:
type(a)

numpy.ndarray

## Crear un array a partir de una tupla

In [4]:
b = np.array((1,2,3))
print(b)

[1 2 3]


## Dimensiones de un array

In [8]:
unidimensional = np.array([1,2,3,4,5])
unidimensional

array([1, 2, 3, 4, 5])

In [9]:
unidimensional.ndim

1

In [10]:
bidimensional = np.array([[1,2,3],
                          [4,5,6]])
bidimensional

array([[1, 2, 3],
       [4, 5, 6]])

In [11]:
bidimensional.ndim

2

In [12]:
tridimensional = np.array([[[1,2,3], 
                            [4,5,6]], 
                           
                           [[1,2,3],
                            [4,5,6]] ])
tridimensional

array([[[1, 2, 3],
        [4, 5, 6]],

       [[1, 2, 3],
        [4, 5, 6]]])

In [13]:
tridimensional.ndim

3

## Acceder a los elementos de un array

In [14]:
unidimensional[2]

3

In [19]:
bidimensional[0][2], bidimensional[0,2]

(3, 3)

In [20]:
tridimensional[1,0,2]

3

## Filtrar arrays

In [21]:
tridimensional

array([[[1, 2, 3],
        [4, 5, 6]],

       [[1, 2, 3],
        [4, 5, 6]]])

In [23]:
# Boolean-mask indexing: the comparison builds a True/False array with the same
# shape as tridimensional, and indexing with it returns the matching (even)
# elements as a flat 1-D array.
tridimensional[tridimensional%2 == 0]

array([2, 4, 6, 2, 4, 6])

## shape: número de elementos de cada dimensión

In [24]:
unidimensional

array([1, 2, 3, 4, 5])

In [25]:
unidimensional.shape

(5,)

In [26]:
bidimensional

array([[1, 2, 3],
       [4, 5, 6]])

In [27]:
bidimensional.shape

(2, 3)

In [28]:
tridimensional

array([[[1, 2, 3],
        [4, 5, 6]],

       [[1, 2, 3],
        [4, 5, 6]]])

In [31]:
tridimensional.shape

(2, 2, 3)

In [39]:
# Rearrange the same 12 elements into shape (3, 2, 2). The result is only
# displayed; tridimensional is not reassigned here.
tridimensional.reshape(3,2,2)

array([[[1, 2],
        [3, 4]],

       [[5, 6],
        [1, 2]],

       [[3, 4],
        [5, 6]]])

## Operaciones

In [41]:
a = np.array([2,4,5,1])
b = np.array([1,3,2,5])

c = [2,4,5,1]
d = [1,3,2,5]

In [42]:
a + b

array([3, 7, 7, 6])

In [43]:
c + d

[2, 4, 5, 1, 1, 3, 2, 5]

In [44]:
a - b

array([ 1,  1,  3, -4])

In [45]:
a * b

array([ 2, 12, 10,  5])

In [46]:
a/ b

array([2.        , 1.33333333, 2.5       , 0.2       ])

## Ejemplito


Realizar una función que tenga como salida la probabilidad ($P_i$) y como parámetros $x$ y $\beta$ 

$$
P_i = P(x_i) = \frac{1}{1+e^{-\displaystyle\sum_{j=0}^{k}\beta_j \cdot x_{ij}}}
$$

La probabilidad que estamos calculando es la probabilidad para cada observación dentro de un registro (i : filas) 

Partiremos del siguiente dataframe, para evaluar posteriormente

|x_1|x_2|
|-|-|
|1|5|
|1|4|
|5|1|

y $\beta = \begin{bmatrix} 
	5  \\
	4  \\
	\end{bmatrix}$

Entonces Para $P_0$ tendríamos lo siguiente:
$x_{0j} = \begin{bmatrix} 
	1 & 5 \\
	\end{bmatrix}$
$$
P_0 = P(x_0) = \frac{1}{1+e^{-\displaystyle\sum^{1}_{j=0}\beta_j \cdot x_{0j} }}\\
P_0 = \displaystyle\frac{1}{1+e^{-1(\beta_0 \cdot x_{00} + \beta_1 \cdot x_{01}) } } = \displaystyle\frac{1}{1+e^{-(5*1 + 4*5)} }\\
P_0 = \frac{1}{1+e^{-25}} = 0.999999999986112
$$

Entonces Para $P_1$ tendríamos lo siguiente:
$x_{1j} = \begin{bmatrix} 
	1 & 4 \\
	\end{bmatrix}$
$$
P_1 = P(x_1) = \frac{1}{1+e^{-\displaystyle\sum^{1}_{j=0}\beta_j \cdot x_{1j} }}\\
P_1 = \displaystyle\frac{1}{1+e^{-1(\beta_0 \cdot x_{10} + \beta_1 \cdot x_{11}) } } = \displaystyle\frac{1}{1+e^{-(5*1 + 4*4)} }\\
P_1 = \frac{1}{1+e^{-21}} = 0.9999999992417439
$$

Entonces Para $P_2$ tendríamos lo siguiente:
$x_{2j} = \begin{bmatrix} 
	5 & 1 \\
	\end{bmatrix}$
$$
P_2 = P(x_2) = \frac{1}{1+e^{-\displaystyle\sum^{1}_{j=0}\beta_j \cdot x_{2j} }}\\
P_2 = \displaystyle\frac{1}{1+e^{-1(\beta_0 \cdot x_{20} + \beta_1 \cdot x_{21}) } } = \displaystyle\frac{1}{1+e^{-(5*5 + 4*1)} }\\
P_2 = \frac{1}{1+e^{-29}} = 0.9999999999997455
$$

In [68]:
def probabilidad(X, beta):
    """Return the logistic probability of every row of ``X``.

    For each row ``i`` this evaluates
    ``1 / (1 + exp(-sum_j X[i][j] * beta[j]))``.

    Parameters
    ----------
    X : 2-D array-like of numbers
        One observation per row, one variable per column.
    beta : 1-D array-like of numbers
        Coefficients; the first ``X.shape[1]`` entries are used, one per
        column of ``X``.

    Returns
    -------
    list of float
        One probability per row of ``X``, in row order. A matrix with zero
        columns yields ``0.5`` for every row (empty sum -> exponent 0).

    Raises
    ------
    IndexError
        If ``X`` has fewer than two dimensions or ``beta`` has fewer entries
        than ``X`` has columns.
    """
    X = np.asarray(X, dtype=float)
    beta = np.asarray(beta, dtype=float)

    n_columnas = np.shape(X)[1]  # IndexError when X is not at least 2-D
    if beta.shape[0] < n_columnas:
        raise IndexError(
            "beta has %d entries but X has %d columns" % (beta.shape[0], n_columnas)
        )

    # Linear predictor for all rows at once (replaces the nested Python loops).
    z = X @ beta[:n_columnas]

    # Logistic function applied element-wise; computed once per row.
    return (1.0 / (1.0 + np.exp(-z))).tolist()

In [69]:
matriz = np.array([[1,5],
                   [1,4],
                   [5,1]])
beta = np.array([5,4])

In [70]:
probabilidad(matriz, beta) 

[0.999999999986112, 0.9999999992417439, 0.9999999999997455]

# Dataframes
**Dataframe:** Estructura bidimensional donde las filas son registros y las columnas son las variables

In [71]:
import pandas as pd

## Convertir de diccionario a dataframe

In [73]:
diccionario = {'variable_1': [20,30,40,10,50],
               'variable_2': [2,4,5,6,7]}
df_1 = pd.DataFrame(diccionario)
df_1

Unnamed: 0,variable_1,variable_2
0,20,2
1,30,4
2,40,5
3,10,6
4,50,7


## Convertir de lista a dataframe

In [74]:
nombre_columnas = ['ID','Precio','Cantidad']
matriz = [['XC01', 2000,3],
          ['XC02', 3000, 2],
          ['XC03', 10000,4]]

In [88]:
# Build the frame from the list of rows and label the rows "Propiedad 1..n".
# The row count is taken from the input list itself: df_2 does not exist yet
# while the index argument is being evaluated, so reading df_2.shape here
# raises NameError on a fresh kernel.
df_2 = pd.DataFrame(data = matriz,
                    columns = nombre_columnas,
                    index = ['Propiedad ' + str(i+1) for i in range(len(matriz))])
df_2

Unnamed: 0,ID,Precio,Cantidad
Propiedad 1,XC01,2000,3
Propiedad 2,XC02,3000,2
Propiedad 3,XC03,10000,4


In [78]:
df_2.shape[0]

3

### Usar la función zip()

In [89]:
nombre_columnas = ['ID','Precio','Cantidad']
variable_1 = ['XC01','XC02','XC03']
variable_2 = [2000,3000,10000]
variable_3 = [3,2,4]

In [90]:
matriz_completa = list(zip(variable_1, variable_2, variable_3))

In [91]:
df_zip = pd.DataFrame(data = matriz_completa,
                     columns = nombre_columnas) 

In [92]:
df_zip

Unnamed: 0,ID,Precio,Cantidad
0,XC01,2000,3
1,XC02,3000,2
2,XC03,10000,4


## Modificar indexación

In [93]:
df_1

Unnamed: 0,variable_1,variable_2
0,20,2
1,30,4
2,40,5
3,10,6
4,50,7


In [96]:
df_1.index = ['Propiedad ' + str(i+1) for i in range(df_1.shape[0])]

In [97]:
df_1

Unnamed: 0,variable_1,variable_2
Propiedad 1,20,2
Propiedad 2,30,4
Propiedad 3,40,5
Propiedad 4,10,6
Propiedad 5,50,7


## Manipulación de columnas
* Mencionando el nombre de la columna: ``df['nombre_columna']``
* Usando el atributo ``.columns[]``
* Usando el método ``.loc[]`` $\longrightarrow$ Hacer el llamado por nombre o etiqueta
* Usando el método ``.iloc[]`` $\longrightarrow$ Hacer el llamado por posición

In [106]:
import pandas as pd

# Read the diabetes dataset from a path relative to the current working
# directory; the file must exist at data/diabetes.csv for this cell to run.
df =  pd.read_csv('data/diabetes.csv')
# Display only the first rows instead of the whole frame.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [109]:
df[['Age']]

Unnamed: 0,Age
0,50
1,31
2,32
3,21
4,33
...,...
763,63
764,27
765,30
766,47


### Uso de columns

In [111]:
list(df.columns)

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [114]:
df[df.columns[[5]]]

Unnamed: 0,BMI
0,33.6
1,26.6
2,23.3
3,28.1
4,43.1
...,...
763,32.9
764,36.8
765,26.2
766,30.1


In [115]:
df[df.columns[[5,3,0]]]

Unnamed: 0,BMI,SkinThickness,Pregnancies
0,33.6,35,6
1,26.6,29,1
2,23.3,0,8
3,28.1,23,1
4,43.1,35,0
...,...,...,...
763,32.9,48,10
764,36.8,27,2
765,26.2,23,5
766,30.1,0,1


### Usando loc

``df.loc[rango de las filas, rango de las columnas (nombre)]``

In [117]:
df.loc[:,['Age']]

Unnamed: 0,Age
0,50
1,31
2,32
3,21
4,33
...,...
763,63
764,27
765,30
766,47


In [118]:
df.loc[:,['Age','BMI']]

Unnamed: 0,Age,BMI
0,50,33.6
1,31,26.6
2,32,23.3
3,21,28.1
4,33,43.1
...,...,...
763,63,32.9
764,27,36.8
765,30,26.2
766,47,30.1


In [119]:
df.loc[1:5,['Age','BMI']]

Unnamed: 0,Age,BMI
1,31,26.6
2,32,23.3
3,21,28.1
4,33,43.1
5,30,25.6


In [122]:
df.loc[10:20, :]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
10,4,110,92,0,0,37.6,0.191,30,0
11,10,168,74,0,0,38.0,0.537,34,1
12,10,139,80,0,0,27.1,1.441,57,0
13,1,189,60,23,846,30.1,0.398,59,1
14,5,166,72,19,175,25.8,0.587,51,1
15,7,100,0,0,0,30.0,0.484,32,1
16,0,118,84,47,230,45.8,0.551,31,1
17,7,107,74,0,0,29.6,0.254,31,1
18,1,103,30,38,83,43.3,0.183,33,0
19,1,115,70,30,96,34.6,0.529,32,1


In [124]:
df.loc[[10],:]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
10,4,110,92,0,0,37.6,0.191,30,0


In [127]:
df.loc[[10], ['BMI','Age']]

Unnamed: 0,BMI,Age
10,37.6,30


### Uso de iloc

In [129]:
df.iloc[:,[7]]

Unnamed: 0,Age
0,50
1,31
2,32
3,21
4,33
...,...
763,63
764,27
765,30
766,47


In [130]:
df.iloc[:,[7,8]]

Unnamed: 0,Age,Outcome
0,50,1
1,31,0
2,32,1
3,21,0
4,33,1
...,...,...
763,63,0
764,27,0
765,30,0
766,47,1


In [131]:
df.iloc[1:5,[7,8]]

Unnamed: 0,Age,Outcome
1,31,0
2,32,1
3,21,0
4,33,1


In [132]:
df.iloc[[10],[7,8]]

Unnamed: 0,Age,Outcome
10,30,0


In [137]:
# Pair every column name with its integer position (the number to use with
# .iloc). enumerate() does not depend on the row index, whereas zipping with
# df.index only gives positions while the index is the default 0..n-1 range
# and the frame has at least as many rows as columns.
list(enumerate(df.columns))

[(0, 'Pregnancies'),
 (1, 'Glucose'),
 (2, 'BloodPressure'),
 (3, 'SkinThickness'),
 (4, 'Insulin'),
 (5, 'BMI'),
 (6, 'DiabetesPedigreeFunction'),
 (7, 'Age'),
 (8, 'Outcome')]

## Manipulación de filas

* Usando el método ``loc[]``
* Usando el método ``iloc[]``

In [140]:
url = 'https://raw.githubusercontent.com/carlosrondanp/Python-NaylampUNI/main/data/diabetes.csv'
df = pd.read_csv(url)
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [141]:
df.index = [ 'Propiedad '+str(i) for i in range(df.shape[0])]

In [142]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Propiedad 0,6,148,72,35,0,33.6,0.627,50,1
Propiedad 1,1,85,66,29,0,26.6,0.351,31,0
Propiedad 2,8,183,64,0,0,23.3,0.672,32,1
Propiedad 3,1,89,66,23,94,28.1,0.167,21,0
Propiedad 4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
Propiedad 763,10,101,76,48,180,32.9,0.171,63,0
Propiedad 764,2,122,70,27,0,36.8,0.340,27,0
Propiedad 765,5,121,72,23,112,26.2,0.245,30,0
Propiedad 766,1,126,60,0,0,30.1,0.349,47,1


### Usando Loc

In [145]:
df.loc[['Propiedad 763'], :]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Propiedad 763,10,101,76,48,180,32.9,0.171,63,0


### Usando iloc

In [149]:
df.iloc[[763],:]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Propiedad 763,10,101,76,48,180,32.9,0.171,63,0


## Extraer elemento

In [150]:
df.loc['Propiedad 763', 'Age']

63

In [151]:
df.iloc[763,7]

63

In [153]:
df.loc[['Propiedad 763','Propiedad 1'], ['Age']]

Unnamed: 0,Age
Propiedad 763,63
Propiedad 1,31


## Métodos en dataframes

### Shape: tamaño de un dataframe

In [154]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Propiedad 0,6,148,72,35,0,33.6,0.627,50,1
Propiedad 1,1,85,66,29,0,26.6,0.351,31,0
Propiedad 2,8,183,64,0,0,23.3,0.672,32,1
Propiedad 3,1,89,66,23,94,28.1,0.167,21,0
Propiedad 4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
Propiedad 763,10,101,76,48,180,32.9,0.171,63,0
Propiedad 764,2,122,70,27,0,36.8,0.340,27,0
Propiedad 765,5,121,72,23,112,26.2,0.245,30,0
Propiedad 766,1,126,60,0,0,30.1,0.349,47,1


In [156]:
df.shape

(768, 9)

### copy: Duplicar un dataframe

In [158]:
df_copia = df.copy()

In [159]:
df_copia

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Propiedad 0,6,148,72,35,0,33.6,0.627,50,1
Propiedad 1,1,85,66,29,0,26.6,0.351,31,0
Propiedad 2,8,183,64,0,0,23.3,0.672,32,1
Propiedad 3,1,89,66,23,94,28.1,0.167,21,0
Propiedad 4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
Propiedad 763,10,101,76,48,180,32.9,0.171,63,0
Propiedad 764,2,122,70,27,0,36.8,0.340,27,0
Propiedad 765,5,121,72,23,112,26.2,0.245,30,0
Propiedad 766,1,126,60,0,0,30.1,0.349,47,1


### rename (columns = ): renombrar columnas

In [168]:
df_copia = df_copia.rename(columns = {'Glucose': 'Glucosa',
                           'BMI': 'IMC',
                           'Age': 'Edad',
                           'Outcome': 'Target',
                           'DiabetesPedigreeFunction':'Beta'})

### rename(index = ): renombrar indexación

In [169]:
df_copia.rename(index = {'Propiedad 0': 'Laura Rojas'}, inplace = True)

In [170]:
df_copia

Unnamed: 0,Pregnancies,Glucosa,BloodPressure,SkinThickness,Insulin,IMC,Beta,Edad,Target
Laura Rojas,6,148,72,35,0,33.6,0.627,50,1
Propiedad 1,1,85,66,29,0,26.6,0.351,31,0
Propiedad 2,8,183,64,0,0,23.3,0.672,32,1
Propiedad 3,1,89,66,23,94,28.1,0.167,21,0
Propiedad 4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
Propiedad 763,10,101,76,48,180,32.9,0.171,63,0
Propiedad 764,2,122,70,27,0,36.8,0.340,27,0
Propiedad 765,5,121,72,23,112,26.2,0.245,30,0
Propiedad 766,1,126,60,0,0,30.1,0.349,47,1


## drop(labels = [ ] , axis = 0): Eliminar filas

In [171]:
df_copia.drop(labels = ['Laura Rojas','Propiedad 764'], axis = 0, inplace = True)

In [172]:
df_copia

Unnamed: 0,Pregnancies,Glucosa,BloodPressure,SkinThickness,Insulin,IMC,Beta,Edad,Target
Propiedad 1,1,85,66,29,0,26.6,0.351,31,0
Propiedad 2,8,183,64,0,0,23.3,0.672,32,1
Propiedad 3,1,89,66,23,94,28.1,0.167,21,0
Propiedad 4,0,137,40,35,168,43.1,2.288,33,1
Propiedad 5,5,116,74,0,0,25.6,0.201,30,0
...,...,...,...,...,...,...,...,...,...
Propiedad 762,9,89,62,0,0,22.5,0.142,33,0
Propiedad 763,10,101,76,48,180,32.9,0.171,63,0
Propiedad 765,5,121,72,23,112,26.2,0.245,30,0
Propiedad 766,1,126,60,0,0,30.1,0.349,47,1


## drop(labels = [ ], axis = 1): eliminar columnas

In [174]:
df_copia.drop(labels = ['Glucosa', 'Beta'], axis = 1, inplace = True)

In [179]:
df_copia

Unnamed: 0,Pregnancies,BloodPressure,SkinThickness,Insulin,IMC,Edad,Target
Propiedad 1,1,66,29,0,26.6,31,0
Propiedad 2,8,64,0,0,23.3,32,1
Propiedad 3,1,66,23,94,28.1,21,0
Propiedad 4,0,40,35,168,43.1,33,1
Propiedad 5,5,74,0,0,25.6,30,0
...,...,...,...,...,...,...,...
Propiedad 762,9,62,0,0,22.5,33,0
Propiedad 763,10,76,48,180,32.9,63,0
Propiedad 765,5,72,23,112,26.2,30,0
Propiedad 766,1,60,0,0,30.1,47,1


## unique(), value_counts()

In [181]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Propiedad 0,6,148,72,35,0,33.6,0.627,50,1
Propiedad 1,1,85,66,29,0,26.6,0.351,31,0
Propiedad 2,8,183,64,0,0,23.3,0.672,32,1
Propiedad 3,1,89,66,23,94,28.1,0.167,21,0
Propiedad 4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
Propiedad 763,10,101,76,48,180,32.9,0.171,63,0
Propiedad 764,2,122,70,27,0,36.8,0.340,27,0
Propiedad 765,5,121,72,23,112,26.2,0.245,30,0
Propiedad 766,1,126,60,0,0,30.1,0.349,47,1


In [182]:
print(df['Pregnancies'].unique())

[ 6  1  8  0  5  3 10  2  4  7  9 11 13 15 17 12 14]


In [185]:
df[['Pregnancies']].value_counts()

Pregnancies
1              135
0              111
2              103
3               75
4               68
5               57
6               50
7               45
8               38
9               28
10              24
11              11
13              10
12               9
14               2
15               1
17               1
dtype: int64

### apply

In [186]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Propiedad 0,6,148,72,35,0,33.6,0.627,50,1
Propiedad 1,1,85,66,29,0,26.6,0.351,31,0
Propiedad 2,8,183,64,0,0,23.3,0.672,32,1
Propiedad 3,1,89,66,23,94,28.1,0.167,21,0
Propiedad 4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
Propiedad 763,10,101,76,48,180,32.9,0.171,63,0
Propiedad 764,2,122,70,27,0,36.8,0.340,27,0
Propiedad 765,5,121,72,23,112,26.2,0.245,30,0
Propiedad 766,1,126,60,0,0,30.1,0.349,47,1


In [188]:
# Replace Glucose with its square root using a vectorized NumPy call instead of
# a per-element Python lambda. NOTE: this overwrites the column in df, so
# re-running the cell applies the square root again.
df['Glucose'] = np.sqrt(df['Glucose'])

In [189]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Propiedad 0,6,12.165525,72,35,0,33.6,0.627,50,1
Propiedad 1,1,9.219544,66,29,0,26.6,0.351,31,0
Propiedad 2,8,13.527749,64,0,0,23.3,0.672,32,1
Propiedad 3,1,9.433981,66,23,94,28.1,0.167,21,0
Propiedad 4,0,11.704700,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
Propiedad 763,10,10.049876,76,48,180,32.9,0.171,63,0
Propiedad 764,2,11.045361,70,27,0,36.8,0.340,27,0
Propiedad 765,5,11.000000,72,23,112,26.2,0.245,30,0
Propiedad 766,1,11.224972,60,0,0,30.1,0.349,47,1


### Groupby

In [195]:
df.groupby(['Outcome']).mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,10.391761,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,11.775727,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [196]:
df.groupby(['Outcome','Pregnancies']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Pregnancies,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,10.523854,69.205479,21.054795,77.561644,31.727397,0.457055,27.09589
0,1,10.00997,66.830189,23.04717,84.320755,29.616038,0.451679,25.254717
0,2,10.207918,61.940476,20.107143,72.619048,29.679762,0.479881,25.892857
0,3,10.400846,65.708333,17.520833,62.020833,29.23125,0.358354,28.770833
0,4,10.777873,71.577778,18.422222,78.466667,31.255556,0.410511,30.066667
0,5,10.498124,74.666667,17.166667,46.861111,31.1,0.359278,39.416667
0,6,10.674163,66.382353,18.705882,69.029412,29.591176,0.433294,37.147059
0,7,10.923481,70.35,19.35,72.5,29.975,0.4055,42.5
0,8,10.243505,75.3125,12.9375,14.5,30.69375,0.52675,49.625
0,9,10.247015,70.4,22.4,71.2,28.84,0.3111,46.0


Podemos usar métodos como: ``mean()``, ``median()``, ``sum()``, ``min()``, ``max()``, ``count()`` 

Usaremos el método ``.agg()`` 

In [197]:
# Per-Outcome mean of every column. The string alias selects pandas' own
# groupby mean; passing the NumPy callable (np.mean) to agg is deprecated in
# recent pandas releases.
df.groupby(['Outcome']).agg('mean')

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,10.391761,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,11.775727,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [205]:
# Min, max, mean and standard deviation of Glucose and Age for every
# (Outcome, Pregnancies) group. The two columns are selected before
# aggregating so the other columns are not aggregated and then discarded.
# String aliases keep pandas' own implementations (sample std, ddof=1) and
# label the result columns 'min'/'max' (older pandas labelled the NumPy
# callables 'amin'/'amax').
df.groupby(['Outcome','Pregnancies'])[['Glucose','Age']].agg(['min', 'max', 'mean', 'std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Glucose,Glucose,Glucose,Glucose,Age,Age,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,amin,amax,mean,std,amin,amax,mean,std
Outcome,Pregnancies,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
0,0,7.549834,13.152946,10.523854,1.100127,21,67,27.09589,10.276949
0,1,0.0,13.892444,10.00997,2.023324,21,62,25.254717,6.253473
0,2,7.483315,13.228757,10.207918,1.012367,21,72,25.892857,7.783235
0,3,7.81025,13.820275,10.400846,1.207028,21,63,28.770833,8.817873
0,4,8.717798,14.035669,10.777873,1.193596,21,63,30.066667,7.16494
0,5,6.63325,12.569805,10.498124,1.223792,24,69,39.416667,13.795185
0,6,8.944272,13.527749,10.674163,1.207507,23,66,37.147059,11.962397
0,7,7.874008,13.379088,10.923481,1.328859,24,61,42.5,9.023361
0,8,8.062258,13.928388,10.243505,1.34486,34,68,49.625,10.990147
0,9,7.549834,12.409674,10.247015,1.490224,33,81,46.0,14.251706


In [206]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Propiedad 0,6,12.165525,72,35,0,33.6,0.627,50,1
Propiedad 1,1,9.219544,66,29,0,26.6,0.351,31,0
Propiedad 2,8,13.527749,64,0,0,23.3,0.672,32,1
Propiedad 3,1,9.433981,66,23,94,28.1,0.167,21,0
Propiedad 4,0,11.704700,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
Propiedad 763,10,10.049876,76,48,180,32.9,0.171,63,0
Propiedad 764,2,11.045361,70,27,0,36.8,0.340,27,0
Propiedad 765,5,11.000000,72,23,112,26.2,0.245,30,0
Propiedad 766,1,11.224972,60,0,0,30.1,0.349,47,1


In [204]:
# Standardize each column within its Outcome group: subtract the group mean and
# divide by the group standard deviation (a per-group z-score). transform()
# returns a frame with the same row index as df, without the grouping column.
df.groupby(['Outcome']).transform(lambda x: (x-x.mean())/x.std())

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Propiedad 0,0.303196,0.241770,0.054689,0.726020,-0.723458,-0.212384,0.205449,1.179115
Propiedad 1,-0.761637,-0.829862,-0.120910,0.627000,-0.695815,-0.481700,-0.263249,-0.016284
Propiedad 2,0.837778,1.086681,-0.317545,-1.253651,-0.723458,-1.630537,0.326302,-0.461985
Propiedad 3,-0.761637,-0.678053,-0.120910,0.224044,0.254973,-0.286637,-0.878458,-0.873355
Propiedad 4,-1.300551,-0.044054,-1.434250,0.726020,0.487884,1.095621,4.666252,-0.370812
...,...,...,...,...,...,...,...,...
Propiedad 763,2.221276,-0.242035,0.432706,1.903029,1.124844,0.337562,-0.865084,2.726341
Propiedad 764,-0.430202,0.462711,0.100537,0.492681,-0.695815,0.844723,-0.300028,-0.359112
Propiedad 765,0.564102,0.430598,0.211260,0.224044,0.437039,-0.533716,-0.617663,-0.101991
Propiedad 766,-1.033260,-0.341603,-0.503663,-1.253651,-0.723458,-0.694281,-0.541151,0.905599
