# Análisis Exploratorio con Python Pandas

### Importar librerías

In [63]:
import pandas as pd
import numpy as np

### Cargamos el dataset a explorar como DataFrame de Pandas. Si el dataset ya tiene una columna que pueda servir como indice ya la podemos asignar en lugar de la columna id que Pandas agrega automaticamente

In [64]:
df = pd.read_csv("shoes_dataset.csv", index_col="InvoiceNo")

### Para ir conociendo el dataset podemos utilizar las funciones de Pandas que nos permiten ver su tamaño, forma, columnas, y el tipo de datos de cada columna

In [65]:
print("Tamaño= ", df.size)
print("Forma= ", df.shape)
print("columnas= ", df.columns)
print("Tipos de dato=\n",df.dtypes)

Tamaño=  164637
Forma=  (14967, 11)
columnas=  Index(['Date', 'Country', 'ProductID', 'Shop', 'Gender', 'Size (US)',
       'Size (Europe)', 'Size (UK)', 'UnitPrice', 'Discount', 'SalePrice'],
      dtype='object')
Tipos de dato=
 Date              object
Country           object
ProductID          int64
Shop              object
Gender            object
Size (US)        float64
Size (Europe)     object
Size (UK)        float64
UnitPrice         object
Discount          object
SalePrice         object
dtype: object


### Otra forma de ver algo similar a lo que vimos arriba pero que tambien nos muetsra el uso de memoria y la cantidad de campos nulos por columna es la función .info()

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14967 entries, 52389 to 65777
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           14967 non-null  object 
 1   Country        14967 non-null  object 
 2   ProductID      14967 non-null  int64  
 3   Shop           14967 non-null  object 
 4   Gender         14967 non-null  object 
 5   Size (US)      14967 non-null  float64
 6   Size (Europe)  14967 non-null  object 
 7   Size (UK)      14967 non-null  float64
 8   UnitPrice      14967 non-null  object 
 9   Discount       14967 non-null  object 
 10  SalePrice      14967 non-null  object 
dtypes: float64(2), int64(1), object(8)
memory usage: 1.4+ MB


### Con la función .head() podemos ver las primeras 5 filas del dataset (o mas si pasamos el número de filas que queremos ver como parámetro)

In [67]:
df.head()

Unnamed: 0_level_0,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,SalePrice
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
52389,1/1/2014,United Kingdom,2152,UK2,Male,11,44,10,$159.00,0%,$159.00
52390,1/1/2014,United States,2230,US15,Male,12,44-45,11,$199.00,20%,$159.20
52391,1/1/2014,Canada,2160,CAN7,Male,10,42-43,9,$149.00,20%,$119.20
52392,1/1/2014,United States,2234,US6,Female,10,40,8,$159.00,0%,$159.00
52393,1/1/2014,United Kingdom,2222,UK4,Female,9,39-40,7,$159.00,0%,$159.00


### Podemos tener una descripción estadística rápida utilizando la función .descrive(). Está función solo servirá para las columnas que tengan datos numéricos

In [68]:
df.describe()


Unnamed: 0,ProductID,Size (US),Size (UK)
count,14967,14967,14967
mean,2195,9,8
std,28,2,2
min,2147,4,2
25%,2172,8,6
50%,2195,9,8
75%,2219,10,10
max,2242,15,14


## Filtrado de filas y columnas. Funciones .loc e .iloc

## Función .loc

### Con la función .loc podemos filtrar utilizando el número de id y/o el nombre de la columna

In [69]:
# Por un número de id específico
df.loc[[52392]]

Unnamed: 0_level_0,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,SalePrice
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
52392,1/1/2014,United States,2234,US6,Female,10,40,8,$159.00,0%,$159.00


In [70]:
# Por un rángo de números de id
df.loc[52389:52393]

Unnamed: 0_level_0,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,SalePrice
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
52389,1/1/2014,United Kingdom,2152,UK2,Male,11,44,10,$159.00,0%,$159.00
52390,1/1/2014,United States,2230,US15,Male,12,44-45,11,$199.00,20%,$159.20
52391,1/1/2014,Canada,2160,CAN7,Male,10,42-43,9,$149.00,20%,$119.20
52392,1/1/2014,United States,2234,US6,Female,10,40,8,$159.00,0%,$159.00
52393,1/1/2014,United Kingdom,2222,UK4,Female,9,39-40,7,$159.00,0%,$159.00


In [71]:
# Por un nombre de columna especifico (observar que como primer parámetro pasamos el rango completo de filas)
df.loc[:, ["UnitPrice"]]

Unnamed: 0_level_0,UnitPrice
InvoiceNo,Unnamed: 1_level_1
52389,$159.00
52390,$199.00
52391,$149.00
52392,$159.00
52393,$159.00
...,...
65773,$139.00
65774,$149.00
65775,$179.00
65776,$199.00


In [72]:
# Especificando los id mas las columna que nos interesan
# Ejemplo 1
df.loc[52389:52392, ["Gender","Size (Europe)", "UnitPrice"]]

Unnamed: 0_level_0,Gender,Size (Europe),UnitPrice
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
52389,Male,44,$159.00
52390,Male,44-45,$199.00
52391,Male,42-43,$149.00
52392,Female,40,$159.00


In [73]:
# Ejemplo 2
df.loc[[52389, 52391] , ["Size (Europe)", "UnitPrice"]]

Unnamed: 0_level_0,Size (Europe),UnitPrice
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1
52389,44,$159.00
52391,42-43,$149.00


## Función .iloc

### Con la función .iloc pódemos hacer algo similar pero moviendonos por los números de indice que asigna pandas (no confundir con id)

In [74]:
# Elegimos un número concreto de fila
df.iloc[[5]]

Unnamed: 0_level_0,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,SalePrice
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
52394,1/1/2014,United States,2173,US15,Male,10,43-44,10,$159.00,0%,$159.00


In [75]:
# Elegimos un rango de filas
df.iloc[2:5]

Unnamed: 0_level_0,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,SalePrice
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
52391,1/1/2014,Canada,2160,CAN7,Male,10,42-43,9,$149.00,20%,$119.20
52392,1/1/2014,United States,2234,US6,Female,10,40,8,$159.00,0%,$159.00
52393,1/1/2014,United Kingdom,2222,UK4,Female,9,39-40,7,$159.00,0%,$159.00


In [76]:
# Una lista de filas
df.iloc[[1,5,8,25]]

Unnamed: 0_level_0,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,SalePrice
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
52390,1/1/2014,United States,2230,US15,Male,12,44-45,11,$199.00,20%,$159.20
52394,1/1/2014,United States,2173,US15,Male,10,43-44,10,$159.00,0%,$159.00
52397,1/2/2014,United States,2191,US13,Male,10,43-44,10,$139.00,0%,$139.00
52409,1/4/2014,Germany,2157,GER2,Male,12,45,12,$149.00,20%,$119.20


In [77]:
# Columnas por indice (observar que como primer parámetro pasamos el rango completo de filas)
df.iloc[: , 0:3]

Unnamed: 0_level_0,Date,Country,ProductID
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
52389,1/1/2014,United Kingdom,2152
52390,1/1/2014,United States,2230
52391,1/1/2014,Canada,2160
52392,1/1/2014,United States,2234
52393,1/1/2014,United Kingdom,2222
...,...,...,...
65773,12/31/2016,United Kingdom,2154
65774,12/31/2016,United States,2181
65775,12/31/2016,Canada,2203
65776,12/31/2016,Germany,2231


In [78]:
# Una lista de columnas (observar que como primer parámetro pasamos el rango completo de filas)
df.iloc[: , [1,3,5]]

Unnamed: 0_level_0,Country,Shop,Size (US)
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
52389,United Kingdom,UK2,11
52390,United States,US15,12
52391,Canada,CAN7,10
52392,United States,US6,10
52393,United Kingdom,UK4,9
...,...,...,...
65773,United Kingdom,UK2,10
65774,United States,US12,12
65775,Canada,CAN6,10
65776,Germany,GER1,10


In [79]:
# Filas y columnas por rango
df.iloc[2:6 , 1:4]

Unnamed: 0_level_0,Country,ProductID,Shop
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
52391,Canada,2160,CAN7
52392,United States,2234,US6
52393,United Kingdom,2222,UK4
52394,United States,2173,US15


In [80]:
# Listas de indiceas de filas y columnas 
df.iloc[[2,5,35] , [2,5,8]]

Unnamed: 0_level_0,ProductID,Size (US),UnitPrice
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
52391,2160,10,$149.00
52394,2173,10,$159.00
52416,2235,10,$169.00


## Filtrado condicional

### Filtrando por condicones númericas

In [81]:
# FIltrando por una sola columna numerica
df[df["Size (US)"] > 9.0]

Unnamed: 0_level_0,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,SalePrice
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
52389,1/1/2014,United Kingdom,2152,UK2,Male,11,44,10,$159.00,0%,$159.00
52390,1/1/2014,United States,2230,US15,Male,12,44-45,11,$199.00,20%,$159.20
52391,1/1/2014,Canada,2160,CAN7,Male,10,42-43,9,$149.00,20%,$119.20
52392,1/1/2014,United States,2234,US6,Female,10,40,8,$159.00,0%,$159.00
52394,1/1/2014,United States,2173,US15,Male,10,43-44,10,$159.00,0%,$159.00
...,...,...,...,...,...,...,...,...,...,...,...
65769,12/31/2016,Germany,2210,GER2,Male,10,43-44,10,$179.00,50%,$89.50
65773,12/31/2016,United Kingdom,2154,UK2,Male,10,42-43,9,$139.00,0%,$139.00
65774,12/31/2016,United States,2181,US12,Female,12,42-43,10,$149.00,0%,$149.00
65775,12/31/2016,Canada,2203,CAN6,Male,10,43-44,10,$179.00,30%,$125.30


In [82]:
# Poniendo mas de una condición numérica
df[(df["Size (US)"] > 9.0) & (df["Size (UK)"] < 10.5)]

Unnamed: 0_level_0,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,SalePrice
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
52391,1/1/2014,Canada,2160,CAN7,Male,10,42-43,9,$149.00,20%,$119.20
52392,1/1/2014,United States,2234,US6,Female,10,40,8,$159.00,0%,$159.00
52394,1/1/2014,United States,2173,US15,Male,10,43-44,10,$159.00,0%,$159.00
52396,1/2/2014,Canada,2238,CAN5,Male,10,43,10,$169.00,0%,$169.00
52397,1/2/2014,United States,2191,US13,Male,10,43-44,10,$139.00,0%,$139.00
...,...,...,...,...,...,...,...,...,...,...,...
65769,12/31/2016,Germany,2210,GER2,Male,10,43-44,10,$179.00,50%,$89.50
65773,12/31/2016,United Kingdom,2154,UK2,Male,10,42-43,9,$139.00,0%,$139.00
65774,12/31/2016,United States,2181,US12,Female,12,42-43,10,$149.00,0%,$149.00
65775,12/31/2016,Canada,2203,CAN6,Male,10,43-44,10,$179.00,30%,$125.30


### Filtrado por cadena de texto

In [83]:
# Con una sola condición
df[df["Country"].str.contains("Canada")]

Unnamed: 0_level_0,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,SalePrice
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
52391,1/1/2014,Canada,2160,CAN7,Male,10,42-43,9,$149.00,20%,$119.20
52396,1/2/2014,Canada,2238,CAN5,Male,10,43,10,$169.00,0%,$169.00
52402,1/3/2014,Canada,2240,CAN6,Male,10,42-43,9,$199.00,30%,$139.30
52410,1/4/2014,Canada,2169,CAN3,Female,8,38-39,6,$129.00,0%,$129.00
52415,1/5/2014,Canada,2169,CAN3,Female,8,38-39,6,$129.00,0%,$129.00
...,...,...,...,...,...,...,...,...,...,...,...
65747,12/29/2016,Canada,2217,CAN7,Male,9,42,8,$199.00,0%,$199.00
65749,12/30/2016,Canada,2206,CAN6,Male,9,42,8,$169.00,20%,$135.20
65763,12/30/2016,Canada,2161,CAN5,Male,8,41,8,$129.00,0%,$129.00
65765,12/31/2016,Canada,2199,CAN5,Male,11,44,10,$139.00,0%,$139.00


In [84]:
# Con mas de una condición
# Ejemplo 1
df[(df["Country"].str.contains("Canada")) & (df["Size (Europe)"].str.contains("42-43"))]

Unnamed: 0_level_0,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,SalePrice
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
52391,1/1/2014,Canada,2160,CAN7,Male,10,42-43,9,$149.00,20%,$119.20
52402,1/3/2014,Canada,2240,CAN6,Male,10,42-43,9,$199.00,30%,$139.30
52425,1/7/2014,Canada,2160,CAN7,Male,10,42-43,9,$149.00,20%,$119.20
52439,1/9/2014,Canada,2160,CAN7,Male,10,42-43,9,$149.00,20%,$119.20
52470,1/15/2014,Canada,2178,CAN5,Male,10,42-43,9,$139.00,10%,$125.10
...,...,...,...,...,...,...,...,...,...,...,...
65709,12/26/2016,Canada,2213,CAN5,Male,10,42-43,9,$129.00,50%,$64.50
65721,12/27/2016,Canada,2240,CAN6,Male,10,42-43,9,$199.00,30%,$139.30
65731,12/28/2016,Canada,2198,CAN6,Male,10,42-43,9,$179.00,10%,$161.10
65731,12/28/2016,Canada,2171,CAN3,Male,10,42-43,9,$139.00,0%,$139.00


In [85]:
# Ejemplo 2
df[(df["Country"].str.contains("Canada")) | (df["Size (Europe)"].str.contains("42-43"))]

Unnamed: 0_level_0,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,SalePrice
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
52391,1/1/2014,Canada,2160,CAN7,Male,10,42-43,9,$149.00,20%,$119.20
52396,1/2/2014,Canada,2238,CAN5,Male,10,43,10,$169.00,0%,$169.00
52402,1/3/2014,Canada,2240,CAN6,Male,10,42-43,9,$199.00,30%,$139.30
52406,1/3/2014,United States,2147,US15,Male,10,42-43,9,$139.00,0%,$139.00
52410,1/4/2014,Canada,2169,CAN3,Female,8,38-39,6,$129.00,0%,$129.00
...,...,...,...,...,...,...,...,...,...,...,...
65766,12/31/2016,United Kingdom,2202,UK1,Male,10,42-43,9,$159.00,30%,$111.30
65767,12/31/2016,United States,2147,US15,Male,10,42-43,9,$139.00,0%,$139.00
65773,12/31/2016,United Kingdom,2154,UK2,Male,10,42-43,9,$139.00,0%,$139.00
65774,12/31/2016,United States,2181,US12,Female,12,42-43,10,$149.00,0%,$149.00


In [86]:
# Otro ejemplo utilizando mas de una condición por columna
df[(df["Country"].str.contains("Canada", "United Kingdom")) & (df["Size (Europe)"].str.contains("42-43", "43-44"))]

Unnamed: 0_level_0,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,SalePrice
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
52391,1/1/2014,Canada,2160,CAN7,Male,10,42-43,9,$149.00,20%,$119.20
52402,1/3/2014,Canada,2240,CAN6,Male,10,42-43,9,$199.00,30%,$139.30
52425,1/7/2014,Canada,2160,CAN7,Male,10,42-43,9,$149.00,20%,$119.20
52439,1/9/2014,Canada,2160,CAN7,Male,10,42-43,9,$149.00,20%,$119.20
52470,1/15/2014,Canada,2178,CAN5,Male,10,42-43,9,$139.00,10%,$125.10
...,...,...,...,...,...,...,...,...,...,...,...
65709,12/26/2016,Canada,2213,CAN5,Male,10,42-43,9,$129.00,50%,$64.50
65721,12/27/2016,Canada,2240,CAN6,Male,10,42-43,9,$199.00,30%,$139.30
65731,12/28/2016,Canada,2198,CAN6,Male,10,42-43,9,$179.00,10%,$161.10
65731,12/28/2016,Canada,2171,CAN3,Male,10,42-43,9,$139.00,0%,$139.00


## Agragar columnas al datagrame

### Algunas veces necesitamos agregar datos a partir de los que ya tenemos. Como ejemplo en nuestro caso voy a agregar una columna donde nos muestre el continente al que pertenece la tienda utilizandola columna "Country"

In [87]:
# Primero identifico los valores únicos de la columna Countrt
df["Country"].unique()

array(['United Kingdom', 'United States', 'Canada', 'Germany'],
      dtype=object)

In [88]:
# Segundo creo una función que identifique el continente al que pertenece cada país
def continents(Country):
    for continent in Country:
        if Country == "United Kingdom" or Country == "Germany":
            continent = "Europe"
        else:
            continent = "America"
        return continent

In [89]:
# Creamos la nueva columna aplicando la función y vemos como queda el dataframe
df["Continent"] = df["Country"].apply(continents)
df.head()

Unnamed: 0_level_0,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,SalePrice,Continent
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
52389,1/1/2014,United Kingdom,2152,UK2,Male,11,44,10,$159.00,0%,$159.00,Europe
52390,1/1/2014,United States,2230,US15,Male,12,44-45,11,$199.00,20%,$159.20,America
52391,1/1/2014,Canada,2160,CAN7,Male,10,42-43,9,$149.00,20%,$119.20,America
52392,1/1/2014,United States,2234,US6,Female,10,40,8,$159.00,0%,$159.00,America
52393,1/1/2014,United Kingdom,2222,UK4,Female,9,39-40,7,$159.00,0%,$159.00,Europe


## Cambiar el tipo de dato de una columna 

### En este caso queremos cambiar el tipo de dato de las columnas "UnitPrice" y "SalePrice" de object a float64 para poder realizar operaciones

In [92]:
df["UnitPrice"] = df["UnitPrice"].str.replace("[\$]", "", regex=True).astype(float)
df["SalePrice"] = df["SalePrice"].str.replace("[\$]", "", regex=True).astype(float)

In [93]:
# Con la función .dtypes vemos que cambiaron de object a float64
df.dtypes

Date              object
Country           object
ProductID          int64
Shop              object
Gender            object
Size (US)        float64
Size (Europe)     object
Size (UK)        float64
UnitPrice        float64
Discount          object
SalePrice        float64
Continent         object
dtype: object

In [94]:
# Vemos como queda el dataframe
df.head()

Unnamed: 0_level_0,Date,Country,ProductID,Shop,Gender,Size (US),Size (Europe),Size (UK),UnitPrice,Discount,SalePrice,Continent
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
52389,1/1/2014,United Kingdom,2152,UK2,Male,11,44,10,159,0%,159,Europe
52390,1/1/2014,United States,2230,US15,Male,12,44-45,11,199,20%,159,America
52391,1/1/2014,Canada,2160,CAN7,Male,10,42-43,9,149,20%,119,America
52392,1/1/2014,United States,2234,US6,Female,10,40,8,159,0%,159,America
52393,1/1/2014,United Kingdom,2222,UK4,Female,9,39-40,7,159,0%,159,Europe


In [96]:
# Provmos la funcion .describe para ver si ahora nos toma el precio unitario y de venta
df.describe()

Unnamed: 0,ProductID,Size (US),Size (UK),UnitPrice,SalePrice
count,14967,14967,14967,14967,14967
mean,2195,9,8,164,144
std,28,2,2,23,35
min,2147,4,2,129,64
25%,2172,8,6,149,125
50%,2195,9,8,159,149
75%,2219,10,10,179,169
max,2242,15,14,199,199
