# Cleaning item-specific data

In [136]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine


# Database connection settings.
driver = 'mysql+pymysql:'
user = 'adria'
# The password is not stored in the notebook: it is read from the
# VIMET_DB_PASSWORD environment variable, and asked for interactively
# only when that variable is not set.
password = os.environ.get('VIMET_DB_PASSWORD') or getpass('Database password: ')
ip = '35.187.114.125'
database = 'vimet'

# Percent-encode the credentials so that characters such as '@' or '/'
# cannot break the structure of the connection URL.
connection_string = f'{driver}//{quote_plus(user)}:{quote_plus(password)}@{ip}/{database}'
engine = create_engine(connection_string)

In [137]:
# Read the whole 'items' table through the SQLAlchemy engine.
items = pd.read_sql(sql='items', con=engine)

In [138]:
# Preview the first five rows of the table as loaded.
items.head(n=5)

Unnamed: 0,index,Name,Lineitem quantity,Lineitem name,Lineitem price,Lineitem fulfillment status,Cancelled at,Refunded Amount,Vendor,Tags,Lineitem discount
0,0,#1248,4,Plátano Canarias - 3/4 unidades,1.95,fulfilled,,0.0,Fruites i Verdures Rovira,,0
1,1,#1248,2,Fresón Maresme - 1 caja (500 grs.),3.5,fulfilled,,,Fruites i Verdures Rovira,,0
2,2,#1248,1,Tomate Cherry - 200 grs,1.3,fulfilled,,,Fruites i Verdures Rovira,,0
3,3,#1248,1,Nueces California sin cáscara - 200 grs,3.44,fulfilled,,,Llegums Porta Novau,,0
4,4,#1248,3,Mandarina Orri - 500 grs.,2.0,fulfilled,,,Fruites i Verdures Rovira,,0


We'll drop the 'index' column, as it is redundant. (An 'id' column, which identifies the order, would also be redundant with 'Name', but it is not present in this table.)

In [139]:
# Remove the redundant 'index' column. errors='ignore' keeps the cell
# re-runnable: running it a second time is a no-op instead of a KeyError.
items = items.drop(columns='index', errors='ignore')

In [140]:
# Preview the first five rows once the 'index' column has been removed.
items.head(n=5)

Unnamed: 0,Name,Lineitem quantity,Lineitem name,Lineitem price,Lineitem fulfillment status,Cancelled at,Refunded Amount,Vendor,Tags,Lineitem discount
0,#1248,4,Plátano Canarias - 3/4 unidades,1.95,fulfilled,,0.0,Fruites i Verdures Rovira,,0
1,#1248,2,Fresón Maresme - 1 caja (500 grs.),3.5,fulfilled,,,Fruites i Verdures Rovira,,0
2,#1248,1,Tomate Cherry - 200 grs,1.3,fulfilled,,,Fruites i Verdures Rovira,,0
3,#1248,1,Nueces California sin cáscara - 200 grs,3.44,fulfilled,,,Llegums Porta Novau,,0
4,#1248,3,Mandarina Orri - 500 grs.,2.0,fulfilled,,,Fruites i Verdures Rovira,,0


## NaN values

In [141]:
# Number of missing values in each column.
items.isnull().sum()

Name                              0
Lineitem quantity                 0
Lineitem name                     0
Lineitem price                    0
Lineitem fulfillment status       0
Cancelled at                   3052
Refunded Amount                2874
Vendor                            1
Tags                           3094
Lineitem discount                 0
dtype: int64

In [142]:
# (rows, columns) of the table, for comparison with the NaN counts above.
items.shape

(3120, 10)

We will drop 'Tags' as it does not provide any information

In [143]:
# Remove the 'Tags' column. errors='ignore' keeps the cell re-runnable,
# including when the loaded table no longer has a 'Tags' column.
items = items.drop(columns='Tags', errors='ignore')

We will fill the missing 'Refunded Amount' values with 0 and the missing 'Cancelled at' values with 'not cancelled'.

In [144]:
# Rows with no cancellation value are labelled explicitly as 'not cancelled'.
items = items.fillna({'Cancelled at': 'not cancelled'})

In [145]:
# A missing refunded amount is replaced with 0.
items = items.fillna(value={'Refunded Amount': 0})

There is only one NaN value in 'Vendor'. Let's check what is going on.

In [146]:
# Show the rows whose 'Vendor' is missing.
vendor_missing = items['Vendor'].isnull()
items.loc[vendor_missing]

Unnamed: 0,Name,Lineitem quantity,Lineitem name,Lineitem price,Lineitem fulfillment status,Cancelled at,Refunded Amount,Vendor,Lineitem discount
976,#1201,1,Pechuga de pollo fileteada,3.15,fulfilled,not cancelled,0.0,,0


In [147]:
# Show every row whose description contains the literal text 'pollo'.
mentions_pollo = items['Lineitem name'].str.contains('pollo', regex=False)
items[mentions_pollo]

Unnamed: 0,Name,Lineitem quantity,Lineitem name,Lineitem price,Lineitem fulfillment status,Cancelled at,Refunded Amount,Vendor,Lineitem discount
24,#1247,1,Pechuga de pollo fileteada (350 grs.),3.15,fulfilled,not cancelled,0.0,El Pagés,0
48,#1246,1,Hamburguesas Gourmet de pollo lisa - 3 unidades,4.80,fulfilled,not cancelled,0.0,El Pagés,0
69,#1245,1,Brochetas de pechuga de pollo a las finas hier...,4.40,fulfilled,not cancelled,0.0,El Pagés,0
95,#1244,1,Pechuga de pollo fileteada (350 grs.),3.15,fulfilled,not cancelled,0.0,El Pagés,0
141,#1240,1,Pechuga de pollo fileteada (350 grs.),3.15,fulfilled,not cancelled,0.0,El Pagés,0
182,#1239,1,Pechuga de pollo fileteada (350 grs.),3.15,fulfilled,not cancelled,0.0,El Pagés,0
197,#1238,1,Pechuga de pollo fileteada (350 grs.),3.15,fulfilled,not cancelled,0.0,El Pagés,0
220,#1237,1,Pechuga de pollo fileteada (350 grs.),3.15,fulfilled,not cancelled,0.0,El Pagés,0
238,#1236,1,Pechuga de pollo fileteada (350 grs.),3.15,fulfilled,not cancelled,0.0,El Pagés,0
255,#1235,1,Pechuga de pollo fileteada (350 grs.),3.15,fulfilled,not cancelled,0.0,El Pagés,0


We can see that the usual vendor for this type of item is 'El Pagés'; therefore, we will fill the NaN with this value.

In [148]:
# Keep existing vendors and use 'El Pagés' wherever the vendor is missing.
vendor = items['Vendor']
items['Vendor'] = vendor.where(vendor.notna(), 'El Pagés')

## Variable types

In [149]:
# Print the dtype and non-null count of every column.
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3120 entries, 0 to 3119
Data columns (total 9 columns):
Name                           3120 non-null object
Lineitem quantity              3120 non-null int64
Lineitem name                  3120 non-null object
Lineitem price                 3120 non-null float64
Lineitem fulfillment status    3120 non-null object
Cancelled at                   3120 non-null object
Refunded Amount                3120 non-null float64
Vendor                         3120 non-null object
Lineitem discount              3120 non-null int64
dtypes: float64(2), int64(2), object(5)
memory usage: 219.5+ KB


Types seem ok

## Feature values

In [150]:
# List the remaining columns; each one is reviewed in the subsections below.
items.columns

Index(['Name', 'Lineitem quantity', 'Lineitem name', 'Lineitem price',
       'Lineitem fulfillment status', 'Cancelled at', 'Refunded Amount',
       'Vendor', 'Lineitem discount'],
      dtype='object')

#### Quantity -- OK

In [151]:
# Frequency of each ordered quantity.
items.loc[:, 'Lineitem quantity'].value_counts()

1     2793
2      250
3       40
4       22
6        4
10       3
16       2
12       2
8        2
5        1
30       1
Name: Lineitem quantity, dtype: int64

Looks good

#### 'Lineitem name'

In [152]:
# Frequency of each distinct item description.
items.loc[:, 'Lineitem name'].value_counts()

Huevos - 1/2 docena                                     41
Brócoli - 1 unidad                                      40
Pechuga pollo fileteada - 1 pechuga (300 gr)            39
Pera Conference - 4 unidades                            39
Plátano Canarias Verde - 3/4 unidades                   38
Lote de productos básicos - Donativo Banc d'Aliments    38
Zanahorias - medio kilo                                 36
Plátano canario (verde) - 0.5 kg. (3-4 unidades)        35
Almendra Cruda - 125 grs.                               33
Huevos ecológicos (6 unidades)                          33
Manzana Golden - 4 unidades                             33
Nueces California sin cáscara - 125 grs.                32
Calabacín - 500 grs.                                    29
Tomate Monterosa - 500 grs.                             28
Cebolla Reca para guisar (1 Kg. = 3/4 uds.)             27
Calabacines - 500 grs. (2-3 unidades)                   26
Fresón Maresme - 1 caja (500 grs.)                      

After analysing all the patterns we could find, we simplify the description of each item and create a tag list. Below is an example of the search for patterns: the descriptions starting with 'Brocheta' that contain 'pollo' will be tagged as 'Pollo', while the remaining one ('Brocheta de fruta') will be tagged as 'Others', since there is no 'fruta' item.

In [153]:
# Descriptions whose text begins with 'Brocheta'.
starts_with_brocheta = items['Lineitem name'].str.startswith('Brocheta')
items.loc[starts_with_brocheta, 'Lineitem name']

69      Brochetas de pechuga de pollo a las finas hier...
998     Brochetas de pechuga de pollo barbacoa - 5 uni...
1010    Brochetas de pechuga de pollo barbacoa - 5 uni...
2519                       Brocheta de fruta - 3 unidades
Name: Lineitem name, dtype: object

In [154]:
# Descriptions whose text begins with 'Virutas'.
starts_with_virutas = items['Lineitem name'].str.startswith('Virutas')
items.loc[starts_with_virutas, 'Lineitem name']

1043    Virutas de Jamón de bellota - 150 grs.
Name: Lineitem name, dtype: object

This column contains too much information, so we will split the content and create tags. The rest of the information will go in 'Notes'.

In [161]:
# Reduce the information in each item description to a single tag naming
# the main food item.

# First words that do not name the food item by themselves (plurals, cuts,
# cheese varieties, spelling variants, ...) mapped to the tag replacing them.
# Any first word not listed here is kept unchanged as the tag.
SPECIAL_TAGS = {
    'Cabezas': 'Ajo',
    'Cabeza': 'Ajo',
    'Rulo': 'Queso',
    'Hamburguesa': 'Ternera',
    'Manchego': 'Queso',
    'Medio': 'Conejo',
    'Brie': 'Queso',
    'Emmental': 'Queso',
    'Cammenbert': 'Queso',
    'Comté': 'Queso',
    'Colitas': 'Rape',
    'Copos': 'Avena',
    'Tapa': 'Ternera',
    'Producto': 'Others',
    'Picos': 'Pan',
    'Tostaditas': 'Pan',
    'Barra': 'Pan',
    '1': 'Pan',
    'Alcachofas': 'Alcachofa',
    'Azuqui': 'Azuki',
    'Baguet': 'Pan',
    'Berenjenas': 'Berenjena',
    'Broquil': 'Brócoli',
    'Calabacines': 'Calabacín',
    'Calamares': 'Calamar',
    'Ciruelas': 'Ciruela',
    'Codornices': 'Codorniz',
    'Coles': 'Col',
    'Conill': 'Conejo',
    'Cous': 'Cous-cous',
    'Esparragos': 'Esparrago',
    'Feta': 'Queso',
    'Filetes': 'Filet',
    'Fresones': 'Fresón',
    'Garbanzos': 'Garbanzo',
    'Idiazábal': 'Queso',
    'Judías': 'Judía',
    'Lentejas': 'Lenteja',
    'Libritos': 'Librito',
    'Manzanas': 'Manzana',
    'Mejillones': 'Mejillón',
    'Naranjas': 'Naranja',
    'Nueces': 'Nuez',
    'Pasas': 'Pasa',
    'Pimientos': 'Pimiento',
    'Puerros': 'Puerro',
    'Salmon': 'Salmón',
    'Tapaplana': 'Ternera',
    'Tomates': 'Tomate',
    'Zanahorias': 'Zanahoria',
    'Boloñesa': 'Macarrones',
    'Brocheta': 'Others',
    'Cabra': 'Queso',
    'Caprice': 'Queso',
    'Carne': 'Ternera',
    'Cola': 'Rape',
    'Fiambre': 'Pavo',
    'Flores': 'Queso',
    'Hamburg.': 'Others',
    'Hamburguesas': 'Ternera',
    'Lomo': 'Cerdo',
    'Lomos': 'Conejo',
    'Mediana': 'Conejo',
    'Mini-hamburguesas': 'Ternera',
    'Mini-harmburguesas': 'Ternera',
    'Muslitos': 'Pollo',
    'Muslos': 'Pollo',
    'Oveja': 'Queso',
    'Pechuga': 'Pollo',
    'Pechugas': 'Pollo',
    'Tacos': 'Cerdo',
    'Tall': 'Ternera',
    'Bandeja': 'Pollo',
    'Chuleta': 'Ternera',
    'Culata': 'Ternera',
    'Espalda': 'Conejo',
    'Lote': 'Others',
    'Mitjana': 'Ternera',
    'Pepinos': 'Pepino',
    'Virutas': 'Jamón',
    'Ajos': 'Ajo',
}

# Collapse every description mentioning 'pollo' or 'pavo' into a single
# label, as these items are quite recurrent and varied. A membership test is
# used instead of `find(...) > 0`, which would skip a match at position 0.
mylst = [
    'Pollo' if 'pollo' in name else ('Pavo' if 'pavo' in name else name)
    for name in items['Lineitem name']
]

# The first word describes the main food item for most descriptions; the
# special cases are resolved through the lookup table above.
tags = [
    SPECIAL_TAGS.get(first_word, first_word)
    for first_word in (entry.split(' ')[0] for entry in mylst)
]

In [162]:
# Attach the computed tag list to the table as the 'tags' column.
items = items.assign(tags=tags)

In [163]:
# Alphabetical list of the distinct tags, to review the result by eye.
sorted(set(items['tags']))

['Aceite',
 'Acelgas',
 'Agua',
 'Aguacate',
 'Ajo',
 'Albaricoque',
 'Albóndigas',
 'Alcachofa',
 'Almejas',
 'Almendra',
 'Alubia',
 'Anacardo',
 'Apio',
 'Arroz',
 'Arándanos',
 'Atun',
 'Avellana',
 'Avena',
 'Azuki',
 'Bananas',
 'Berberechos',
 'Berenjena',
 'Bonito',
 'Boquerones',
 'Brócoli',
 'Bull',
 'Butifarra',
 'Caballa',
 'Cacahuetes',
 'Calabacín',
 'Calabaza',
 'Calamar',
 'Caldo',
 'Calçots',
 'Canelones',
 'Canónigos',
 'Cebolla',
 'Cep',
 'Cerdo',
 'Cerezas',
 'Cesta',
 'Champiñón',
 'Chirimoya',
 'Chocolate',
 'Chorizo',
 'Ciruela',
 'Codorniz',
 'Cogollos',
 'Col',
 'Coliflor',
 'Conejo',
 'Confit',
 'Costillas',
 'Cous-cous',
 'Croquetas',
 'Dorada',
 'Dátiles',
 'Endivias',
 'Eneldo',
 'Ensalada',
 'Entrecot',
 'Escarola',
 'Espaguetis',
 'Esparrago',
 'Espectec',
 'Espinacas',
 'Filet',
 'Frambuesas',
 'Fresón',
 'Frijoles',
 'Fuet',
 'Gamba',
 'Garbanzo',
 'Girgola',
 'Granada',
 'Granola',
 'Higos',
 'Huevos',
 'Hummus',
 'Jamón',
 'Judía',
 'Kakis',
 'Kiwi',


In [23]:
# Non-null count of every column within each tag group.
items.groupby(by='tags').count()

Unnamed: 0_level_0,Name,Lineitem quantity,Lineitem name,Lineitem price,Lineitem fulfillment status,Cancelled at,Refunded Amount,Vendor,Lineitem discount,37,...,2113,2139,2165,2167,2212,2244,2245,2257,2442,2443
tags,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Aceite,8,8,8,8,8,8,8,8,8,8,...,8,8,8,8,8,8,8,8,8,8
Acelgas,28,28,28,28,28,28,28,28,28,28,...,28,28,28,28,28,28,28,28,28,28
Agua,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
Aguacate,45,45,45,45,45,45,45,45,45,45,...,45,45,45,45,45,45,45,45,45,45
Ajos,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
Albaricoque,10,10,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,10,10,10,10
Albóndigas,27,27,27,27,27,27,27,27,27,27,...,27,27,27,27,27,27,27,27,27,27
Alcachofa,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
Alcachofas,8,8,8,8,8,8,8,8,8,8,...,8,8,8,8,8,8,8,8,8,8


#### 'Lineitem price' --OK

Looks good

#### 'Lineitem requires shipping' -- OK

In [None]:
# 'Lineitem requires shipping' is not among the columns of `items` listed
# earlier in this notebook, so plain indexing would raise a KeyError.
# DataFrame.get falls back to an empty Series, which keeps the cell runnable
# and displays an empty count when the column is absent.
items.get('Lineitem requires shipping', pd.Series(dtype='object')).value_counts()

####  'Lineitem taxable' -- OK

In [None]:
# 'Lineitem taxable' is not among the columns of `items` listed earlier in
# this notebook, so plain indexing would raise a KeyError. DataFrame.get
# falls back to an empty Series, which keeps the cell runnable and displays
# an empty count when the column is absent.
items.get('Lineitem taxable', pd.Series(dtype='object')).value_counts()

#### 'Lineitem fulfillment status' -- OK

In [None]:
# Frequency of each fulfillment status.
items.loc[:, 'Lineitem fulfillment status'].value_counts()

# Load data

In [None]:
from sqlalchemy import create_engine


# Database connection settings for writing the cleaned table back.
driver = 'mysql+pymysql:'
user = 'adria'
# The password is not stored in the notebook: it is read from the
# VIMET_DB_PASSWORD environment variable, and asked for interactively
# only when that variable is not set.
password = os.environ.get('VIMET_DB_PASSWORD') or getpass('Database password: ')
ip = '35.187.114.125'
database = 'vimet'

# Percent-encode the credentials so that characters such as '@' or '/'
# cannot break the structure of the connection URL.
connection_string = f'{driver}//{quote_plus(user)}:{quote_plus(password)}@{ip}/{database}'
engine = create_engine(connection_string)

In [None]:
# Write the cleaned frame to the 'items' table; if_exists='replace'
# overwrites the table that is already there.
items.to_sql(name='items', con=engine, if_exists='replace')