In [2]:
import pandas as pd
import numpy as np
from slugify import slugify
pd.set_option('display.max_columns', None)
from Levenshtein import distance

## I. Selecionar os dados: determinar quais conjuntos de dados serão utilizados e documentar os motivos de inclusão/exclusão.

In [3]:
df = pd.read_csv("../data/raw/iowa_liquor_train_test_split_sample.csv",dtype={
    'Invoice/Item Number': str,
    'Date':str,
    'Store Number': float,
    'Store Name': str,
    'Address': str,
    'City': str,
    'Zip Code': str,
    'Store Location': str,
    'County Number': float,
    'County': str,
    'Category': float,
    'Category Name': str,
    'Vendor Number': float,
    'Vendor Name': str,
    'Item Number': float,
    'Item Description': str,
    'Pack': float,
    'Bottle Volume (ml)': int,
    'State Bottle Cost': str,
    'State Bottle Retail': str,
    'Bottles Sold': int,
    'Sale (Dollars)': str,
    'Volume Sold (Liters)': float,
    'Volume Sold (Gallons)': float
})

In [4]:
df.columns = [slugify(col, lowercase=True, separator='_') for col in df.columns]
df.columns

Index(['unnamed_0', 'invoice_item_number', 'date', 'store_number',
       'store_name', 'address', 'city', 'zip_code', 'store_location',
       'county_number', 'county', 'category', 'category_name', 'vendor_number',
       'vendor_name', 'item_number', 'item_description', 'pack',
       'bottle_volume_ml', 'state_bottle_cost', 'state_bottle_retail',
       'bottles_sold', 'sale_dollars', 'volume_sold_liters',
       'volume_sold_gallons'],
      dtype='object')

df.head(1)

### Remoção das colunas as quais não iremos utilizar
#### - as colunas unnamed_0, invoice_item_number são inúteis para a nossa análise
#### - as colunas store_number, store_name não serão utilizadas na predição
#### - as colunas address, zip_code podem ser recuperadas por store_location
#### - o restante das colunas não será utilizado, por não ser necessário ou ter dados que as substituem

In [5]:
df = df.drop(columns=['unnamed_0', 'invoice_item_number','store_number','store_name','address','zip_code','county_number','category','vendor_number','vendor_name','item_number','pack','bottle_volume_ml','state_bottle_cost','bottles_sold','volume_sold_gallons'])

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2518216 entries, 0 to 2518215
Data columns (total 9 columns):
 #   Column               Dtype  
---  ------               -----  
 0   date                 object 
 1   city                 object 
 2   store_location       object 
 3   county               object 
 4   category_name        object 
 5   item_description     object 
 6   state_bottle_retail  object 
 7   sale_dollars         object 
 8   volume_sold_liters   float64
dtypes: float64(1), object(8)
memory usage: 172.9+ MB


In [7]:
df.head()

Unnamed: 0,date,city,store_location,county,category_name,item_description,state_bottle_retail,sale_dollars,volume_sold_liters
0,02/26/2013,BLUE GRASS,"102 W MAYNE\nBLUE GRASS 52726\n(41.509114, -90...",Scott,STRAIGHT BOURBON WHISKIES,Wild Turkey 101 Round,$8.99,$8.99,0.38
1,08/22/2017,Indianola,1500 North Jefferson\nIndianola 50125\n,WARREN,Canadian Whiskies,Crown Royal Canadian Whisky,$47.99,$287.94,1.75
2,08/28/2013,STORM LAKE,"1250 N LAKE ST\nSTORM LAKE 50588\n(42.653594, ...",Buena Vista,DISTILLED SPIRITS SPECIALTY,Firefly Strawberry Moonshine,$14.82,$29.64,1.5
3,03/26/2013,MANNING,"316 MAIN ST\nMANNING 51455\n(41.908573, -95.06...",Carroll,IMPORTED SCHNAPPS,Rumple Minze Peppermint Schnapps Liqueur,$22.72,$22.72,1.0
4,08/17/2017,Des Moines,"218, 6th Ave #101\nDes Moines 50309\n(41.58568...",POLK,Mixto Tequila,Jose Cuervo Especial Reposado Tequila Mini,$17.25,$17.25,0.5


In [8]:
df.shape

(2518216, 9)

In [21]:
df[df['date'].isna()]

Unnamed: 0,date,city,store_location,county,category_name,item_description,state_bottle_retail,sale_dollars,volume_sold_liters


In [22]:
df[df['city'].isna()]

Unnamed: 0,date,city,store_location,county,category_name,item_description,state_bottle_retail,sale_dollars,volume_sold_liters
1941,2017-02-06,,,,Mixto Tequila,Jose Cuervo Especial Reposado Tequila Mini,$17.25,$17.25,1.50
12580,2017-04-14,,,,American Brandies,Paul Masson Grande Amber Brandy,$4.83,$115.92,0.37
15953,2016-12-06,,,,Tennessee Whiskies,Jack Daniels Old #7 Black Lbl,$6.35,$6.35,1.20
24090,2016-10-12,,,,Spiced Rum,Captain Morgan Spiced Rum,$17.63,$211.56,1.00
25341,2017-02-02,,,,Tennessee Whiskies,Jack Daniels Old #7 Black Lbl,$6.35,$6.35,2.40
...,...,...,...,...,...,...,...,...,...
2499919,2016-12-08,,,,American Vodka,Five O'clock Vodka,$5.06,$60.72,0.75
2502910,2017-06-07,,,,American Flavored Vodka,New Amsterdam Apple Vodka,$11.25,$135.00,0.75
2503271,2017-02-09,,,,Spiced Rum,Sailor Jerry Spiced Navy Rum,$7.20,$7.20,1.50
2504150,2016-12-13,,,,Coffee Liqueurs,Kahlua Coffee Liqueur Mini,$9.90,$9.90,1.00


In [23]:
df[df['store_location'].isna()]

Unnamed: 0,date,city,store_location,county,category_name,item_description,state_bottle_retail,sale_dollars,volume_sold_liters
1941,2017-02-06,,,,Mixto Tequila,Jose Cuervo Especial Reposado Tequila Mini,$17.25,$17.25,1.50
12580,2017-04-14,,,,American Brandies,Paul Masson Grande Amber Brandy,$4.83,$115.92,0.37
15953,2016-12-06,,,,Tennessee Whiskies,Jack Daniels Old #7 Black Lbl,$6.35,$6.35,1.20
24090,2016-10-12,,,,Spiced Rum,Captain Morgan Spiced Rum,$17.63,$211.56,1.00
25341,2017-02-02,,,,Tennessee Whiskies,Jack Daniels Old #7 Black Lbl,$6.35,$6.35,2.40
...,...,...,...,...,...,...,...,...,...
2499919,2016-12-08,,,,American Vodka,Five O'clock Vodka,$5.06,$60.72,0.75
2502910,2017-06-07,,,,American Flavored Vodka,New Amsterdam Apple Vodka,$11.25,$135.00,0.75
2503271,2017-02-09,,,,Spiced Rum,Sailor Jerry Spiced Navy Rum,$7.20,$7.20,1.50
2504150,2016-12-13,,,,Coffee Liqueurs,Kahlua Coffee Liqueur Mini,$9.90,$9.90,1.00


In [24]:
df[df['county'].isna()]

Unnamed: 0,date,city,store_location,county,category_name,item_description,state_bottle_retail,sale_dollars,volume_sold_liters
112,2016-07-29,Hampton,721 Central Avenue West\nHampton 50441\n(42.74...,,CANADIAN WHISKIES,Black Velvet,$14.93,$89.58,10.50
270,2016-03-02,DAVENPORT,"2351 W LOCUST\nDAVENPORT 52804\n(41.538207, -9...",,SPICED RUM,Admiral Nelson Spiced Rum Mini,$5.67,$5.67,0.50
301,2016-07-29,Des Moines,"1516, SE 1st St\nDes Moines 50315\n(41.575862,...",,SCOTCH WHISKIES,Lauder's,$4.82,$9.64,0.75
373,2016-04-04,DUNLAP,"117 IOWA AVE\nDUNLAP 712-2\n(41.854728, -95.60...",,IMPORTED VODKA,Grey Goose Vodka,$27.74,$166.44,4.50
462,2016-08-10,Davenport,"2351 W Locust\nDavenport 52804\n(41.538207, -9...",,CANADIAN WHISKIES,Windsor Canadian,$6.30,$75.60,9.00
...,...,...,...,...,...,...,...,...,...
2517827,2016-06-15,Davenport,"2351 W Locust\nDavenport 52804\n(41.538207, -9...",,DISTILLED SPIRITS SPECIALTY,Midnight Moon Cherry,$17.25,$103.50,4.50
2517949,2016-06-21,Cresco,"708, 2nd Ave SE\nCresco 52136\n(43.37113, -92....",,SCOTCH WHISKIES,J & B Rare Scotch,$22.47,$44.94,2.00
2518139,2016-04-12,MARSHALLTOWN,"114, N CENTER ST\nMARSHALLTOWN 50158\n(42.0516...",,VODKA 80 PROOF,Hawkeye Vodka,$5.01,$60.12,9.00
2518145,2016-03-03,MASON CITY,"2771, 4th ST SW\nMASON CITY 50401\n(43.148281,...",,SCOTCH WHISKIES,Lauder's Scotch,$7.70,$15.40,1.50


In [36]:
df[df['category_name'].isna()]

Unnamed: 0,date,city,store_location,county,category_name,item_description,state_bottle_retail,sale_dollars,volume_sold_liters
979,2016-02-22,DES MOINES,"1460 2ND AVE\nDES MOINES 50314\n(41.60566, -93...",Polk,,Peligroso Silver,$22.50,$22.50,0.75
1328,2013-11-19,SHELDON,"1989 PARK ST\nSHELDON 51201\n(43.186038, -95.8...",O'Brien,,Evan Williams Egg Nog,$6.69,$80.28,9.00
2081,2016-02-24,DES MOINES,1434 DES MOINES ST STE 5\nDES MOINES 50316\n(4...,Polk,,D'usse VSOP,$24.51,$73.53,1.12
2943,2016-02-03,BOONE,"1111 8TH ST\nBOONE 50036\n(42.063833, -93.876651)",Boone,,Hennessy Vs Cognac,$28.49,$341.88,9.00
2962,2016-02-25,DAVENPORT,"4064 E 53RD ST\nDAVENPORT 52807\n(41.574973, -...",Scott,,Hennessy VS,$14.99,$149.90,3.75
...,...,...,...,...,...,...,...,...,...
2513367,2016-02-04,DES MOINES,3500 INGERSOLL AVE\nDES MOINES 50312\n(41.5863...,Polk,,Galliano Liqueur,$16.13,$32.26,0.75
2514250,2017-02-22,Altoona,"100 8th Street SW\nAltoona 50009\n(41.644041, ...",POLK,,Pikesville Rye 110prf HA,$37.44,$37.44,3.00
2514313,2016-06-28,Cedar Rapids,235 Edgewood Rd NE\nCedar Rapids 52405\n,Linn,,Sailor Jerry Spiced Navy Rum,$14.48,$72.40,3.75
2517418,2015-10-12,WASHINGTON,"106 W 2ND ST\nWASHINGTON 52353\n(41.300399, -9...",Washington,,Bacardi Oakheart,$21.00,$126.00,10.50


In [39]:
df = df.dropna()

## II. Limpar dados: Corrigir, imputar ou remover valores erroneamente inseridos nos conjuntos de dados.

#### Transformando a coluna data para formato date

In [40]:
df['date'] = pd.to_datetime(df['date'])

#### Padronizando valores

In [41]:
df['city']=df['city'].apply(slugify)

In [42]:
df['store_location']=df['store_location'].apply(slugify)

In [43]:
df['county']=df['county'].apply(slugify)

In [44]:
df['category_name']=df['category_name'].apply(slugify)

In [46]:
df['store_location']=df['store_location'].apply(slugify)

In [48]:
df['item_description']=df['item_description'].apply(slugify)

In [49]:
df.head()

Unnamed: 0,date,city,store_location,county,category_name,item_description,state_bottle_retail,sale_dollars,volume_sold_liters
0,2013-02-26,blue-grass,102-w-mayne-blue-grass-52726-41-509114-90-765769,scott,straight-bourbon-whiskies,wild-turkey-101-round,$8.99,$8.99,0.38
1,2017-08-22,indianola,1500-north-jefferson-indianola-50125,warren,canadian-whiskies,crown-royal-canadian-whisky,$47.99,$287.94,1.75
2,2013-08-28,storm-lake,1250-n-lake-st-storm-lake-50588-42-653594-95-2...,buena-vista,distilled-spirits-specialty,firefly-strawberry-moonshine,$14.82,$29.64,1.5
3,2013-03-26,manning,316-main-st-manning-51455-41-908573-95-065122,carroll,imported-schnapps,rumple-minze-peppermint-schnapps-liqueur,$22.72,$22.72,1.0
4,2017-08-17,des-moines,218-6th-ave-101-des-moines-50309-41-585684-93-...,polk,mixto-tequila,jose-cuervo-especial-reposado-tequila-mini,$17.25,$17.25,0.5


#### Procurando por mais inconsistências

##### cidades e condados com nome parecido

In [55]:
cities = sorted(df['city'].unique())
for r in reversed(cities):
    for i in cities:
        if distance(r,i)>0 and distance(r,i)<2:
            print(i)

creston
ottumwa
otumwa
le-mars
le-claire
lemars
leclaire
kellog
kellogg
guttenberg
guttenburg
grand-mound
grand-mounds
preston
clear-lake
clearlake
arnold-s-park
arnolds-park
afton
alton


In [60]:
counties = sorted(df['county'].unique())
for s in reversed(counties):
    for j in counties:
        if distance(s,j)>0 and distance(s,j)<2:
            print (s)

obrien
o-brien
cerro-gordo
cerro-gord
buena-vista
buena-vist


## III. Construir dados: derivar novos atributos que serão úteis. Por exemplo, derivar o IMC de alguém a partir da altura e peso.

In [None]:
# coordenadas geográficas

## IV. Integrar dados: criar novos conjuntos de dados combinando dados de várias fontes.

## V.  Formatar dados: Formatar novamente os dados conforme as necessidades dos modelos.