In [28]:
import pandas as pd
import numpy as np
from slugify import slugify
# Lift pandas' display limits so wide/long frames are rendered without truncation.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
from Levenshtein import distance

## I. Selecionar os dados: determinar quais conjuntos de dados serão utilizados e documentar os motivos de inclusão/exclusão.

In [29]:
# Load only the five columns used by the analysis, selected by position in the raw file.
# After renaming they appear as date, county, state_bottle_retail, sale_dollars
# and volume_sold_liters.
df = pd.read_csv(
    "../data/raw/iowa_liquor_train_test_split_sample.csv",
    encoding='utf-8',
    usecols=[2, 10, 20, 22, 23],
)

In [30]:
# Normalise the header names to lowercase snake_case, then show the result.
df.columns = df.columns.map(lambda name: slugify(name, lowercase=True, separator='_'))
df.columns

Index(['date', 'county', 'state_bottle_retail', 'sale_dollars',
       'volume_sold_liters'],
      dtype='object')

# Peek at a single row to confirm the renamed columns.
df.head(n=1)

### Remoção das colunas que não iremos utilizar
#### Vamos manter somente as colunas date, county, state_bottle_retail, sale_dollars e volume_sold_liters, pois vamos realizar regressão linear em dados numéricos

In [31]:
# Summarise column dtypes and memory usage; the price columns are still
# stored as object (string) at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2518216 entries, 0 to 2518215
Data columns (total 5 columns):
 #   Column               Dtype  
---  ------               -----  
 0   date                 object 
 1   county               object 
 2   state_bottle_retail  object 
 3   sale_dollars         object 
 4   volume_sold_liters   float64
dtypes: float64(1), object(4)
memory usage: 96.1+ MB


In [32]:
# Show the first five raw rows (dates are still strings, prices carry a "$" prefix).
df.head(n=5)

Unnamed: 0,date,county,state_bottle_retail,sale_dollars,volume_sold_liters
0,02/26/2013,Scott,$8.99,$8.99,0.38
1,08/22/2017,WARREN,$47.99,$287.94,1.75
2,08/28/2013,Buena Vista,$14.82,$29.64,1.5
3,03/26/2013,Carroll,$22.72,$22.72,1.0
4,08/17/2017,POLK,$17.25,$17.25,0.5


In [33]:
# (rows, columns) of the raw sample before any cleaning.
df.shape

(2518216, 5)

In [34]:
# Number of rows (first element of the tuple) with a missing `date`.
df.loc[df['date'].isna()].shape

(0, 5)

In [35]:
# Number of rows (first element of the tuple) with a missing `county`.
df.loc[df['county'].isna()].shape

(15650, 5)

In [36]:
# Number of rows (first element of the tuple) with a missing `state_bottle_retail`.
df.loc[df['state_bottle_retail'].isna()].shape

(2, 5)

In [37]:
# Number of rows (first element of the tuple) with a missing `sale_dollars`.
df.loc[df['sale_dollars'].isna()].shape

(2, 5)

In [38]:
# Number of rows (first element of the tuple) with a missing `volume_sold_liters`.
df.loc[df['volume_sold_liters'].isna()].shape

(0, 5)

#### Procurando por mais inconsistências

## II. Limpar dados: Corrigir, imputar ou remover valores erroneamente inseridos nos conjuntos de dados.

In [39]:
# Discard every row that has a missing value in any column
# (per the checks above, mostly rows without a county).
df = df.dropna(axis=0, how='any')

#### Transformando a coluna date para o formato datetime

In [40]:
# Convert the `date` strings (MM/DD/YYYY, e.g. "02/26/2013") to datetime64.
# The format is stated explicitly so parsing does not rely on per-value
# inference: it is faster on millions of rows and a value in any other
# layout raises instead of being silently misread.
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')

In [41]:
# Normalise county names to lowercase, hyphen-separated slugs so that
# spellings differing only in case (e.g. "Scott" / "POLK") are unified.
df['county'] = df['county'].map(slugify)

In [42]:
# Check the parsed dates and slugified county names on the first five rows.
df.head(n=5)

Unnamed: 0,date,county,state_bottle_retail,sale_dollars,volume_sold_liters
0,2013-02-26,scott,$8.99,$8.99,0.38
1,2017-08-22,warren,$47.99,$287.94,1.75
2,2013-08-28,buena-vista,$14.82,$29.64,1.5
3,2013-03-26,carroll,$22.72,$22.72,1.0
4,2017-08-17,polk,$17.25,$17.25,0.5


In [None]:
# Distinct county slugs, in order of first appearance; used by the
# near-duplicate search in the next cell.
counties = pd.unique(df['county'])

#### Procurando por mais inconsistências

##### Condados com nomes parecidos (distância de Levenshtein igual a 1)

In [62]:
# Compare every county slug against every other one and print those that are
# exactly one edit away from another slug (likely misspellings or variants).
# A name is printed once for each near-match it has.
for name in counties:
    for other in counties[::-1]:
        if distance(name, other) != 1:
            continue
        print(name)

buena-vista
cerro-gordo
cerro-gord
buena-vist
o-brien
obrien


In [43]:
# Strip the dollar sign so the price strings can be cast to float.
# The pattern is written as a raw string: in a plain literal '\$' is an
# invalid escape sequence, which recent Python versions flag with a warning.
df['state_bottle_retail'] = df['state_bottle_retail'].replace({r'\$': ''}, regex=True).astype(float)

In [44]:
# Strip the dollar sign so the sale amounts can be cast to float.
# The pattern is written as a raw string: in a plain literal '\$' is an
# invalid escape sequence, which recent Python versions flag with a warning.
df['sale_dollars'] = df['sale_dollars'].replace({r'\$': ''}, regex=True).astype(float)

In [45]:
# Make the float64 dtype of the volume column explicit
# (df.info() above already reports it as float64).
df['volume_sold_liters'] = df['volume_sold_liters'].astype(np.float64)

In [46]:
# Confirm that the price columns are now numeric on the first five rows.
df.head(n=5)

Unnamed: 0,date,county,state_bottle_retail,sale_dollars,volume_sold_liters
0,2013-02-26,scott,8.99,8.99,0.38
1,2017-08-22,warren,47.99,287.94,1.75
2,2013-08-28,buena-vista,14.82,29.64,1.5
3,2013-03-26,carroll,22.72,22.72,1.0
4,2017-08-17,polk,17.25,17.25,0.5


## III. Construir dados: derivar novos atributos que serão úteis. Por exemplo, derivar o IMC de alguém a partir da altura e peso.

In [73]:
# Total the numeric columns per county.
# numeric_only=True keeps the datetime `date` column out of the aggregation:
# older pandas dropped it silently, but pandas >= 2.0 raises a TypeError
# when asked to sum datetimes.
# NOTE(review): the near-duplicate slugs found earlier (e.g. 'buena-vist' vs
# 'buena-vista') are not merged in the cells shown, so they form separate
# groups here; confirm whether they should be unified first.
county_group = df.groupby(by=["county"]).sum(numeric_only=True)

In [75]:
# The five counties with the highest total volume sold (in liters).
county_group.nlargest(n=5, columns=['volume_sold_liters'])

Unnamed: 0_level_0,state_bottle_retail,sale_dollars,volume_sold_liters
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
polk,6848168.0,64626760.0,3848694.0
linn,2973341.0,25643780.0,1683366.0
scott,2220852.0,20605600.0,1247357.0
black-hawk,1901177.0,16819880.0,1048071.0
johnson,1911741.0,17396000.0,1035691.0


## IV. Integrar dados: criar novos conjuntos de dados combinando dados de várias fontes.

In [76]:
# Load the county population estimates for 2012-2017.
# The state name and every yearly estimate are read as strings;
# CTYNAME is left to pandas' default type inference.
pop_df = pd.read_csv(
    "../data/raw/pop_cty_iowa.csv",
    usecols=[
        'STNAME', 'CTYNAME',
        'POPESTIMATE2012', 'POPESTIMATE2013', 'POPESTIMATE2014',
        'POPESTIMATE2015', 'POPESTIMATE2016', 'POPESTIMATE2017',
    ],
    dtype=dict.fromkeys(
        [
            'STNAME',
            'POPESTIMATE2012', 'POPESTIMATE2013', 'POPESTIMATE2014',
            'POPESTIMATE2015', 'POPESTIMATE2016', 'POPESTIMATE2017',
        ],
        str,
    ),
)

In [78]:
# Remove the " County" suffix and lower-case the county names.
# NOTE(review): df['county'] is slugified (e.g. 'buena-vista'), whereas these
# names are only lower-cased, so multi-word counties may not match on a join;
# confirm how the two frames are keyed.
pop_df['CTYNAME'] = (
    pop_df['CTYNAME']
    .replace(to_replace=' County', value='', regex=True)
    .str.lower()
)

In [80]:
# Keep only the rows that belong to the state of Iowa.
# NOTE(review): the first row of the result (CTYNAME 'iowa', ~3.1M people)
# looks like the state-level total rather than a county; presumably it
# should be excluded before combining with county data. Confirm.
pop_df = pop_df.loc[pop_df['STNAME'].eq("Iowa")]

In [81]:
# Inspect the first five rows of the Iowa population table.
pop_df.head(n=5)

Unnamed: 0,STNAME,CTYNAME,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017
804,Iowa,iowa,3076190,3092997,3109350,3120960,3131371,3141550
805,Iowa,adair,7468,7387,7368,7145,7005,7051
806,Iowa,adams,3910,3891,3877,3754,3692,3657
807,Iowa,allamakee,14149,14071,14062,13874,13851,13803
808,Iowa,appanoose,12707,12654,12671,12577,12505,12353


## V.  Formatar dados: Formatar novamente os dados conforme as necessidades dos modelos.