In [179]:
import pandas as pd
import numpy as np
from slugify import slugify
pd.set_option('display.max_columns', None)
pd. set_option("display.max_rows", None)
from Levenshtein import distance

## I. Selecionar os dados: determinar quais conjuntos de dados serão utilizados e documentar os motivos de inclusão/exclusão.

In [180]:
df = pd.read_csv("../data/raw/iowa_liquor_train_test_split_sample.csv", usecols=[2,10,20,22,23],encoding='utf-8')

In [181]:
df.columns = [slugify(col, lowercase=True, separator='_') for col in df.columns]
df.columns

Index(['date', 'county', 'state_bottle_retail', 'sale_dollars',
       'volume_sold_liters'],
      dtype='object')

df.head(1)

### Remoção das colunas as quais não iremos utilizar
#### Vamos deixar somente as colunas data, store_location, state_bottle_retail, salle_dollars, volume_sold_liters, pois vamos realizar regressão linear em dados numéricos

In [182]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2518216 entries, 0 to 2518215
Data columns (total 5 columns):
 #   Column               Dtype  
---  ------               -----  
 0   date                 object 
 1   county               object 
 2   state_bottle_retail  object 
 3   sale_dollars         object 
 4   volume_sold_liters   float64
dtypes: float64(1), object(4)
memory usage: 96.1+ MB


In [183]:
df.head()

Unnamed: 0,date,county,state_bottle_retail,sale_dollars,volume_sold_liters
0,02/26/2013,Scott,$8.99,$8.99,0.38
1,08/22/2017,WARREN,$47.99,$287.94,1.75
2,08/28/2013,Buena Vista,$14.82,$29.64,1.5
3,03/26/2013,Carroll,$22.72,$22.72,1.0
4,08/17/2017,POLK,$17.25,$17.25,0.5


In [184]:
df.shape

(2518216, 5)

In [185]:
df[df['date'].isna()].shape

(0, 5)

In [186]:
df[df['county'].isna()].shape

(15650, 5)

In [187]:
df[df['state_bottle_retail'].isna()].shape

(2, 5)

In [188]:
df[df['sale_dollars'].isna()].shape

(2, 5)

In [189]:
df[df['volume_sold_liters'].isna()].shape

(0, 5)

#### Procurando por mais inconsistências

## II. Limpar dados: Corrigir, imputar ou remover valores erroneamente inseridos nos conjuntos de dados.

In [190]:
df = df.dropna()

In [191]:
df['county']=df['county'].apply(slugify)

#### Transformando a coluna data para formato date

In [192]:
df['date'] = pd.to_datetime(df['date'])

In [193]:
df['county']=df['county'].apply(slugify)

In [194]:
df.head()

Unnamed: 0,date,county,state_bottle_retail,sale_dollars,volume_sold_liters
0,2013-02-26,scott,$8.99,$8.99,0.38
1,2017-08-22,warren,$47.99,$287.94,1.75
2,2013-08-28,buena-vista,$14.82,$29.64,1.5
3,2013-03-26,carroll,$22.72,$22.72,1.0
4,2017-08-17,polk,$17.25,$17.25,0.5


In [195]:
counties = df['county'].unique()

#### Procurando por mais inconsistências

##### condados com nome parecido

In [196]:
for a in counties:
    for b in reversed(counties):
        if distance(a,b)==1:
            print(a)

buena-vista
cerro-gordo
cerro-gord
buena-vist
o-brien
obrien


In [197]:
df['state_bottle_retail'] = df['state_bottle_retail'].replace({'\$':''}, regex = True).astype(float)

In [198]:
df['sale_dollars'] = df['sale_dollars'].replace({'\$':''}, regex = True).astype(float)

In [199]:
df['volume_sold_liters'] = df['volume_sold_liters'].astype(float)

In [200]:
df.head()

Unnamed: 0,date,county,state_bottle_retail,sale_dollars,volume_sold_liters
0,2013-02-26,scott,8.99,8.99,0.38
1,2017-08-22,warren,47.99,287.94,1.75
2,2013-08-28,buena-vista,14.82,29.64,1.5
3,2013-03-26,carroll,22.72,22.72,1.0
4,2017-08-17,polk,17.25,17.25,0.5


#### Utilizar os dados de 2016, pois é o mais recente e tem dados completos

In [201]:
df = df[df['date'].dt.year == 2016]

In [202]:
df.head()

Unnamed: 0,date,county,state_bottle_retail,sale_dollars,volume_sold_liters
10,2016-06-22,bremer,17.63,70.52,4.0
20,2016-12-16,hamilton,9.0,90.0,1.2
21,2016-12-22,scott,7.26,7.26,1.2
38,2016-01-19,page,15.68,376.32,42.0
39,2016-05-18,johnson,13.08,156.96,9.0


## III. Construir dados: derivar novos atributos que serão úteis. Por exemplo, derivar o IMC de alguém a partir da altura e peso.

In [203]:
county_group = df.groupby(by=["county"]).sum()

In [204]:
county_group.head()

Unnamed: 0_level_0,state_bottle_retail,sale_dollars,volume_sold_liters
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
adair,13989.11,78170.39,5304.73
adams,6022.85,16566.53,1207.21
allamakee,30333.55,146542.63,9515.25
appanoose,25195.24,148019.33,9113.52
audubon,4055.67,24182.22,2095.8


In [217]:
most_liters = county_group.nlargest(5, ['volume_sold_liters'])

In [218]:
most_liters.head()

Unnamed: 0_level_0,state_bottle_retail,sale_dollars,volume_sold_liters
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
polk,1279755.91,11106190.0,600075.43
linn,565423.56,4266498.0,253583.5
scott,403794.79,3319892.0,178965.03
johnson,335982.54,2929709.0,161005.7
black-hawk,361384.0,2786531.0,154994.32


In [220]:
most_liters = most_liters.sort_values('county')

In [221]:
most_liters.head()

Unnamed: 0_level_0,state_bottle_retail,sale_dollars,volume_sold_liters
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
black-hawk,361384.0,2786531.0,154994.32
johnson,335982.54,2929709.0,161005.7
linn,565423.56,4266498.0,253583.5
polk,1279755.91,11106190.0,600075.43
scott,403794.79,3319892.0,178965.03


## IV. Integrar dados: criar novos conjuntos de dados combinando dados de várias fontes.

In [208]:
pop_df = pd.read_csv("../data/raw/pop_cty_iowa.csv",usecols=['STNAME','CTYNAME','POPESTIMATE2016'])

In [209]:
pop_df['CTYNAME'] = pop_df['CTYNAME'].replace({' County':''}, regex = True).str.lower()

In [210]:
pop_df = pop_df[pop_df['STNAME']=="Iowa"]

In [211]:
pop_df['CTYNAME']=pop_df['CTYNAME'].apply(slugify)

In [212]:
pop_df['CTYNAME'].unique()

array(['iowa', 'adair', 'adams', 'allamakee', 'appanoose', 'audubon',
       'benton', 'black-hawk', 'boone', 'bremer', 'buchanan',
       'buena-vista', 'butler', 'calhoun', 'carroll', 'cass', 'cedar',
       'cerro-gordo', 'cherokee', 'chickasaw', 'clarke', 'clay',
       'clayton', 'clinton', 'crawford', 'dallas', 'davis', 'decatur',
       'delaware', 'des-moines', 'dickinson', 'dubuque', 'emmet',
       'fayette', 'floyd', 'franklin', 'fremont', 'greene', 'grundy',
       'guthrie', 'hamilton', 'hancock', 'hardin', 'harrison', 'henry',
       'howard', 'humboldt', 'ida', 'jackson', 'jasper', 'jefferson',
       'johnson', 'jones', 'keokuk', 'kossuth', 'lee', 'linn', 'louisa',
       'lucas', 'lyon', 'madison', 'mahaska', 'marion', 'marshall',
       'mills', 'mitchell', 'monona', 'monroe', 'montgomery', 'muscatine',
       'o-brien', 'osceola', 'page', 'palo-alto', 'plymouth',
       'pocahontas', 'polk', 'pottawattamie', 'poweshiek', 'ringgold',
       'sac', 'scott', 'shelby', '

In [213]:
pop_df = pop_df[ (pop_df['CTYNAME']=='polk') | (pop_df['CTYNAME']=='linn') | (pop_df['CTYNAME']=='scott') | (pop_df['CTYNAME']=='johnson') | (pop_df['CTYNAME']=='black-hawk')].groupby(by=["CTYNAME"])

In [214]:
pop_df.head()

Unnamed: 0,STNAME,CTYNAME,POPESTIMATE2016
811,Iowa,black-hawk,133077
856,Iowa,johnson,146928
861,Iowa,linn,222188
881,Iowa,polk,474277
886,Iowa,scott,172135


In [226]:
new_one["volume_sold_liters"] = most_liters['volume_sold_liters']

TypeError: 'DataFrameGroupBy' object does not support item assignment

In [None]:
most_liters.head()

In [None]:
liters_per_person.head()

## V.  Formatar dados: Formatar novamente os dados conforme as necessidades dos modelos.