## Importação de bibliotecas e dados externos

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# 'ansi' is only registered as a codec on Windows (alias of 'mbcs'), so the
# original call raises LookupError on Linux/macOS. cp1252 is the Western
# European Windows code page and is available on every platform.
# NOTE(review): assumes the CSV was saved with code page 1252 — confirm.
shark_attack = pd.read_csv('archive.zip', compression='zip', encoding='cp1252')
shark_attack.tail()

### Verificando DataFrame

In [None]:
# print the column names, dtypes and non-null counts of the raw table
shark_attack.info()

In [None]:
# raise the number of dataframe columns shown when a frame is displayed
pd.set_option('display.max_columns', 25)

In [None]:
# listing the column labels to spot errors in their names

shark_attack.columns

## Tratamento de linhas nulas

In [None]:
# add a column holding, for every row, how many of its fields are null
shark_attack['Vazio'] = shark_attack.isnull().sum(axis=1)

In [None]:
# preview the first rows of the raw table
shark_attack.head()

In [None]:
# how many rows share each count of empty fields
shark_attack['Vazio'].value_counts()

In [None]:
# build the working dataframe: rows with more than 20 empty columns are
# discarded, i.e. every row with at most 20 empty columns is kept
f_shark = shark_attack.drop(
    index=shark_attack.index[shark_attack['Vazio'] > 20]
)

In [None]:
# checking the number of rows for each count of empty columns

f_shark['Vazio'].value_counts()

## Escolha de colunas relevantes

In [None]:
# frequency of each non-null value found in the 'Unnamed: 23' column
f_shark['Unnamed: 23'].value_counts()

In [None]:
# show the full rows in which 'Unnamed: 23' actually holds a value
f_shark[f_shark['Unnamed: 23'].notna()]

In [None]:
# selecting the relevant columns
# .copy() makes f_shark an independent frame, so the in-place assignments made
# on it afterwards do not trigger pandas' SettingWithCopyWarning

f_shark = f_shark[['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity', 'Name', 'Sex ', 'Age', 'Injury',
             'Fatal (Y/N)', 'Time', 'Species ']].copy()

In [None]:
# give the columns short lower-case labels (assigned by position)
f_shark.columns = [
    'date', 'year', 'type',
    'country', 'area', 'location',
    'activity', 'name', 'sex', 'age',
    'injury', 'fatal', 'time', 'species',
]

In [None]:
# preview the last rows of the working table
f_shark.tail()

## Tratamento de ano

In [None]:
# dates of the rows whose year is missing but could be taken from the date column
f_shark.loc[f_shark['date'].notna() & f_shark['year'].isna(), 'date']

In [None]:
# filling the missing years with the last four characters of the date
# The mask is built once, and the text slice is converted to a number before
# being stored, so no strings are written into the year column; a slice that
# is not numeric becomes NaN instead.
# NOTE(review): assumes 'year' is a numeric column at this point — confirm.
missing_year = f_shark['date'].notna() & f_shark['year'].isna()
f_shark.loc[missing_year, 'year'] = pd.to_numeric(
    f_shark.loc[missing_year, 'date'].str.slice(start=-4),
    errors='coerce',
)

In [None]:
# re-check: dates of rows that have a date but still lack a year
f_shark.loc[f_shark['date'].notna() & f_shark['year'].isna(), 'date']

In [None]:
# converting the year column to integers
# (astype(int) raises if any value is still missing or is not numeric)

f_shark['year'] = f_shark['year'].astype(int)

In [None]:
# print the column names, dtypes and non-null counts of the working table
f_shark.info()

## Tratamento de país

In [None]:
# display the country column
f_shark['country']

In [None]:
# rows with no country but with an area and/or a location recorded
f_shark.loc[
    f_shark['country'].isna()
    & (f_shark['area'].notna() | f_shark['location'].notna()),
    ['country', 'area', 'location'],
]

In [None]:
# fill null 'country' values from 'area' first and, where that is also null,
# from 'location' (values are matched row by row through the index)
f_shark['country'] = (
    f_shark['country']
    .fillna(f_shark['area'])
    .fillna(f_shark['location'])
)

In [None]:
# normalising country names

# Remove everything from the first '/' or '(' onwards, together with one space
# right before it. str() turns missing values into the literal text 'nan'.
# The pattern is a raw string so that '\/' and '\(' are not treated as
# (invalid) string escape sequences; the regex itself is unchanged.
f_shark['country'] = f_shark['country'].apply(lambda place: re.sub(r' ?[\/\(].*', '', str(place)))
f_shark['country'] = f_shark['country'].str.lstrip(' ')
# rstrip takes a set of characters, not a pattern: trailing '[', ' ', '?' and
# ']' characters are all removed here
f_shark['country'] = f_shark['country'].str.rstrip('[ ?]')

In [None]:
# number of rows left without any place information
# (their country is the literal text 'nan')
int((f_shark['country'] == 'nan').sum())

In [None]:
def class_mar(pais):
    '''
    Return the sea/ocean label for a country name, or None when unknown.

    The lookup ignores case and surrounding whitespace. Anything that is not
    a string yields None.

    NOTE(review): the original draft built these tables but never returned
    anything, and only listed 'hong kong'. The labels returned here are the
    draft's own table names; the remaining tables still have to be filled in.
    '''
    mares = {
        'atlantico': [],
        'pacifico': ['hong kong'],
        'indico': [],
        'mediterraneo': [],
    }

    if not isinstance(pais, str):
        return None

    chave = pais.strip().lower()
    for mar, paises in mares.items():
        if chave in paises:
            return mar
    return None

In [None]:
# classify every row's country into a sea with class_mar
# (the original assignment had no right-hand side, which is a SyntaxError)
f_shark['sea'] = f_shark['country'].apply(class_mar)

## Tratamento de mês

In [None]:
def class_per(data):
    '''
    Return the first month abbreviation or season name found in ``data``.

    Matching is case-insensitive and the result is lower-case. Months are
    looked for first; seasons are only considered when no month is present.
    Returns None when ``data`` is not a string or names neither.
    '''
    pad_mes = 'jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec'
    pad_est = 'fall|summer|winter|spring'

    if not isinstance(data, str):
        return None

    texto = data.lower()
    for padrao in (pad_mes, pad_est):
        achado = re.search(padrao, texto)
        if achado:
            return achado.group(0)
    return None

In [None]:
pat_month = 'jan|feb|mar|ap[r]?|may|jun|jul|aug|sep|oct|nov|dec|fall|summer|winter|spring'


def _first_period(date):
    '''Return the first month/season token of the lower-cased date, or 0 if there is none.'''
    found = re.search(pat_month, date.lower())
    if found is None:
        return 0
    return found.group(0)


# month (or season) named in each date; 0 marks dates that name neither
f_shark['month'] = f_shark['date'].apply(_first_period)

In [None]:
# frequency of each value in the month column
f_shark['month'].value_counts()

In [None]:
# store the date text with every four-digit run (the year) removed
# The pattern is a raw string so that '\d' is not an invalid string escape.
# NOTE(review): this replaces whatever the 'month' column held before — confirm
# that overwriting it is intended.
f_shark['month'] = f_shark['date'].apply(lambda date: re.sub(r'\d{4}', '', date))

In [None]:
# display the month column
f_shark['month']

In [None]:
# for dates whose month is 0, list the two-digit groups enclosed by dots
# The pattern is a raw string so that '\.' and '\d' are not invalid string
# escapes.
f_shark.loc[f_shark['month']== 0, 'date'].apply(lambda date: re.findall(r'\.(\d{2})\.', date))

In [None]:
# for dates whose month is 0, count the distinct texts left once every
# four-digit run is removed
# The pattern is a raw string so that '\d' is not an invalid string escape.
f_shark.loc[f_shark['month']==0, 'date'].apply(lambda date: re.sub(r'\d{4}', '', date)).value_counts()

## Tratamento de gênero

In [None]:
# every recorded value other than 'M' or 'F' is overwritten with 'M';
# missing values are left untouched
f_shark.loc[f_shark['sex'].notna() & ~f_shark['sex'].isin(['M', 'F']), 'sex'] = 'M'

In [None]:
# name and injury of the rows that have no sex recorded
f_shark.loc[f_shark['sex'].isna(), ['name', 'injury']]

In [None]:
# list the distinct non-null values present in the sex column
print(f_shark['sex'].value_counts().index)

In [None]:
# raw rows (relevant columns only) in which the name or the sex is recorded
shark_attack.loc[
    shark_attack['Name'].notna() | shark_attack['Sex '].notna(),
    ['Date', 'Year', 'Type', 'Country', 'Area', 'Location', 'Activity',
     'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time', 'Species '],
]