# Shark Attack !!!

## Cleaning...

vamos a limpiar los datos de los tiburones, let´s go!!! 

In [52]:
import pandas as pd
import numpy as np
import cleaning_functions as cf

### cargamos el csv:

In [53]:
df = pd.read_csv("DATA/attacks.csv",encoding = "ISO-8859-1")

### hay muchas columnas que no me importan, las elimino:

In [54]:
df.drop(["Name",'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'], axis=1, inplace=True)

### tambien eliminamos las filas con TODO NaN, y los duplicados:

In [55]:
df.dropna(axis=0, how='all', inplace=True)

In [56]:
df.drop_duplicates(keep='first', inplace=True)

### ya tenemos el dataframe un poco más manejable:

In [57]:
df.head(3)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,14h00 -15h00,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48,Injury to left lower leg from surfboard skeg,N,07h45,


# 1ª hipótesis: "los ataques son más frecuentes en verano"

### queremos sacar una nueva columna para el mes, a partir de la columna "Date"
#### para ello usaremos nuestras funciones "sacar_mes" y "limpiar_mes" :


In [61]:
df["Month"]=df.Date.apply(cf.sacar_mes)

In [62]:
df["clean_month"]=df.Month.apply(cf.limpiar_mes)

In [63]:
df.head(3)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Month,clean_month
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,Jun,Jun
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,14h00 -15h00,,Jun,Jun
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,Jun,Jun


### el problema es que para definir qué es "verano" necesitamos saber en qué hemisferio estamos.
#### lo sacaremos a través de la columna "Area":

In [66]:
len(list(df.Area.unique()))

826

#### 826 valores son demasiados para meterlos a mano!
#### no lo haremos con los registros que supongan menos de un 20%

In [64]:
df.Area.value_counts().head(60)

Florida                              1037
New South Wales                       486
Queensland                            311
Hawaii                                298
California                            290
KwaZulu-Natal                         213
Western Cape Province                 195
Western Australia                     189
Eastern Cape Province                 160
South Carolina                        160
South Australia                       104
North Carolina                        101
Victoria                               90
Pernambuco                             74
Texas                                  73
Torres Strait                          70
North Island                           67
New Jersey                             52
Tasmania                               41
South Island                           40
New York                               30
Oregon                                 29
Abaco Islands                          23
Northern Territory                

In [68]:
areas=pd.DataFrame(df.Area.value_counts())

In [69]:
areas["Pareto"]=areas.Area/(df.shape[0])*100
areas["Accumu"]=0.0

In [71]:
for i in range(825):
    k=0
    for j in range(i+1):
        k+=areas.Pareto[j]
    areas["Accumu"][i]=k

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  areas["Accumu"][i]=k


In [73]:
areas.head(50)


Unnamed: 0,Area,Pareto,Accumu
Florida,1037,16.449873,16.449873
New South Wales,486,7.709391,24.159264
Queensland,311,4.933376,29.09264
Hawaii,298,4.727157,33.819797
California,290,4.600254,38.420051
KwaZulu-Natal,213,3.378807,41.798858
Western Cape Province,195,3.093274,44.892132
Western Australia,189,2.998096,47.890228
Eastern Cape Province,160,2.538071,50.428299
South Carolina,160,2.538071,52.966371


### muy bien, lo haremos sólo para estos 50 primeros valores

#### creamos una lista con los mares asociados a cada área:

In [74]:
seas=["car","pas","pas","pan","pan","ind","ats","ind","ind","atn","ind","atn","pas","ats","car","pas","pas","atn","pas","pas","atn","pan","car","pas","pas","car","atn","pas","ind","atn","car","pas","car","pas","car","atn","pan","pan","ind","med","pas","car","ats","med","med","car","pas","car","pan","pas"]

#### y una nueva columna "Hemisferio" inicialmente sin datos:

In [75]:
areas["Sea"]="-"
for i in range(50):
    areas.Sea[i]=seas[i]
areas["Hemisphere"]="-"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  areas.Sea[i]=seas[i]


#### a través de un diccionario, asociamos mares y hemisferios:

In [76]:
hemidic={"car":"norte","atn":"norte","pan":"norte","med":"norte","ats":"sur","pas":"sur","ind":"sur","-":"-"}

In [77]:
areas.Hemisphere=areas.Sea.map(hemidic)

In [78]:
areas.head(20)

Unnamed: 0,Area,Pareto,Accumu,Sea,Hemisphere
Florida,1037,16.449873,16.449873,car,norte
New South Wales,486,7.709391,24.159264,pas,sur
Queensland,311,4.933376,29.09264,pas,sur
Hawaii,298,4.727157,33.819797,pan,norte
California,290,4.600254,38.420051,pan,norte
KwaZulu-Natal,213,3.378807,41.798858,ind,sur
Western Cape Province,195,3.093274,44.892132,ats,sur
Western Australia,189,2.998096,47.890228,ind,sur
Eastern Cape Province,160,2.538071,50.428299,ind,sur
South Carolina,160,2.538071,52.966371,atn,norte


In [84]:
areas.rename(columns={"Area":"cases"}, inplace=True)
areas.drop(['Pareto'], axis=1,inplace=True)

In [86]:
areas.head(10)

Unnamed: 0,cases,Accumu,Sea,Hemisphere
Florida,1037,16.449873,car,norte
New South Wales,486,24.159264,pas,sur
Queensland,311,29.09264,pas,sur
Hawaii,298,33.819797,pan,norte
California,290,38.420051,pan,norte
KwaZulu-Natal,213,41.798858,ind,sur
Western Cape Province,195,44.892132,ats,sur
Western Australia,189,47.890228,ind,sur
Eastern Cape Province,160,50.428299,ind,sur
South Carolina,160,52.966371,atn,norte


#### esta tabla además nos permitirá ver la distribución de ataques según el mar (2ª hipótesis)

### ahora cruzaremos los datos de "mes" con los de "hemisferio"

In [104]:
dic={}
for i in range(50):
    dic[list(areas.index)[i]]=list(areas.Hemisphere)[i]

In [112]:
dic["California"]

'norte'

In [117]:
df.Hemi=df.Area.map(dic)

In [118]:
df.head(3)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Month,clean_month,Hemi
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,Jun,Jun,norte
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,14h00 -15h00,,Jun,Jun,norte
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,Jun,Jun,norte


In [107]:
ver_nor={"Jan":}

NameError: name 'num_mont' is not defined

In [88]:
df.head(3)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Sex,Age,Injury,Fatal (Y/N),Time,Species,Month,clean_month
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,Jun,Jun
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,F,11,Minor injury to left thigh,N,14h00 -15h00,,Jun,Jun
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,Jun,Jun


In [89]:
df.clean_month.value_counts()

Jul     667
Aug     598
nulo    575
Sep     556
Jan     516
Jun     498
Apr     453
Oct     443
Dec     436
Mar     412
Nov     403
Feb     380
May     367
Name: clean_month, dtype: int64

In [39]:
# vamos a intentar sacar valores limpios de la especie:

In [40]:
dic1={"Species ":"Species"}

In [41]:
df.rename(columns=dic1, inplace=True)

In [42]:
list(df["Species"].unique())

['White shark',
 nan,
 '2 m shark',
 'Tiger shark, 3m',
 'Tiger shark',
 "Lemon shark, 3'",
 "Bull shark, 6'",
 'Grey reef shark',
 'Invalid incident',
 'Tawny nurse shark, 2m',
 'Shark involvement not confirmed',
 'Questionable',
 '3 m shark',
 'White shark, 3.5 m',
 'White shark, 2.5 m',
 "6' shark",
 'Juvenile bull shark',
 'Bull shark',
 "Tiger shark, 12'",
 'Wobbegong shark',
 '3.5 m shark',
 '1.8 m shark',
 'Blacktip shark',
 'Juvenile white shark,  2.7 to 3.2 m',
 'Bull shark, 2 m',
 'Possibly a wobbegong',
 'Injury believed caused by an eel, not a shark',
 'Galapagos shark?',
 '2m shark',
 'Bull shark, 3 m ',
 'Grey reef shark. 2 m',
 'small shark',
 'Wobbegong shark?',
 'Juvenile nurse shark',
 "Nurse shark. 5'",
 'Tiger shark, female',
 'Some drowned but other may have been killed by blue sharks',
 'White shark, 4.6 m',
 'Cookiecutter shark',
 'Wobbegong shark, 1 m',
 'White shark, 4.5 m',
 'Spinner shark, 4 to 5 feet',
 'Tiger shark, 8 to 10 feet',
 "8' shark",
 'Death may h

In [43]:
len(list(df["Species"].unique()))

1550

In [45]:
df["clean_specie"]=df.Species.apply(cf.spec_only)

In [46]:
list(df["clean_specie"].unique())

['white shark',
 'unknown',
 'm shark',
 'tiger shark',
 'lemon shark',
 'bull shark',
 'reef shark',
 'nurse shark',
 'wobbegong shark',
 'blacktip shark',
 'a shark',
 'galapagos shark',
 '2m shark',
 'small shark',
 'blue shark',
 'cookiecutter shark',
 'spinner shark',
 'whitetip shark',
 'sandtiger shark',
 'no shark',
 'gill shark',
 '3m shark',
 'sevengill shark',
 'angel shark',
 'dogfish shark',
 'mako shark',
 'whaler shark',
 'silky shark',
 'juvenile shark',
 'hammerhead shark',
 'but shark',
 'foot shark',
 'raggedtooth shark',
 'goblin shark',
 'metre shark',
 'sandbar shark',
 'cow shark',
 '1m shark',
 'salmon shark',
 'porbeagle shark',
 'jackson shark',
 'zambesi shark',
 '30kg shark',
 'thresher shark',
 'whale shark',
 'kg shark',
 'cutter shark',
 'dusky shark',
 'smoothhound shark',
 'basking shark',
 'as shark',
 'sand shark',
 'same shark',
 'copper shark',
 '2 shark',
 'brown shark',
 'colored shark',
 'captive shark',
 'bonnethed shark',
 'finned shark',
 'sou

In [47]:
len(list(df["clean_specie"].unique()))

103

In [48]:
df.clean_specie.value_counts().head(20)

unknown              4096
white shark           634
tiger shark           260
bull shark            177
m shark               129
nurse shark            97
whaler shark           66
reef shark             64
blacktip shark         61
small shark            55
mako shark             53
wobbegong shark        46
spinner shark          44
hammerhead shark       44
raggedtooth shark      43
blue shark             39
lemon shark            37
zambesi shark          29
whitetip shark         23
no shark               23
Name: clean_specie, dtype: int64