## The hypothesis on which the project is based is as follows: 
## "The highest proportion of attacks in the last century has taken place in the United States but the fatality level is lower""

In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib as plt

In [2]:
sharks_attacks = pd.read_csv("../data/attacks.csv", encoding = "ISO-8859-1")

In [3]:
sharks_attacks.shape

(25723, 24)

In [4]:
sharks_attacks.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


###  As first step, we start deleting all the columns that are not useful for us.

In [5]:
sharks_attacks = sharks_attacks.drop(['Type','Investigator or Source','Time','pdf', 'href formula', 'href','Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22', 'Unnamed: 23' ], axis=1)

In [6]:
sharks_attacks.columns

Index(['Case Number', 'Date', 'Year', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Species '],
      dtype='object')

In [7]:
sharks_attacks.shape

(25723, 13)

### Renaming two columns

In [8]:
sharks_attacks = sharks_attacks.rename(columns={'Fatal (Y/N)': 'Fatal'})

In [9]:
sharks_attacks = sharks_attacks.rename(columns={'Species ': 'Species'})

### Next, we eliminate all cells with null information related to species and attack fatality, by using "dropna"

In [10]:
attacks = sharks_attacks.dropna(subset = ['Fatal'])
attacks.shape

(5763, 13)

### Now we clean the "Fatal (Y/N)" column

In [11]:
attacks.loc[attacks["Fatal"] == "Y", "Fatal"] = "Y"
attacks.loc[attacks["Fatal"] == "y", "Fatal"] = "Y"
attacks.loc[attacks["Fatal"] == "F", "Fatal"] = "Y"
attacks.loc[attacks["Fatal"] == " N", "Fatal"] = "N"
attacks.loc[attacks["Fatal"] == "N ", "Fatal"] = "N"
attacks.loc[attacks["Fatal"] == "2017", "Fatal"] = "UNKNOWN"
attacks.loc[attacks["Fatal"] == "M", "Fatal"] = "UNKNOWN"
attacks["Fatal"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


N          4301
Y          1389
UNKNOWN      73
Name: Fatal, dtype: int64

### To continue cleaning the data, we deine a function that help us to filter between USA or other countries.


In [12]:
attacks.loc[attacks["Country"] == "USA", "Country"] = "USA"
attacks.loc[attacks["Country"] != "USA", "Country"] = "Other"
attacks["Country"].value_counts()

Other    3722
USA      2041
Name: Country, dtype: int64

In [13]:
def contains (country):
    if country == "USA":
        return "USA" 
    else:
        return "other" 

In [14]:
attacks ["Country"] = attacks["Country"].apply(contains)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attacks ["Country"] = attacks["Country"].apply(contains)


### We clean the year information.

In [15]:
attacks = attacks[attacks["Year"] >= 1900]

### As final step, we reset the index

In [16]:
final_attacks = attacks.reset_index(drop=True)

In [17]:
final_attacks.head()

Unnamed: 0,Case Number,Date,Year,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal,Species
0,2018.06.25,25-Jun-2018,2018.0,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,White shark
1,2018.06.18,18-Jun-2018,2018.0,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,
2,2018.06.09,09-Jun-2018,2018.0,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,
3,2018.06.08,08-Jun-2018,2018.0,other,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,2 m shark
4,2018.06.04,04-Jun-2018,2018.0,other,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,"Tiger shark, 3m"


#### Finally, we export our cleaned data to used in other files.

In [18]:
final_attacks.to_csv("../output/final_attacks_output.csv", index = False)