# Shark Attacks

### Importing libraries

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
os.listdir()

['shark_analysis.ipynb',
 'Untitled.ipynb',
 'attacks.csv',
 'README.md',
 '.gitattributes',
 '.ipynb_checkpoints',
 '.git']

### Reading and understanding database

In [3]:
db_attacks = pd.read_csv('attacks.csv', encoding='latin-1')

In [4]:
db_attacks.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,


In [5]:
db_attacks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
Case Number               8702 non-null object
Date                      6302 non-null object
Year                      6300 non-null float64
Type                      6298 non-null object
Country                   6252 non-null object
Area                      5847 non-null object
Location                  5762 non-null object
Activity                  5758 non-null object
Name                      6092 non-null object
Sex                       5737 non-null object
Age                       3471 non-null object
Injury                    6274 non-null object
Fatal (Y/N)               5763 non-null object
Time                      2948 non-null object
Species                   3464 non-null object
Investigator or Source    6285 non-null object
pdf                       6302 non-null object
href formula              6301 non-null object
href                      6302 non-null obje

### General Cleaning

#### Duplicates

In [8]:
db_attacks.drop_duplicates(keep = 'first', inplace = True)

#### Rows with less than 12 filled columns (50%)

In [12]:
db_attacks.dropna(axis = 0, thresh = 12, how = 'all', inplace = True)

In [13]:
db_attacks.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6302 entries, 0 to 6301
Data columns (total 24 columns):
Case Number               6301 non-null object
Date                      6302 non-null object
Year                      6300 non-null float64
Type                      6298 non-null object
Country                   6252 non-null object
Area                      5847 non-null object
Location                  5762 non-null object
Activity                  5758 non-null object
Name                      6092 non-null object
Sex                       5737 non-null object
Age                       3471 non-null object
Injury                    6274 non-null object
Fatal (Y/N)               5763 non-null object
Time                      2948 non-null object
Species                   3464 non-null object
Investigator or Source    6285 non-null object
pdf                       6302 non-null object
href formula              6301 non-null object
href                      6302 non-null object

#### Columns with less than 5 filled rows (0.07%)

In [19]:
db_attacks.dropna(axis = 1, thresh = 5, how = 'all', inplace = True)

### Saving backup file

In [21]:
data_bk = db_attacks.copy()

### Formulating questions

a) Which country has had the biggest number of incidents?
> Is the ranking maintained if filtered by fatal accidents?

b) Within the unprovoked type of incident, which activity suffered the most?
> What about provoked accidents?

c) Which shark species is most associated with attacks?

d) Do the attacks happen with more frequency at a specific time of the day?

#### a) Which country has had the biggest number of incidents?

Selecting necessary columns to answer question:

In [23]:
db_attacks.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order'],
      dtype='object')

To identify countries with the most occurrences, the following columns will be used:

> Case Number: to be used as ID column | Country: to identify the location

Filtering dataframe to select only chosen columns

In [28]:
attacks_by_country = db_attacks[['Case Number', 'Country']].groupby(by = 'Country', as_index = False).count()

In [30]:
attacks_by_country.head()

Unnamed: 0,Country,Case Number
0,PHILIPPINES,1
1,TONGA,3
2,ADMIRALTY ISLANDS,1
3,AFRICA,1
4,ALGERIA,1


In [36]:
attacks_by_country.value_counts()

AttributeError: 'DataFrame' object has no attribute 'value_counts'

In [24]:
data_25 = data_b.sort_values(by = 'Case Number', ascending = False).head(25)

In [25]:
#df['col_name'] = df['col_name'].str.replace('G', '1')
data_25['Country'] = data_25['Country'].str.replace('REUNION', 'REUNION ISLAND') 

In [26]:
# a) Within the unprovoked type of incident, which activity suffered the most?
# What about provoked accidents?
data_25 = data_25[['Country', 'Case Number']].reset_index()

In [27]:
data_25

Unnamed: 0,index,Country,Case Number
0,204,USA,2228
1,14,AUSTRALIA,1338
2,171,SOUTH AFRICA,579
3,145,PAPUA NEW GUINEA,134
4,127,NEW ZEALAND,128
5,23,BRAZIL,112
6,16,BAHAMAS,109
7,113,MEXICO,89
8,90,ITALY,71
9,61,FIJI,62
