# Data Wrangling -- Shark Attacks

## 1. Import all necessary data and tools

In [74]:
import pandas as pd
import regex as re

In [2]:
shark_attacks = pd.read_csv('GSAF5.csv', engine = 'python')


## 2. Understand the dataset

In [3]:
#Take a first peak at the date
shark_attacks.head(10)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,,
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,,
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,,
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,,
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,,
5,2016.09.15.R,15-Sep-16,2016,Boat,AUSTRALIA,Western Australia,Bunbury,Fishing,Occupant: Ben Stratton,,...,,"West Australian, 9/15/2016",2016.09.15.R-boat.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.15.R,2016.09.15.R,5988,,
6,2016.09.11,11-Sep-16,2016,Unprovoked,USA,Florida,"Ponte Vedra, St. Johns County",Wading,male,M,...,3' to 4' shark,"News4Jax, 9/11/2016",2016.09.11-PonteVedra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.11,2016.09.11,5987,,
7,2016.09.07,07-Sep-16,2016,Unprovoked,USA,Hawaii,"Makaha, Oahu",Swimming,female,F,...,"Tiger shark, 10?","Hawaii News Now, 9/7/2016",2016.09.07-Oahu.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.07,2016.09.07,5986,,
8,2016.09.06,06-Sep-16,2016,Unprovoked,NEW CALEDONIA,North Province,Koumac,Kite surfing,David Jewell,M,...,,"TVANouvelles, 9/6/2016",2016.09.06-Jewell.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.06,2016.09.06,5985,,
9,2016.09.05.b,05-Sep-16,2016,Unprovoked,USA,South Carolina,"Kingston Plantation, Myrtle Beach, Horry County",Boogie boarding,Rylie Williams,F,...,,"C. Creswell, GSAF",2016.09.05.b-Williams.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.05.b,2016.09.05.b,5984,,


In [4]:
shark_attacks.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [5]:
shark_attacks.size

143808

In [6]:
#Look at the shape of the data (rows & columns)
shark_attacks.shape

(5992, 24)

## 3. Data Wrangling

### Handle missing values

In [7]:
#Look at missing values
shark_attacks.isna()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,True
1,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,True
2,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,True
3,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,True
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5987,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,True
5988,False,False,False,False,False,False,True,False,False,False,...,True,False,False,False,False,False,False,False,True,True
5989,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,True
5990,False,False,False,False,False,True,False,True,False,False,...,True,False,False,False,False,False,False,False,True,True


In [8]:
#Find out where missing values occur
shark_attacks.isna().sum()

Case Number                  0
Date                         0
Year                         0
Type                         0
Country                     43
Area                       402
Location                   496
Activity                   527
Name                       200
Sex                        567
Age                       2681
Injury                      27
Fatal (Y/N)                 19
Time                      3213
Species                   2934
Investigator or Source      15
pdf                          0
href formula                 1
href                         3
Case Number.1                0
Case Number.2                0
original order               0
Unnamed: 22               5991
Unnamed: 23               5990
dtype: int64

In [9]:
#Store missing values in a variable
missing_values = shark_attacks.isna().sum()

In [10]:
#Access missing values only 
missing_values.loc[missing_values.gt(0)]

Country                     43
Area                       402
Location                   496
Activity                   527
Name                       200
Sex                        567
Age                       2681
Injury                      27
Fatal (Y/N)                 19
Time                      3213
Species                   2934
Investigator or Source      15
href formula                 1
href                         3
Unnamed: 22               5991
Unnamed: 23               5990
dtype: int64

In [11]:
#Find out percentage of missing values per relevant column 
missing_values.loc[missing_values.gt(0)]/len(shark_attacks)

Country                   0.007176
Area                      0.067089
Location                  0.082777
Activity                  0.087951
Name                      0.033378
Sex                       0.094626
Age                       0.447430
Injury                    0.004506
Fatal (Y/N)               0.003171
Time                      0.536215
Species                   0.489653
Investigator or Source    0.002503
href formula              0.000167
href                      0.000501
Unnamed: 22               0.999833
Unnamed: 23               0.999666
dtype: float64

In [12]:
#Store the ratios in a variable
missing_values_ratios = missing_values.loc[missing_values.gt(0)]/len(shark_attacks)

In [13]:
#Identify all columns with percentages over 60%
missing_values_ratios.loc[missing_values_ratios.gt(0.6)].index

Index(['Unnamed: 22', 'Unnamed: 23'], dtype='object')

In [14]:
#Double check what is inside the columns 'Unnamed: 22' and 'Unnamed: 23'
missing_values['Unnamed: 22']

5991

In [15]:
#Drop columsn with ratios over 60%, create a copy and store it in a variable
shark_attacks_copy = shark_attacks.drop(columns=missing_values_ratios.loc[missing_values_ratios.gt(0.6)].index).copy()


In [16]:
#Check the result
shark_attacks_copy.head(10)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,N,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,N,,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989
5,2016.09.15.R,15-Sep-16,2016,Boat,AUSTRALIA,Western Australia,Bunbury,Fishing,Occupant: Ben Stratton,,...,N,,,"West Australian, 9/15/2016",2016.09.15.R-boat.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.15.R,2016.09.15.R,5988
6,2016.09.11,11-Sep-16,2016,Unprovoked,USA,Florida,"Ponte Vedra, St. Johns County",Wading,male,M,...,N,15h15,3' to 4' shark,"News4Jax, 9/11/2016",2016.09.11-PonteVedra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.11,2016.09.11,5987
7,2016.09.07,07-Sep-16,2016,Unprovoked,USA,Hawaii,"Makaha, Oahu",Swimming,female,F,...,N,14h30,"Tiger shark, 10?","Hawaii News Now, 9/7/2016",2016.09.07-Oahu.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.07,2016.09.07,5986
8,2016.09.06,06-Sep-16,2016,Unprovoked,NEW CALEDONIA,North Province,Koumac,Kite surfing,David Jewell,M,...,Y,15h40,,"TVANouvelles, 9/6/2016",2016.09.06-Jewell.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.06,2016.09.06,5985
9,2016.09.05.b,05-Sep-16,2016,Unprovoked,USA,South Carolina,"Kingston Plantation, Myrtle Beach, Horry County",Boogie boarding,Rylie Williams,F,...,N,Late afternoon,,"C. Creswell, GSAF",2016.09.05.b-Williams.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.05.b,2016.09.05.b,5984


In [17]:
#Find out where still missing values occur
shark_attacks_copy.isna().sum()[shark_attacks_copy.isna().sum() > 0]

Country                     43
Area                       402
Location                   496
Activity                   527
Name                       200
Sex                        567
Age                       2681
Injury                      27
Fatal (Y/N)                 19
Time                      3213
Species                   2934
Investigator or Source      15
href formula                 1
href                         3
dtype: int64

In [18]:
#Sort it and store it in a variable
missing_values_2 = shark_attacks_copy.isna().sum()[shark_attacks_copy.isna().sum() > 0].sort_values(ascending = False)

In [19]:
missing_values_2

Time                      3213
Species                   2934
Age                       2681
Sex                        567
Activity                   527
Location                   496
Area                       402
Name                       200
Country                     43
Injury                      27
Fatal (Y/N)                 19
Investigator or Source      15
href                         3
href formula                 1
dtype: int64

In [20]:
#Rename column with special characters 

shark_attacks_copy.columns = shark_attacks_copy.columns.str.replace('(','').str.replace(')','').str.replace('/','_')

In [21]:
#Check new dataframe
shark_attacks_copy.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal Y_N,Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,N,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,N,,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989


In [65]:
#Check types in column time
shark_attacks_copy["Time"]

0       13h00
1       11h00
2       10h43
3         NaN
4         NaN
        ...  
5987      NaN
5988      NaN
5989      NaN
5990      NaN
5991      NaN
Name: Time, Length: 5992, dtype: object

In [70]:
shark_attacks_copy['Time'].dtypes

dtype('O')

In [77]:
shark_attacks_copy.isna()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal Y_N,Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,True,True,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5987,False,False,False,False,False,False,False,False,False,False,...,False,True,True,False,False,False,False,False,False,False
5988,False,False,False,False,False,False,True,False,False,False,...,False,True,True,False,False,False,False,False,False,False
5989,False,False,False,False,False,False,False,False,False,False,...,False,True,True,False,False,False,False,False,False,False
5990,False,False,False,False,False,True,False,True,False,False,...,False,True,True,False,False,False,False,False,False,False


In [84]:
shark_attacks_copy[shark_attacks_copy["Time"].notnull()]["Time"]

0                13h00
1                11h00
2                10h43
6                15h15
7                14h30
             ...      
5931         Afternoon
5932         Afternoon
5933         Afternoon
5949    Late afternoon
5955         Afternoon
Name: Time, Length: 2779, dtype: object

In [None]:
shark_attacks_copy[shark_attacks_copy["Time"].notnull()]["Time"].replace 

In [73]:
shark_attacks_copy["Time"].isna().count()

5992

In [40]:
#Investigate missing values in "Time" column (what to do with Fatal(Y/N)?)
relevant_columns = ["Time", "Date", "Fatal Y_N","Investigator or Source"]
check_time = shark_attacks_copy.loc[shark_attacks_copy["Time"].isna(), relevant_columns].head(30)

check_time.head(30)

Unnamed: 0,Time,Date,Fatal Y_N,Investigator or Source
3,,17-Sep-16,N,"The Age, 9/18/2016"
4,,16-Sep-16,N,"The Age, 9/16/2016"
5,,15-Sep-16,N,"West Australian, 9/15/2016"
11,,04-Sep-16,N,"Orlando Sentinel, 9/7/2016"
12,,01-Sep-16,N,"R. Collier, GSAF"
17,,07-Aug-16,N,"ABC, 8/11/2016"
18,,06-Aug-16,N,"SUP, 8/9/2015"
19,,04-Aug-16,N,"News 965, 8/5/2016"
21,,28-Jul-16,N,"Klassick, 7/28/2016"
22,,28-Jul-16,N,"ABC Australia, 7/28/2016"


In [54]:
#Find out the minimum date
check_time.Date.min()

'01-Sep-16'

In [55]:
#Find out the maximum date
check_time.Date.max()

'Reported 08-Jul-2016'

In [56]:
#Finding 1: Between 01 Sept and 08 Jul 2016 time was not recorded. 

In [60]:
check_time["count"] = 1
check_time.value_counts("Fatal Y_N")

AttributeError: 'DataFrame' object has no attribute 'value_counts'

In [61]:
#Check correlation of dates and time
times_copy = check_time.groupby(['Time'])

In [62]:
times_copy.head(30)

Unnamed: 0,Time,Date,Fatal Y_N,Investigator or Source,count
3,,17-Sep-16,N,"The Age, 9/18/2016",1
4,,16-Sep-16,N,"The Age, 9/16/2016",1
5,,15-Sep-16,N,"West Australian, 9/15/2016",1
11,,04-Sep-16,N,"Orlando Sentinel, 9/7/2016",1
12,,01-Sep-16,N,"R. Collier, GSAF",1
17,,07-Aug-16,N,"ABC, 8/11/2016",1
18,,06-Aug-16,N,"SUP, 8/9/2015",1
19,,04-Aug-16,N,"News 965, 8/5/2016",1
21,,28-Jul-16,N,"Klassick, 7/28/2016",1
22,,28-Jul-16,N,"ABC Australia, 7/28/2016",1


In [63]:
times_copy.count()

Unnamed: 0_level_0,Date,Fatal Y_N,Investigator or Source,count
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
