# Importing Libraries

In [1]:
import pandas as pd
import re
import numpy as np
from IPython.display import display

# Reading CSV

In [2]:
shark_attack = pd.read_csv('data/attacks.csv', encoding='latin')
shark_attack


Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,...,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,,
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,...,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,,
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,...,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,,
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,...,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,...,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25718,,,,,,,,,,,...,,,,,,,,,,
25719,,,,,,,,,,,...,,,,,,,,,,
25720,,,,,,,,,,,...,,,,,,,,,,
25721,,,,,,,,,,,...,,,,,,,,,,


Drop columns: 'Unnamed: 22', 'Unnamed: 23', 'href formula', 'pdf' and duplicated rows

In [3]:
shark_attack = shark_attack.drop(columns=['Unnamed: 22', 'Unnamed: 23', 'href formula', 'pdf']).drop_duplicates()

Removing rows that have 18 or more null values:

In [4]:
shark_attack = shark_attack.loc[~(shark_attack.isnull().sum(axis=1) >= 18), :]

In [5]:
pd.set_option('display.max_columns', None)
np.set_printoptions(threshold=np.inf)
shark_attack.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,href,Case Number.1,Case Number.2,original order
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0


Removing 'Case Number.1' and 'Case Number.2' columns, because they are duplicated

In [6]:
shark_attack.drop(columns=['Case Number.1', 'Case Number.2'], inplace=True)


Removing 'Investigator or Source' column, because it won't be used for the analysis of the dataset.

In [7]:
shark_attack.drop(columns='Investigator or Source', inplace=True)

Removing 'original order' column because it won't be useful for the anlysis.

In [8]:
shark_attack.drop(columns='original order', inplace=True)

In [9]:
shark_attack.shape

(6302, 16)

# Categorizing body parts

In [10]:
def list_to_pattern(lst : list):
    '''
    The function receives a list of body part words and returns a regex pattern.
    
    Parameters:
    lst (list): List of body part words

    Returns:
    string: Returning regex pattern
    '''
    
    pattern = ''
    for word in lst:
        pattern += word + '|'
    pattern = pattern[:-1]
    return pattern

In [11]:
arm = ['hand', 'arm', 'forearm', 'wrist', 'shoulder', 'elbow', 'finger', 'thumb']
leg = ['ankle','knee','foot','feet','thigh','leg','calf','buttock','pelvis', 'shin', 'heel', 'toe']
head = ['head','neck','face', 'ear', 'nose', 'mouth']
torso = ['torso','chest', 'back', 'abdomen', 'hip']
fatal = ['fatal', 'death', 'kill']

Transforming the body parts lists to regular expressions:

In [12]:
arm_pattern = list_to_pattern(arm)
leg_pattern = list_to_pattern(leg)
head_pattern = list_to_pattern(head)
torso_pattern = list_to_pattern(torso)
fatal_pattern = list_to_pattern(fatal)

Creating new columns and setting the values to 0:

In [13]:
shark_attack['Arm'] = 0
shark_attack['Leg'] = 0
shark_attack['Head'] = 0
shark_attack['Torso'] = 0

In [14]:
l = -1

for injury in shark_attack['Injury']:
    a = f'{injury}'
    lst_arm_parts = re.findall(arm_pattern, a, flags=re.I)
    lst_leg_parts = re.findall(leg_pattern, a, flags=re.I)
    lst_head_parts = re.findall(head_pattern, a, flags=re.I)
    lst_torso_parts = re.findall(torso_pattern, a, flags=re.I)
    lst_fatal = re.findall(fatal_pattern, a, flags=re.I)
    l += 1
    if len(lst_arm_parts) > 0:
        shark_attack['Arm'].update(pd.Series([1], index=[l]))
    if len(lst_leg_parts) > 0:
        shark_attack['Leg'].update(pd.Series([1], index=[l]))
    if len(lst_head_parts) > 0:
        shark_attack['Head'].update(pd.Series([1], index=[l]))
    if len(lst_torso_parts) > 0:
        shark_attack['Torso'].update(pd.Series([1], index=[l]))

In [15]:
shark_attack.query('Arm == 0 and Leg == 0 and Head == 0 and Torso == 0')

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0
5,2018.06.03.b,03-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,"Flat Rock, Ballina",Kite surfing,Chris,M,,"No injury, board bitten",N,,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0
6,2018.06.03.a,03-Jun-2018,2018.0,Unprovoked,BRAZIL,Pernambuco,"Piedade Beach, Recife",Swimming,Jose Ernesto da Silva,M,18,FATAL,Y,Late afternoon,Tiger shark,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0
13,2018.05.13.a,13-May-2018,2018.0,Invalid,ENGLAND,Cornwall,Off Land's End,Fishing,Max Berryman,M,21,Injured by teeth of a dead porbeagle shark he ...,N,08h15,Invalid incident,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0
15,2018.05.12,12-May-2018,2018.0,Unprovoked,SOUTH AFRICA,Eastern Cape Province,"Nahoon Beach, East London",Surfing,male,M,,"Minor injury, marks on board",N,11h00,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6297,ND.0005,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,Roebuck Bay,Diving,male,M,,FATAL,Y,,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0
6298,ND.0004,Before 1903,0.0,Unprovoked,AUSTRALIA,Western Australia,,Pearl diving,Ahmun,M,,FATAL,Y,,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0
6299,ND.0003,1900-1905,0.0,Unprovoked,USA,North Carolina,Ocracoke Inlet,Swimming,Coast Guard personnel,M,,FATAL,Y,,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0
6300,ND.0002,1883-1889,0.0,Unprovoked,PANAMA,,"Panama Bay 8ºN, 79ºW",,Jules Patterson,M,,FATAL,Y,,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0


How many attacks in each body part?

In [16]:
print(f"Leg attacks: {shark_attack['Leg'].sum()}")
print(f"Arm attacks: {shark_attack['Arm'].sum()}")
print(f"Head attacks: {shark_attack['Head'].sum()}")
print(f"Torso attacks: {shark_attack['Torso'].sum()}")

Leg attacks: 2620
Arm attacks: 1169
Head attacks: 414
Torso attacks: 299


# Cleaning 'Sex' Column

In [17]:
shark_attack.rename(columns={'Sex ' : 'Sex'}, inplace=True)

Unique values for 'Sex':

In [18]:
shark_attack['Sex'].unique()

array(['F', 'M', nan, 'M ', 'lli', 'N', '.'], dtype=object)

In [19]:
shark_attack.query('Sex == "N"')

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso
4938,1934.07.11,11-Jul-1934,1934.0,Boating,AUSTRALIA,New South Wales,Cronulla,Fishing,"18' boat, occupants William & Leslie Newton",N,,No injury to occupants Sharks continually foll...,N,,"Blue pointer, 11'",http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0
6131,1801.12.18.R,Reported 18-Dec-1801,1801.0,Provoked,,,,Standing on landed shark's tail,Stephen Pettigew,N,,"FATAL, PROVOKED INCIDENT",Y,,12' shark,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0


According to the pdf with the informations about the incident, both people that had sex = 'N' was Male

In [20]:
shark_attack.loc[4938, 'Sex'] = 'M'
shark_attack.loc[6131, 'Sex'] = 'M'

In [21]:
shark_attack['Sex'].unique()

array(['F', 'M', nan, 'M ', 'lli', '.'], dtype=object)

In [22]:
shark_attack.query('Sex == "lli"')

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso
1624,2004.11.11.b,11-Nov-2004,2004.0,Unprovoked,USA,California,"Bunkers, Humboldt Bay, Eureka, Humboldt County",Surfing,Brian Kang,lli,38,"Lacerations to hand, knee & thigh",N,13h30,5.5 m [18'] white shark,http://sharkattackfile.net/spreadsheets/pdf_di...,1,1,0,0


According to the pdf with the informations about the incident, the person that had 'Sex' = 'lli' was Male

In [23]:
shark_attack.loc[1624, 'Sex'] = 'M'

In [24]:
shark_attack['Sex'].unique()

array(['F', 'M', nan, 'M ', '.'], dtype=object)

Removing spaces before and after 'M' or 'F'

In [25]:
shark_attack.loc[~shark_attack['Sex'].isna(), 'Sex'] = shark_attack['Sex'].str.strip()

In [26]:
shark_attack['Sex'].unique()

array(['F', 'M', nan, '.'], dtype=object)

In [27]:
shark_attack.query('Sex == "."')

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso
5437,1908.06.02.R,Reported 02-Jun-1908,1908.0,Sea Disaster,PAPUA NEW GUINEA,New Britain,Matupi,.,,.,,"Remains of 3 humans recovered from shark, but ...",Y,,Allegedly a 33-foot shark,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0


In [28]:
shark_attack.loc[5437, 'Sex'] = np.nan

In [29]:
shark_attack['Sex'].unique()

array(['F', 'M', nan], dtype=object)

In [30]:
shark_attack.loc[shark_attack['Sex'].isna(), :]

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso
32,2018.04.09,09-Apr-2018,2018.0,Unprovoked,NEW CALEDONIA,,"Magenta Beach, Noumea",Windsurfing,,,,"No injury, shark bit board",N,17h00,2 m shark,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0
59,2017.11.25.R,Reported 25-Nov-2017,2017.0,Sea Disaster,LIBYA,,Gars Garabulli,2 boats capsized,31 migrants,,,FATAL,Y,,Some drowned but other may have been killed by...,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0
86,2017.09.14,Sep-2017,2017.0,Boating,AUSTRALIA,Westerm Australia,Esperance,Fishing,,,,"sharks rammed boats, no injury to occupants",N,,"White shark, 3.5m",http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0
124,2017.07.02,02-Jul-2017,2017.0,Invalid,COMOROS,Anjouan,Moya,Fishing,,,,"Skull found in shark, a probable drowning & sc...",,,Shark involvement prior to death not confirmed,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0
154,2017.04.17.b,17-Apr-2017,2017.0,Unprovoked,USA,Florida,"Daytona Beach, Volusia County",,,,,Minor bite to the foot,N,Afternoon,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6212,ND-0107,Before 2004,0.0,Boat,MOZAMBIQUE,Inhambane Province,Off Inhambane,Fishing,"4.8-metre skiboat, Occupants: Rod Salm & 4 fri...",,,"No injury to occupants, shark bumped boat",N,,Whale shark,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0
6237,ND.0073,"No date, Before 1963",0.0,Unprovoked,SINGAPORE,,"Keppel Harbor, 2 miles from Singapore city ce...",Swimming,,,,Recovered,N,,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0
6259,ND.0044,1941-1945,0.0,Sea Disaster,,,,A group of survivors on a raft for 17-days,C.,,,"FATAL, shark leapt into raft and bit the man w...",Y,Late afternoon,1.2 m [4'] shark,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0
6278,ND.0024,Between 1918 & 1939,0.0,Unprovoked,REUNION,Saint-Denis,Barachois,Swimming,,,,FATAL,Y,,,http://sharkattackfile.net/spreadsheets/pdf_di...,0,0,0,0


In [32]:
shark_attack.shape


(6302, 20)

# Cleaning 'Fatal (Y/N)' column

#### Next, we start cleaning the column that says whether the injury was fatal or not

In [33]:
shark_attack['Fatal (Y/N)'].value_counts()

N          4293
Y          1388
UNKNOWN      71
 N            7
N             1
2017          1
M             1
y             1
Name: Fatal (Y/N), dtype: int64

In [35]:
shark_attack.replace(to_replace =' ?N ?', value = 'N', inplace=True, regex = True) # Remove espaces.
shark_attack.replace(to_replace =' ?[NnF]', value = 'N', inplace=True, regex = True) # Adjust the values
shark_attack.replace(to_replace ='y', value = 'Y', inplace=True, regex = True) # Turn it into capital 'Y'

In [36]:
shark_attack['Fatal (Y/N)'].value_counts()

N          4301
Y          1389
UNKNOWN      71
2017          1
M             1
Name: Fatal (Y/N), dtype: int64

#### Later, we look for the values that differ from Y, N or UNKNOWN

In [38]:
shark_attack.loc[shark_attack['Fatal (Y/N)'] == '2017']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso
786,2012.06.10,10-JuN-2012,2012.0,Provoked,ITALY,SardiNia,Muravera,AttemptiNg to rescue aN iNjured & beached shark,Giorgio Zara,M,57,Lower left leg iNjured PROVOKED ACCIDENT,2017,MorNiNg,"Blue shark, 2.5m",http://sharkattackfile.Net/spreadsheets/pdf_di...,0,1,0,0


#### Lower left leg injured PROVOKED ACCIDENT, It was not fatal. Therefore we switch it to N	

In [39]:
shark_attack.replace(to_replace ='2017', value = 'N', inplace=True, regex = True) 

In [40]:
shark_attack.loc[shark_attack['Fatal (Y/N)'] == 'M']

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso
285,2016.04.18.b,18-Apr-2016,2016.0,Provoked,NRENCH POLYNESIA,Tuamotos,Makemo Atoll,SpearfishiNg,Hoata Iotua,M,22,LaceratioN to kNee bY speared shark PROVOKED I...,M,MorNiNg,"GreY reef shark, 2 m",http://sharkattackfile.Net/spreadsheets/pdf_di...,0,1,1,0


#### Laceration to knee by speared shark PROVOKED INCIDENT. It was not fatal. Therefore we switch it to N

In [42]:
shark_attack.replace(to_replace ='M', value = 'N', inplace=True, regex = True) 

In [43]:
shark_attack['Fatal (Y/N)'].value_counts()

N          4303
Y          1389
UNKNOWN      71
Name: Fatal (Y/N), dtype: int64

# Cleaning 'Year' column

#### Next, we start cleaning the Year column.

In [167]:
shark_attack['Year'].value_counts().sum(axis=0)

6300

In [138]:
#mask = type(shark_attack['Year']) == float64
shark_attack['Year'] =shark_att ack['Year'].astype(np.int64)
#mask.tail(60)
#np.where(shark_attack['Year'] == np.float64, shark_attack['Year'].astype(np.int64), shark_attack['Year'])
shark_attack.loc[shark_attack['Year'] == 0].head(60)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,href,Arm,Leg,Head,Torso
6177,0000.0214,Ca. 214 B.C.,0.0,UNprovoked,,IoNiaN Sea,,AsceNdiNg from a dive,"TharsYs, a spoNge diver",N,,"NATAL, shark/s bit him iN two",Y,,,http://sharkattackfile.Net/spreadsheets/pdf_di...,0,0,0,0
6178,0000.0336,Ca. 336.B.C..,0.0,UNprovoked,GREECE,Piraeus,IN the haveN of CaNtharus,WashiNg his pig iN preparatioN for a religious...,A caNdidate for iNitiatioN,N,,"NATAL, shark ""bit off all lower parts of him u...",Y,,,http://sharkattackfile.Net/spreadsheets/pdf_di...,0,0,0,0
6179,0000.0493,493 B.C.,0.0,Sea Disaster,GREECE,Off ThessalY,,Shipwrecked PersiaNNleet,males,N,,Herodotus tells of sharks attackiNg meN iN the...,Y,,,http://sharkattackfile.Net/spreadsheets/pdf_di...,0,0,0,0
6180,0000.0725,Ca. 725 B.C.,0.0,Sea Disaster,ITALY,TYrrheNiaN Sea,Krater fouNd duriNg excavatioNs at Lacco AmeNo...,Shipwreck,males,N,,Depicts shipwrecked sailors attacked bY a sha...,Y,,,http://sharkattackfile.Net/spreadsheets/pdf_di...,0,0,0,1
6181,ND-0154,Before 1939,0.0,UNprovoked,CANADA,,GraNd BaNks,NishiNg,JoeNolsom,N,,Arm bitteN,N,,,http://sharkattackfile.Net/spreadsheets/pdf_di...,1,0,0,0
6182,ND-0153,1990 or 1991,0.0,UNprovoked,KENYA,Nombasa,KiliNdiNi,DiviNg,CoNwaY Plough & Dr. JoNathaN Higgs,N,,CoNwaY's leg was bitteN Higgs iNjurY wasNATAL,N,,,http://sharkattackfile.Net/spreadsheets/pdf_di...,0,1,0,0
6183,ND-0152,Before 2016,0.0,UNprovoked,KENYA,Nombasa,KiliNdiNi,DiviNg,HamisiNjeNga,N,,NATAL,Y,,,http://sharkattackfile.Net/spreadsheets/pdf_di...,0,0,0,0
6184,ND-0151,Before Oct-2009,0.0,UNprovoked,PANANA,Bocas del Toro ProviNce,RedNrog Beach,SwimmiNg/,male,N,20.0,NATAL,Y,,,http://sharkattackfile.Net/spreadsheets/pdf_di...,0,0,0,0
6185,ND-0150,Before 1934,0.0,UNprovoked,URUGUAY,Rocha,"Isla Chica, La Paloma",SwimmiNg,,,,Noot bitteN,N,,,http://sharkattackfile.Net/spreadsheets/pdf_di...,0,1,0,0
6186,ND-0149,Before 1934,0.0,UNprovoked,URUGUAY,Rocha,"PlaYa del Barco, La Pedrera",SwimmiNg,Naciello,N,,NATAL,Y,,,http://sharkattackfile.Net/spreadsheets/pdf_di...,0,0,0,0


In [161]:
shark_attack.Year.loc[shark_attack['Year'] != ]

0       2018.0
1       2018.0
2       2018.0
3       2018.0
4       2018.0
         ...  
6297       0.0
6298       0.0
6299       0.0
6300       0.0
6301       0.0
Name: Year, Length: 6302, dtype: float64

In [174]:
shark_attack['Year'].quantile(np.arange(0,1.1, 0.1))

0.0       0.0
0.1    1893.0
0.2    1930.0
0.3    1952.0
0.4    1962.0
0.5    1977.0
0.6    1992.0
0.7    2001.0
0.8    2008.0
0.9    2013.0
1.0    2018.0
Name: Year, dtype: float64

In [199]:
shark_attack['Year'].loc[shark_attack['Year'].isna()]


187    NaN
6079   NaN
Name: Year, dtype: float64

In [202]:
shark_attack.loc[187, :]

Case Number                                            N.01.08.R
Date                                           Reported 08-JaN-N
Year                                                         NaN
Type                                                     INvalid
Country                                                AUSTRALIA
Area                                                  QueeNslaNd
Location                                                     NaN
Activity                                            SpearfishiNg
Name                                                KerrY DaNiel
Sex                                                            N
Age                                                           35
Injury                    No attack, shark made a threat displaY
Fatal (Y/N)                                                  NaN
Time                                                         NaN
Species                                               Bull shark
href           http://sha

In [203]:
shark_attack['Year'].fillna(2017, inplace=True) # According to the report, the shark threatened the victim

In [205]:
shark_attack.loc[6079, :]


Case Number                                         1836.08.19.R
Date                                        Reported 19-Aug-1836
Year                                                        2017
Type                                                  UNprovoked
Country                                                  ENGLAND
Area                                                  CumberlaNd
Location                                              WhitehaveN
Activity                                                SwimmiNg
Name                                                       a boY
Sex                                                            N
Age                                                          NaN
Injury                                                     NATAL
Fatal (Y/N)                                                    Y
Time                                                         NaN
Species                                                      NaN
href           http://sha

In [206]:
shark_attack['Year'].replace(to_replace=2017, value=1836, inplace=True)  # According to the report, it happened in 1836

In [207]:
shark_attack.loc[6079, :]

Case Number                                         1836.08.19.R
Date                                        Reported 19-Aug-1836
Year                                                        1836
Type                                                  UNprovoked
Country                                                  ENGLAND
Area                                                  CumberlaNd
Location                                              WhitehaveN
Activity                                                SwimmiNg
Name                                                       a boY
Sex                                                            N
Age                                                          NaN
Injury                                                     NATAL
Fatal (Y/N)                                                    Y
Time                                                         NaN
Species                                                      NaN
href           http://sha

In [211]:
shark_attack['Year'].value_counts().sum()


6302

In [214]:
shark_attack['Year'].loc[shark_attack['Year']==0]


6177    0.0
6178    0.0
6179    0.0
6180    0.0
6181    0.0
       ... 
6297    0.0
6298    0.0
6299    0.0
6300    0.0
6301    0.0
Name: Year, Length: 125, dtype: float64

In [216]:
shark_attack.loc[6177, :]

Case Number                                            0000.0214
Date                                                Ca. 214 B.C.
Year                                                           0
Type                                                  UNprovoked
Country                                                      NaN
Area                                                  IoNiaN Sea
Location                                                     NaN
Activity                                   AsceNdiNg from a dive
Name                                     TharsYs, a spoNge diver
Sex                                                            N
Age                                                          NaN
Injury                             NATAL, shark/s bit him iN two
Fatal (Y/N)                                                    Y
Time                                                         NaN
Species                                                      NaN
href           http://sha

#### Transform the values in column 'Year' into integers

In [220]:
shark_attack['Year'] = shark_attack['Year'].astype(np.int64) 
