In [723]:
import pandas as pd
import SRC.function as F
import re
df = pd.read_csv("INPUT/GSAF5.csv", encoding = "cp1252")
#df.head()


In [724]:
# Remove columns with NaN values for all or almost all elements

null_cols = df.isnull().sum()
dim = df.shape
print("\nTotal number of values: ",dim[0])
print("\nTotal count of NaN values: \n\n",null_cols)

# we remove columns with at least 90% of NaN

drop_cols = list(null_cols[null_cols > df.shape[0]*0.9].index)
print("\nColumns to drop: \n",*drop_cols)
df = df.drop(drop_cols,axis=1)



Total number of values:  5992

Total count of NaN values: 

 Case Number                  0
Date                         0
Year                         0
Type                         0
Country                     43
Area                       402
Location                   496
Activity                   527
Name                       200
Sex                        567
Age                       2681
Injury                      27
Fatal (Y/N)                 19
Time                      3213
Species                   2934
Investigator or Source      15
pdf                          0
href formula                 1
href                         3
Case Number.1                0
Case Number.2                0
original order               0
Unnamed: 22               5991
Unnamed: 23               5990
dtype: int64

Columns to drop: 
 Unnamed: 22 Unnamed: 23


In [725]:
# the next step is to remove similar columns
# we can also delete columns "Case Number.1" and "Case Number.2", as they are very similar to "Case Number" column
                             
drop1 = sum(df["Case Number"]==df["Case Number.1"])
drop2 = sum(df["Case Number"]==df["Case Number.2"])
                           
print("\nCommon elements between Case Number and Case Number.1: {} ({}%)".format(drop1,round(drop1*100/dim[0],2)))
print("Common elements between Case Number and Case Number.2: {} ({}%)".format(drop2,round(drop2*100/dim[0],2)))

df = df.drop(["Case Number.1","Case Number.2"],axis=1)

# because "href" and "href formula" are the same, one of them can also be removed

drop3 = sum(df["href"]==df["href formula"])
                           
print("Common elements between href and href formula: {} ({}%)".format(drop3,round(drop3*100/dim[0],2)))
df = df.drop(["href formula"], axis=1)


Common elements between Case Number and Case Number.1: 5979 (99.78%)
Common elements between Case Number and Case Number.2: 5990 (99.97%)
Common elements between href and href formula: 5938 (99.1%)


In [726]:
# we can check if there are any rows with a high amount of NaN

df_rows = df.T
null_rows = df_rows.isnull().sum()
print(max(null_rows))

print("no rows can be removed as they all provide a significant amount of information")

#df.head()

9
no rows can be removed as they all provide a significant amount of information


In [727]:
# we must check if there are any duplicate rows 

duplicates = df[df.duplicated()]
print(len(duplicates))

0


In [728]:
# First we check all the columns and the data type of each one of them

print(df.dtypes)
df = df.astype(str)
df = df.fillna("0")


Case Number               object
Date                      object
Year                       int64
Type                      object
Country                   object
Area                      object
Location                  object
Activity                  object
Name                      object
Sex                       object
Age                       object
Injury                    object
Fatal (Y/N)               object
Time                      object
Species                   object
Investigator or Source    object
pdf                       object
href                      object
original order             int64
dtype: object


In [729]:
####################################################### DATE #########################################################
date = df["Date"]
year = df["Year"]
for y in range(0,len(year)):
    if re.search("\d\d\d\d",year[y])==None:
        df["Year"][y] = "Unknown"
        
    
for x in range(0,len(date)):
    if re.search("(?i)\d\d-\w\w\w-\d\d\d\d",date[x]):
        if re.search("(?i)Reported\s\d\d-\w\w\w-\d\d\d\d",date[x]):
            df["Date"][x] = re.findall("(?i)\d\d-\w\w\w-\d\d\d\d",date[x])
    else:
        if re.search("(?i)\d\d-\w\w\w-\d\d",date[x]):
            df["Date"][x] = date[x][0:2]+"-"+date[x][3:6]+"-"+year[x]
        elif re.search("(?i)\s?\w\w\w-\d\d$",date[x]):
            if year[x]!="Unknown":
                df["Date"][x] = "XX-"+date[x][0:3]+"-"+year[x]
            else:
                df["Date"][x] = "Unknown"
        elif re.search("(?i)^\d\d\d\d$",date[x]):
            df["Date"][x] = "DD-MMMM-"+date[x]
        else: 
            if year[x]!="Unknown":
                df["Date"][x] = "XX-MMM-"+year[x]
            else:
                df["Date"][x] = "Unknown"
            
    


In [730]:
##################################################### CASE NUMBER ########################################################

date = list(df["Case Number"])
n = 0
for i in range(0,len(date)):
    date[i] = re.sub("\.","/",date[i])
    date[i] = date[i][0:10]
    df["Case Number"][i] = date[i]    
for i in range(0,len(date)):   
    if date[i][5:10] == "00/00":
        df["Case Number"][i] = date[i][0:4]
    elif date[i][0:2]=="ND":
        n += 1
        df["Case Number"][i] = "Unidentified #{}".format(n)

        
# we set an ID for each case also print date and reorder the data according to original order column, which can be removed
for c in range(0,len(date)):
    df["Case Number"][c] = str(c)  

df = df.drop(["original order"], axis=1)
df = df.rename(columns=({"Case Number":"Case Id"}))

In [731]:
######################################################## TYPE ########################################################

F.renameF(df,"Type","Boat","Unprovoked")
F.renameF(df,"Type","Boating","Unprovoked")
F.renameF(df,"Type","Invalid","Unknown")
F.renameF(df,"Type","Sea Disaster","Unprovoked")

print(df["Type"].value_counts())


Unprovoked    4916
Provoked       557
Unknown        519
Name: Type, dtype: int64


In [732]:
#################################################### ACTIVITY ########################################################

e = df["Activity"]

kite = e.apply(F.findF,args=("kite|board|kayak|cano|sail|wake|row",))
dive = e.apply(F.findF,args=("div",))
swim = e.apply(F.findF,args=("swim|wad|snork|water|float|bath",))
fish = e.apply(F.findF,args=("fish|collect",))
surf = e.apply(F.findF,args=("surf",))

for i in range(len(e)):
    if kite[i]!=None:
        df["Activity"][i] = "Water sports"
    elif dive[i]!=None:
        df["Activity"][i] = "Diving"
    elif swim[i]!=None:
        df["Activity"][i] = "Swimming"
    elif fish[i]!=None:
        df["Activity"][i] = "Fishing"
    elif surf[i]!=None:
        df["Activity"][i] = "Surfing"
    else:
        df["Activity"][i] = "Unknown"
    


print(df["Activity"].unique())
df.head()

['Surfing' 'Fishing' 'Swimming' 'Water sports' 'Unknown' 'Diving']


Unnamed: 0,Case Id,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href
0,0,18-Sep-2016,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16.0,Minor injury to thigh,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
1,1,18-Sep-2016,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,36.0,Lacerations to hands,N,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
2,2,18-Sep-2016,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,43.0,Lacerations to lower leg,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
3,3,17-Sep-2016,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,,Struck by fin on chest & leg,N,,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...
4,4,16-Sep-2016,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,,No injury: Knocked off board by shark,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...


In [733]:
country = df["Country"]
for x in range(len(country)):
    if country[x] == "Nan":
        df["Country"] = "Unknown"
    if re.search("(?i)\w\?",country[x]):
        country[x] = re.sub("\?","",country[x])
        df["Country"][x] = country[x].title()
    if country[x]!="USA" and country[x] != "UNITED ARAB EMIRATES (UAE)":
        df["Country"][x] = country[x].title()
    

In [734]:
names = df["Name"].tolist()
for n in range(len(names)):
    names[n] = names[n].strip()
    if re.search("male*",names[n]) or re.search("boy.*",names[n]) or re.search("nan$",names[n]):
        df["Name"][n] = "Unknown"
    elif re.search("Occupant:+",names[n]):
        names[n] = re.sub("Occupant: ","",names[n])
        names[n] = re.findall("[A-Z][a-z]+\s[A-Z][a-z]+",names[n])
        df["Name"][n] = names[n]
    else:
        df["Name"][n] = "Unknown"

In [735]:
######################################################### SURVIVAL ###################################################

survival = df["Fatal (Y/N)"]
serie = F.Fatality(survival)
            


In [736]:
################################################### TIME #############################################################

#check unique values to unify time information

for t in range(0,len(df["Time"])):
    time = df["Time"][t]
#change values in %%h%%m format
    if re.search("\d\d+",time):
        split = (re.split("h",time))
        if split[0] < "12":
            df["Time"][t] = "Morning"
        elif split[0]>"12" and split[0]<"16":
            df["Time"][t] = "Afternoon"
        elif split[0]>"16" and split[0]<"20":
            df["Time"][t]="Evening"
        else:
            df["Time"][t] = "Night"    
#change other formats
    Morning = ["Dawn","AM","A.M.","daybreak","morning"]
    Afternoon = ["After","Midday","noon","lunch","Daytime"]
    Evening = ["P.M.","PM","Evening","dusk","sunset","sundown"]
    Night = ["night","Dark"]
    for m in Morning:
        F.find_Time(df,"Time",time,t,m,"Morning")
    for a in Afternoon:
        F.find_Time(df,"Time",time,t,a,"Afternoon")
    for e in Evening:
        F.find_Time(df,"Time",time,t,e,"Evening")        
    for n in Night:
        F.find_Time(df,"Time",time,t,n,"Night")                             
                        
    unknown = ["x","0","--","\xa0","  ","   "," "]
    for u in unknown:
        if re.match("(?i)"+u,time):
            F.find_Time(df,"Time",time,t,u,"Unknown")

    



In [737]:
################################################## SPECIES ###########################################################
# this column can be removed due to low amount of information
print("\n",df["Species "].value_counts())
df = df.drop("Species ",axis=1)
# pdf column can be removed because of its unusefullnes
df = df.drop("pdf",axis=1)
# age column can also be removed due to the lack of information
print("\n",df["Age"].value_counts())
df = df.drop("Age",axis=1)




 nan                                                                     2934
White shark                                                              161
Shark involvement not confirmed                                           80
Tiger shark                                                               68
Bull shark                                                                52
                                                                        ... 
Blue pointer, 16'                                                          1
White shark, 3 m [10']; identifed by Dr. W.I. Follett on tooth marks       1
Zambesi shark, 4'9"                                                        1
Thought to involve a cookie cutter shark                                   1
Blue or porbeagle shark                                                    1
Name: Species , Length: 1539, dtype: int64

 nan         2681
17           148
18           145
19           138
20           136
            ... 
87  

In [738]:
sex = df["Sex "]
sex.value_counts()
for s in range(0,len(sex)):
    if re.search("M ",sex[s]):
        df["Sex "][s] = "M"
    elif re.search("[(nan)(lli)(.)(N)]",sex[s]):
        df["Sex "][s] = "Unknown"
        

In [739]:
area = df["Area"]
location = df["Location"]
for a in range(len(area)):
    if re.search("nan*",area[a]):
        df["Area"][a] = "Unknown"
    if re.search("nan*",location[a]):
        df["Location"][a] = "Unknown"


In [700]:
df = df[["Country", "Area", "Year","Case Id","Type","Fatal (Y/N)","Time", "Sex ","Name","Activity","Injury","Location","Investigator or Source","href"]]
df.set_index(["Country", "Area", "Year"], inplace = True, drop = True)



In [None]:
# buscar pais, area año hpra del dia con mas ataques y mortalidad

In [673]:
df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Case Id,Type,Fatal (Y/N),Time,Sex,Name,Activity,Injury,Location,Investigator or Source,href
Country,Area,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
USA,Florida,2016,0,Unprovoked,N,Afternoon,M,Unknown,Surfing,Minor injury to thigh,Unknown,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...
USA,Florida,2016,1,Unprovoked,N,Morning,M,Unknown,Surfing,Lacerations to hands,Unknown,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...
USA,Florida,2016,2,Unprovoked,N,Morning,M,Unknown,Surfing,Lacerations to lower leg,Unknown,"Orlando Sentinel, 9/19/2016",http://sharkattackfile.net/spreadsheets/pdf_di...
Australia,Victoria,2016,3,Unprovoked,N,,M,Unknown,Surfing,Struck by fin on chest & leg,Thirteenth Beach,"The Age, 9/18/2016",http://sharkattackfile.net/spreadsheets/pdf_di...
Australia,Victoria,2016,4,Unprovoked,N,,M,Unknown,Surfing,No injury: Knocked off board by shark,Bells Beach,"The Age, 9/16/2016",http://sharkattackfile.net/spreadsheets/pdf_di...
Australia,Western Australia,2016,5,Unprovoked,N,,Unknown,[Ben Stratton],Fishing,Shark rammed boat. No injury to occupant,Bunbury,"West Australian, 9/15/2016",http://sharkattackfile.net/spreadsheets/pdf_di...
USA,Florida,2016,6,Unprovoked,N,Afternoon,M,Unknown,Swimming,Minor injury to arm,"Ponte Vedra, St. Johns County","News4Jax, 9/11/2016",http://sharkattackfile.net/spreadsheets/pdf_di...
USA,Hawaii,2016,7,Unprovoked,N,Afternoon,F,Unknown,Swimming,Severe lacerations to shoulder & forearm,"Makaha, Oahu","Hawaii News Now, 9/7/2016",http://sharkattackfile.net/spreadsheets/pdf_di...
New Caledonia,North Province,2016,8,Unprovoked,Y,Afternoon,M,Unknown,Water sports,FATAL,Koumac,"TVANouvelles, 9/6/2016",http://sharkattackfile.net/spreadsheets/pdf_di...
USA,Unknown,2016,9,Unprovoked,N,Afternoon,F,Unknown,Water sports,Lacerations & punctures to lower right leg,"Kingston Plantation, Myrtle Beach, Horry County","C. Creswell, GSAF",http://sharkattackfile.net/spreadsheets/pdf_di...
