In [286]:
import pandas as pd
import SRC.function as F
import re
df = pd.read_csv("INPUT/GSAF5.csv", encoding = "cp1252")
df.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,,
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,,
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,,
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,,
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,,


In [287]:
# First we check all the columns and the data type of each one of them

print(df.dtypes)

Case Number               object
Date                      object
Year                       int64
Type                      object
Country                   object
Area                      object
Location                  object
Activity                  object
Name                      object
Sex                       object
Age                       object
Injury                    object
Fatal (Y/N)               object
Time                      object
Species                   object
Investigator or Source    object
pdf                       object
href formula              object
href                      object
Case Number.1             object
Case Number.2             object
original order             int64
Unnamed: 22               object
Unnamed: 23               object
dtype: object


In [288]:
# Remove columns with NaN values for all or almost all elements

null_cols = df.isnull().sum()
dim = df.shape
print("\nTotal number of values: ",dim[0])
print("\nTotal count of NaN values: \n\n",null_cols)

# we remove columns with at least 90% of NaN

drop_cols = list(null_cols[null_cols > df.shape[0]*0.9].index)
print("\nColumns to drop: \n",*drop_cols)
df = df.drop(drop_cols,axis=1)



Total number of values:  5992

Total count of NaN values: 

 Case Number                  0
Date                         0
Year                         0
Type                         0
Country                     43
Area                       402
Location                   496
Activity                   527
Name                       200
Sex                        567
Age                       2681
Injury                      27
Fatal (Y/N)                 19
Time                      3213
Species                   2934
Investigator or Source      15
pdf                          0
href formula                 1
href                         3
Case Number.1                0
Case Number.2                0
original order               0
Unnamed: 22               5991
Unnamed: 23               5990
dtype: int64

Columns to drop: 
 Unnamed: 22 Unnamed: 23


In [289]:
# the next step is to remove similar columns
# we can also delete columns "Case Number.1" and "Case Number.2", as they are very similar to "Case Number" column
                             
drop1 = sum(df["Case Number"]==df["Case Number.1"])
drop2 = sum(df["Case Number"]==df["Case Number.2"])
                           
print("\nCommon elements between Case Number and Case Number.1: {} ({}%)".format(drop1,round(drop1*100/dim[0],2)))
print("Common elements between Case Number and Case Number.2: {} ({}%)".format(drop2,round(drop2*100/dim[0],2)))

df_new = df.drop(["Case Number.1","Case Number.2"],axis=1)

# because "href" and "href formula" are the same, one of them can also be removed

drop3 = sum(df["href"]==df["href formula"])
                           
print("Common elements between href and href formula: {} ({}%)".format(drop3,round(drop3*100/dim[0],2)))
df_new = df_new.drop(["href formula"], axis=1)


Common elements between Case Number and Case Number.1: 5979 (99.78%)
Common elements between Case Number and Case Number.2: 5990 (99.97%)
Common elements between href and href formula: 5938 (99.1%)


In [290]:
# we must check if there are any duplicate rows 

duplicates = df_new[df_new.duplicated()]
print(len(duplicates))

0


In [291]:
# we can check if there are any rows with a high amount of NaN

df_rows = df_new.T
null_rows = df_rows.isnull().sum()
print(max(null_rows))

print("no rows can be removed as they all provide a significant amount of information")

df_new.head(303)

9
no rows can be removed as they all provide a significant amount of information


Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href,original order
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16,Minor injury to thigh,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5993
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,36,Lacerations to hands,N,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5992
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,43,Lacerations to lower leg,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5991
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,,Struck by fin on chest & leg,N,,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5990
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,,No injury: Knocked off board by shark,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,2014.08.06,06-Aug-14,2014,Unprovoked,USA,South Carolina,"Folly Beach, Charleston County",Boogie boarding,Riley Harris,M,10,Lacerations to right leg & foot,N,14h00,4' tp 5' shark,"C. Creswell, GSAF; WCSC. 8/6/2014",2014.08.14-Harris.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5694
299,2014.08.05,05-Aug-14,2014,Unprovoked,USA,Florida,"Cocoa Beach, Brevard County",Swimming,female,F,45,Lacerations to foot,N,,,"Inquisitr, 8/7/2014",2014.08.05-Tulip.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5693
300,2014.08.02,02-Aug-14,2014,Unprovoked,USA,Florida,"South of Cocoa Beach, Brevard County",Surfing,male,M,50s,Foot bitten,N,,,"Florida Today, 8/8/2014",2014.08.08-CocoaBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5692
301,2014.08.02,02-Aug-14,2014,Unprovoked,USA,Florida,"Table Beach, Brevard County",Boogie boarding,Christian Sanhueza,M,8,Laceration to ankle,N,13h00,,"Florida Today, 8/2/2014",2014.08.02-Sanhueza.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5691


In [294]:
################################################# CASE NUMBER ########################################################

date = list(df_new["Case Number"])
n = 0
c = 0

for i in range(0,len(date)):
    date[i] = re.sub("\.","/",date[i])
    date[i] = date[i][0:10]
    df_new["Case Number"][i] = date[i]    
for i in range(0,len(date)):   
    if date[i][5:10] == "00/00":
        df_new["Case Number"][i] = date[i][0:4]
    elif date[i][0:2]=="ND":
        n += 1
        df_new["Case Number"][i] = "Unidentified #{}".format(n)


##### falta poner casos mismo dia



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [164]:
# performing different analysis we can improve the table using different methods:
df_new = df_new.fillna("0")

print(df_new["Type"].value_counts())
F.renameF(df_new,"Type","Boat","Boating")

##################### solucionar


#F.renameF(df_new,"Activity", "Kite surfing","Kitesurfing")
#F.renameF(df_new,"Activity", "Body surfing?","Bodysurfing")


e = df_new["Activity"].tolist()
kite = F.findF(list(),"(?i) kit.+",e)
dive = F.findF(list(),"(?i) div.+",e)
swim = F.findF(list(),"(?i) swim.+",e)


    
#[F.renameF(df_new,"Activity",k,"Kitesurf") for k in kite]
#[F.renameF(df_new,"Activity",d,"Diving") for d in dive]3
#[F.renameF(df_new,"Activity",s,"Swimming") for s in swim]

#print(df_new["Activity"].value_counts())





Unprovoked      4386
Provoked         557
Invalid          519
Sea Disaster     220
Boat             200
Boating          110
Name: Type, dtype: int64


In [165]:
df = df_new
df.head()
names = df["Name"].tolist()
for n in names:
    if re.search("(?i) (\w\s\w)",n)==None:
        df["Name"] = df["Name"].replace(n,"Unknown")
    



In [166]:
######

#def Fatality(e):
#    if i.find("fatal") or i.find("FATAL") or i.find("Fatal"):
#        return("Fatal")
#    else:
#        return("Survived")

#df_2 = df.apply(Fatality, axis=1, result_type='expand')

#df_2["Survival"].value_counts()


In [41]:
################################################### TIME #############################################################

#check unique values to unify time information
#print(df_new["Time"].value_counts())

for t in range(0,len(df_new["Time"])):
    time = df_new["Time"][t]
#change values in %%h%%m format
    if re.search("\d\d+",time):
        split = (re.split("h",time))
        if split[0] < "12":
            df_new["Time"][t] = "Morning"
        elif split[0]>"12" and split[0]<"16":
            df_new["Time"][t] = "Afternoon"
        elif split[0]>"16" and split[0]<"20":
            df_new["Time"][t]="Evening"
        else:
            df_new["Time"][t] = "Night"    
#change other formats
    known = ["After","AM","A.M.","P.M.","PM","Dawn","night","Evening","dusk","Midday","sunset","noon","lunch","daybreak","morning","sundown","Dark","Daytime"]
    for s in known:
        if re.search("(?i)"+s,time):
            F.find_Time(df_new,"Time",time,t,s,"Afternoon")
            F.find_Time(df_new,"Time",time,t,s,"Morning")
            F.find_Time(df_new,"Time",time,t,s,"Morning")
            F.find_Time(df_new,"Time",time,t,s,"Morning")
            F.find_Time(df_new,"Time",time,t,s,"Evening")
            F.find_Time(df_new,"Time",time,t,s,"Evening")
            F.find_Time(df_new,"Time",time,t,s,"Morning")
            F.find_Time(df_new,"Time",time,t,s,"Night")
            F.find_Time(df_new,"Time",time,t,s,"Evening")
            F.find_Time(df_new,"Time",time,t,s,"Night")
            F.find_Time(df_new,"Time",time,t,s,"Afternoon")
            F.find_Time(df_new,"Time",time,t,s,"Evening")
            F.find_Time(df_new,"Time",time,t,s,"Afternoon")
            F.find_Time(df_new,"Time",time,t,s,"Afternoon")
            F.find_Time(df_new,"Time",time,t,s,"Evening")
            F.find_Time(df_new,"Time",time,t,s,"Night")
            F.find_Time(df_new,"Time",time,t,s,"Night")
            F.find_Time(df_new,"Time",time,t,s,"Afternoon")

    
print(df_new["Time"].value_counts())


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0            3213
Afternoon    1173
Morning       756
Night         487
Evening       352
--              5
                2
                1
X               1
                1
                1
Name: Time, dtype: int64


In [46]:
df_new["pdf"].value_counts()

1906.09.27.R.a&b-Munich-Swede.pdf      2
1934.12.23.a-b-Inman.pdf               2
1916.07.12.a-b-Stillwell-Fisher.pdf    2
1898.00.00.R-Syria.pdf                 2
1935.06.05.R-SolomonIslands.pdf        2
                                      ..
2011.05.07.R-UAE.pdf                   1
1910.03.31-Barnes.pdf                  1
1984.11.08-Monk.pdf                    1
1941.00.00.h-Munda.pdf                 1
2016.03.26-Kreckman.pdf                1
Name: pdf, Length: 5981, dtype: int64

Empty DataFrame
Columns: [Case Number, Date, Year, Type, Country, Area, Location, Activity, Name, Sex , Age, Injury, Fatal (Y/N), Time, Species , Investigator or Source, pdf, href, original order]
Index: []
