In [1]:
import pandas as pd
import SRC.function as F
import re

# Load the raw GSAF5 CSV export into a DataFrame. The file is decoded as
# cp1252 (presumably because it is not valid UTF-8 — TODO confirm).
df = pd.read_csv("INPUT/GSAF5.csv", encoding = "cp1252")
# Preview the first rows to see which columns the dataset provides.
df.head()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.c,2016.09.18.c,5993,,
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.b,2016.09.18.b,5992,,
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,...,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.18.a,2016.09.18.a,5991,,
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,...,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.17,2016.09.17,5990,,
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,...,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2016.09.16,2016.09.15,5989,,


In [2]:
# First we list every column together with the dtype pandas inferred for it,
# to see which columns were parsed as numbers and which were left as objects.

print(df.dtypes)

Case Number               object
Date                      object
Year                       int64
Type                      object
Country                   object
Area                      object
Location                  object
Activity                  object
Name                      object
Sex                       object
Age                       object
Injury                    object
Fatal (Y/N)               object
Time                      object
Species                   object
Investigator or Source    object
pdf                       object
href formula              object
href                      object
Case Number.1             object
Case Number.2             object
original order             int64
Unnamed: 22               object
Unnamed: 23               object
dtype: object


In [3]:
# Drop columns that carry (almost) no information, then drop redundant ones.

# A column is discarded when more than this share of its rows is NaN.
NAN_DROP_THRESHOLD = 0.9

null_cols = df.isnull().sum()   # NaN count per column
dim = df.shape                  # (rows, columns) before anything is dropped in this cell
print("\nTotal number of values: ",dim[0])
print("\nTotal count of NaN values: \n\n",null_cols)

# Strictly "more than" the threshold: a column sitting exactly on it is kept.
drop_cols = list(null_cols[null_cols > dim[0]*NAN_DROP_THRESHOLD].index)
print("\nColumns to drop: \n",*drop_cols)
df = df.drop(columns=drop_cols)

# "Case Number.1" and "Case Number.2" repeat "Case Number" on nearly every row,
# so we measure the overlap and then drop both copies.
# Series.sum() counts the True entries in one vectorised pass instead of
# iterating over the rows in Python, as the built-in sum() would.
drop1 = (df["Case Number"] == df["Case Number.1"]).sum()
drop2 = (df["Case Number"] == df["Case Number.2"]).sum()

print("\nCommon elements between Case Number and Case Number.1: {} ({}%)".format(drop1,round(drop1*100/dim[0],2)))
print("Common elements between Case Number and Case Number.2: {} ({}%)".format(drop2,round(drop2*100/dim[0],2)))

df_new = df.drop(columns=["Case Number.1","Case Number.2"])

# "href" and "href formula" are expected to hold the same link, so only "href"
# is kept; the printed share shows how often the two columns actually agree.
# Rows where either value is NaN never count as equal (NaN != NaN).
drop3 = (df["href"] == df["href formula"]).sum()

print("Common elements between href and href formula: {} ({}%)".format(drop3,round(drop3*100/dim[0],2)))
df_new = df_new.drop(columns=["href formula"])
                           


Total number of values:  5992

Total count of NaN values: 

 Case Number                  0
Date                         0
Year                         0
Type                         0
Country                     43
Area                       402
Location                   496
Activity                   527
Name                       200
Sex                        567
Age                       2681
Injury                      27
Fatal (Y/N)                 19
Time                      3213
Species                   2934
Investigator or Source      15
pdf                          0
href formula                 1
href                         3
Case Number.1                0
Case Number.2                0
original order               0
Unnamed: 22               5991
Unnamed: 23               5990
dtype: int64

Columns to drop: 
 Unnamed: 22 Unnamed: 23

Common elements between Case Number and Case Number.1: 5979 (99.78%)
Common elements between Case Number and Case Number.2: 5990 (99.97%

In [4]:
# Look for rows that are mostly empty and could therefore be discarded.

# Transposing turns every row into a column, so the per-column null count
# below is in fact the number of missing fields of each original row.
df_rows = df_new.transpose()
null_rows = df_rows.isna().sum()
print(max(null_rows))

# The printed value is the largest number of missing fields found in any
# single row; it was judged low enough that no rows are removed.

df_new.head()

9


Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href,original order
0,2016.09.18.c,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,16.0,Minor injury to thigh,N,13h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.c-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5993
1,2016.09.18.b,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,Chucky Luciano,M,36.0,Lacerations to hands,N,11h00,,"Orlando Sentinel, 9/19/2016",2016.09.18.b-Luciano.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5992
2,2016.09.18.a,18-Sep-16,2016,Unprovoked,USA,Florida,"New Smyrna Beach, Volusia County",Surfing,male,M,43.0,Lacerations to lower leg,N,10h43,,"Orlando Sentinel, 9/19/2016",2016.09.18.a-NSB.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5991
3,2016.09.17,17-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Thirteenth Beach,Surfing,Rory Angiolella,M,,Struck by fin on chest & leg,N,,,"The Age, 9/18/2016",2016.09.17-Angiolella.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5990
4,2016.09.15,16-Sep-16,2016,Unprovoked,AUSTRALIA,Victoria,Bells Beach,Surfing,male,M,,No injury: Knocked off board by shark,N,,2 m shark,"The Age, 9/16/2016",2016.09.16-BellsBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,5989


In [6]:
# Clean up inconsistent category labels found while exploring the columns.

# Exploration aid, kept disabled: lists the distinct "Type" labels.
#print(df_new["Type"].value_counts())
# NOTE(review): F.renameF is a project helper (SRC.function) whose body is not
# visible here; its return value is ignored, so it presumably edits df_new in
# place, replacing the third argument with the fourth in the named column — TODO confirm.
F.renameF(df_new,"Type","Boat","Boating")

# Replace every NaN in the frame with the string "0". re.search below only
# accepts strings, so a NaN left in "Activity" would raise a TypeError.
df_new = df_new.fillna("0")
F.renameF(df_new,"Activity", "Kitesurfing","Kite Surfing")

# Collect every activity label that contains lowercase "surf" (the pattern is
# case-sensitive) so the different spellings can be inspected.
surf_pattern = re.compile("surf")
e = df_new["Activity"].values
x = [label for label in e if surf_pattern.search(label)]
print(x)




['Kite surfing', 'Kite surfing', 'Body surfing', 'Kite surfing', 'Windsurfing', 'Body surfing?', 'Bodysurfing', 'Body surfing or Boogie boarding', 'Body surfing', 'Body surfing', 'Body surfing', 'Walking in surf', 'Body surfing', 'Playing in the surf', 'Windsurfing', 'Body surfing', 'Attempting to Kite surf from Egypt to Saudi Arabia', 'Playing in the surf', 'Playing in the surf', 'Body surfing', 'Windsurfing', 'Paddle-surfing', 'Diving, but on the surface when bitten by the shark', 'Swimming, towing surfboard', 'Swimming or surfing', 'Body surfing', 'Body surfing', 'Body surfing', 'Walking out of the water after surfing', 'Fishing from surfski', 'Walking out of the water after surfing', 'Walking out of the water after surfing', 'Body surfing', 'Scurfing (surfboard being  towed behind a boat)', 'Tandem surfing', 'Swimming or surfing', 'Sitting on surfboard', 'Sitting on surfboard', 'Walking, carrying surfboard & stepped on shark', 'Sitting on surfboard', 'Playing in the surf with his 2

In [None]:
# Count how many records share each raw "Date" string.
df_new["Date"].value_counts()

# TODO: use a regex to move the year to the front of each date so the values can be sorted

In [None]:
# We substitute all NaN values by 0 in order to make the analysis easier
# NOTE(review): this fills `df`, while the cleaned columns live in `df_new`
# (already filled with the string "0" in an earlier cell) — confirm which frame is intended.

df = df.fillna(0)
