------------------
# Data Cleaning
-----------------


In [1]:
# Install to run code in notebook

!pip install plotly.express
!pip install meteostat



In [3]:
# Libraries used for Data Cleaning
import pandas as pd
import re

In [6]:
# Load the raw GSAF5 spreadsheet into `df` and preview the first five rows.
df = pd.read_excel(io="../data/raw/GSAF5.xls")
df.head(n=5)

Unnamed: 0,Date,Year,Type,Country,State,Location,Activity,Name,Sex,Age,...,Species,Source,pdf,href formula,href,Case Number,Case Number.1,original order,Unnamed: 21,Unnamed: 22
0,14th October,2025.0,Unprovoked,Columbia,"Bolivar, del Isolate",Catagena Province,Swimming with sharks,Male child,M,14,...,Nurse shark,Kevin McMurray Trackingsharks.com Andy Currie,,,,,,,,
1,11th October,2025.0,Unprovoked,Australia,Queensland,Cook Esplanade Thursday Island,Fishing/swimming,Samuel Nai,M,14,...,Tiger or Bull shark,Kevin McMurray Trackingsharks.com,,,,,,,,
2,7th October,2025.0,Unprovoked,Australia,South Australia,Kangaroo Island,Surfing,Lee Berryman,M,50+,...,Bronze whaler?,Kevin McMurray Trackingsharks.com,,,,,,,,
3,29th September,2025.0,Unprovoked,USA,Off California,Catalina Island,Swimming,Christopher Murray,M,54,...,unknown 1.2m shark,Todd Smith: Kevin McMurray Trackingsharks.com,,,,,,,,
4,27th September,2025.0,Provoked,Costa Rica,,Cocos Islands,Diving-Tagging sharks,Dr. Mauricio Hoyos,M,48,...,Tiger shark 4m,Todd Smith: Kevin McMurray Trackingsharks.com,,,,,,,,


## State and Location Cleaning

In [7]:
# Frequency of each raw `State` value, most frequent first.
df.loc[:, 'State'].value_counts(sort=True)

State
Florida                                  1192
New South Wales                           522
Queensland                                353
Hawaii                                    344
California                                325
                                         ... 
Chatham Islands, east of New  Zealand       1
Clearwater Bay                              1
Aichi Prefecture                            1
Tafea Province                              1
Moala Island                                1
Name: count, Length: 939, dtype: int64

In [8]:
# Frequency of each raw `Location` value, most frequent first.
df.loc[:, 'Location'].value_counts(sort=True)

Location
New Smyrna Beach, Volusia County       191
Daytona Beach, Volusia County           31
Ponce Inlet, Volusia County             28
Myrtle Beach, Horry County              22
Melbourne Beach, Brevard County         20
                                      ... 
Punaluu                                  1
Te Arai Point                            1
Kalihiwai Beach, Kauai                   1
5 nm off Cervantes                       1
Below the English fort, Trincomalee      1
Name: count, Length: 4598, dtype: int64

In [9]:
# The Location and State columns hold messy free-text geography; derive tidy
# `location_clean` / `state_clean` columns from them.

# Fill missing values
df['Location'] = df['Location'].fillna('Unknown')
# Replace empty states with Unknown. Missing (NaN) states are left as-is here and
# become 'Unknown' in `state_clean` below.
df['State'] = df['State'].replace('', 'Unknown')

# Filler words and distance units to drop (matched as whole words, case-insensitively).
unwanted_words = ['miles', 'mi', 'km', 'kilometers', 'islands', 'near', 'nm', 'off',
                  'below', 'the', 'of', 'east of', 'west of', 'south of',
                  'between', 'north of']

# Coordinates such as "8ºN" or "79ºW". They need their own alternative: `º` counts as a
# word character, so neither `\d+\b` nor `\bºN\b` can match inside "8ºN".
coordinate_pattern = r'\d+(?:\.\d+)?\s*[º°]\s*(?:[NSEW]\b)?'
# Whole-word removal of the filler words plus any standalone number.
word_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in unwanted_words) + r'|\d+)\b'
# `&` is not a word character, so it must sit outside the `\b...\b` group to match " & ".
pattern = coordinate_pattern + r'|&|' + word_pattern


def clean_place_text(column):
    """Return `column` with numbers, units, filler words and coordinates removed.

    Each match is replaced by a space so neighbouring words never fuse. Commas
    orphaned by a removal are collapsed, runs of whitespace are squeezed, and stray
    leading/trailing commas and spaces are trimmed. Values that end up empty or
    missing become 'Unknown'.
    """
    return (
        column
        .str.replace(pattern, ' ', regex=True, flags=re.IGNORECASE)  # remove noise tokens
        .str.replace(r'\s*,(?:\s*,)*', ', ', regex=True)  # collapse orphaned commas
        .str.replace(r'\s+', ' ', regex=True)  # remove extra spaces
        .str.strip(' ,')
        .replace('', 'Unknown')
        .fillna('Unknown')
    )


df['location_clean'] = clean_place_text(df['Location'])
df['state_clean'] = clean_place_text(df['State'])

# Show the raw and cleaned columns side by side
df[['Location', 'State', 'location_clean', 'state_clean']]


                                 Location                  State  \
0                       Catagena Province   Bolivar, del Isolate   
1          Cook Esplanade Thursday Island             Queensland   
2                         Kangaroo Island        South Australia   
3                         Catalina Island         Off California   
4                           Cocos Islands                    NaN   
...                                   ...                    ...   
7045                          Roebuck Bay      Western Australia   
7046                              Unknown      Western Australia   
7047                       Ocracoke Inlet         North Carolina   
7048                 Panama Bay 8ºN, 79ºW                    NaN   
7049  Below the English fort, Trincomalee       Eastern Province   

                      location_clean           state_clean  
0                  Catagena Province  Bolivar, del Isolate  
1     Cook Esplanade Thursday Island            Queensland  


In [9]:
# Frequency of each cleaned state value, most frequent first.
df.loc[:, 'state_clean'].value_counts(sort=True)

state_clean
Florida                 1193
New South Wales          524
Unknown                  486
Queensland               354
Hawaii                   346
                        ... 
Aichi Prefecture           1
Tafea Province             1
Kagoshima Prefecture       1
Costa Blanca               1
Moala Island               1
Name: count, Length: 891, dtype: int64

In [10]:
# Summary (count / unique / top / freq) of the cleaned state column next to the raw one
df.loc[:, ['state_clean', 'State']].describe()


Unnamed: 0,state_clean,State
count,7050,6564
unique,891,939
top,Florida,Florida
freq,1193,1192
