In [1]:
import pandas as pd

df = pd.read_excel('GSAF5.xls')

In [2]:
# Divya's work
# Cleaning up 'activity' column in the dataset

df['Activity'].nunique() #unique values in the activity column

1608

In [3]:
df['activity_clean'] = (
    df['Activity']
    .str.lower() #lowercase
    .str.strip() #remove spaces
    .str.replace(r'[^a-z\s]', '', regex=True)  #remove punctuation/numbers
)

In [45]:
import numpy as np

def categorize_activity(text):
    if pd.isna(text): #handling 'NaN' values and labelling them as unknown
        return "Unknown"
    if any(word in text for word in ["surf", "bodyboard", "paddle", "boogie", "body boarding", "kiteboarding", "foilboarding", "skimboarding", "wakeboarding"]): #if the word is present in the text, then loop over each word and replace with return 'string'
        return "Surfing"
    elif any(word in text for word in ["swim", "bathing", "snorkel", "rescue", "float", "splash", "swimming"]):
        return "Swimming"
    elif any(word in text for word in ["fish", "spearfish", "net", "catch", "line", "fishing", "spear", "scalloping", "lobstering", "hunt", "clamming"]):
        return "Fishing"
    elif any(word in text for word in ["dive", "scuba", "freediv", "underwater", "research", "investigat", "pearl", "recover", "diving"]): #'investigat' covers terms like investigating / investigation / investigator..
        return "Diving"
    elif any(word in text for word in ["boat", "kayak", "sail", "ship", "vessel", "frigate", "dinghy", "canoe", "race", "compet", "rowing", "watercraft", "jet ski", "paddling", "sculling", "raft", "yacht"]):
        return "Boating"
    elif any(word in text for word in ["walk", "stand", "wade", "reef", "shore", "beach", "adrift", "wading", "tread"]):
        return "Wading"
    elif any(word in text for word in ["sea disaster", "aircraft", "boeing", "wreck", "hurricane", "tsunami", "earthquake", "disaster", "plunged", "sank", "destroyed", "overboard", "suicide", "air", "petting", "capsize", "swept", "help", "ride", "sunk", "went down", "crash", "sinking"]):
        return "Catastrophe"
    else:
        return "Other Activity"

df['Activity_group'] = df['activity_clean'].apply(categorize_activity) #new column 'activity group' for categorised activity


In [5]:
print("Original:", df['Activity'].nunique()) #original dataset
print("Cleaned:", df['activity_clean'].nunique()) #activity column after removing spaces, fixing inconsistent letter cases and removing punctutations
print("Grouped:", df['Activity_group'].nunique()) #after categorising all the activity under specific keywords

Original: 1608
Cleaned: 1522
Grouped: 9


In [6]:
df[df['Activity_group'] == 'Other Activity']['activity_clean'].sample(40, random_state=42).tolist()
#sample of 'other' category to improve mapping and grouping

['baiting sharks',
 'unknown but it was said to be the first known attack in sydney harbour',
 'attempting to fix motor',
 'sitting in shallows',
 'sitting in the water',
 'washing cooking pans',
 'human head found in shark caught by british steamer syria',
 'playing in the water',
 'pulling anchor',
 'escaping from alacatraz',
 'measuring sharks',
 'filming',
 'washing his feet',
 'stamding',
 'masted steel barque glenbank foundered during a cyclone',
 'filming  blue shark',
 'lying prone in  of water',
 'waterskiing',
 'kite boarding',
 'dangling feet in the water',
 'jumped into the water',
 'attempting to lasso sharks tail',
 'hms victoria collided with the hms camperdown',
 'escaping from alacatraz',
 'on a roundtheworld expedition',
 'attempting to illegally enter the usa',
 'inebriated woke from sleep and fell off deck into the water',
 'fell into the water',
 'sitting in shallows',
 'hbm magpie foundered in a squall',
 'feeding mullet to sharks',
 'parachuted from balloon',
 't

In [46]:
df['Activity_group'].value_counts()

Activity_group
Swimming          1667
Surfing           1637
Fishing           1368
Unknown            585
Diving             568
Wading             387
Boating            332
Other Activity     332
Catastrophe        174
Name: count, dtype: int64

In [8]:
#Cleaning and formatting the 'date' column and extracting 'month' from it
df["Date"].head(30)

0            14th October
1            11th October
2             7th October
3          29th September
4          27th September
5           6th September
6           1st September
7             30th August
8             18th August
9             17th August
10            16th August
11             7th August
12             1st August
13              28th July
14              25th July
15              22nd July
16              20th July
17              19th July
18              18th July
19              15th July
20              6th July 
21               6th July
22               4th July
23              29th June
24              25th June
25              22nd June
26              17th June
27    2025-06-11 00:00:00
28               31st May
29    2025-05-29 00:00:00
Name: Date, dtype: object

In [9]:
df["Date"].tail(30)

7020                                         World War II
7021                                          Before 1905
7022                              A few years before 1938
7023                                              No date
7024                                          Early 1930s
7025                                          Before 1927
7026                                  Between 1918 & 1939
7027                                              No date
7028                                              No date
7029                                              No date
7030                                           1920 -1923
7031                                          Before 1921
7032                                          Before 1911
7033                                          Before 1921
7034                                          Before 1921
7035                                          Before 1917
7036                                   Before 17-Jul-1916
7037    No dat

In [10]:
df['Date'] = df['Date'].astype(str)

In [11]:
import numpy as np

def extract_month_or_nan(value):
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    value_str = str(value)
    for month in months:
        if month in value_str:
            return month
    return np.nan  #assign NaN if no month found

df['Extracted_Month_NaN'] = df['Date'].apply(extract_month_or_nan)

count_months = df['Extracted_Month_NaN'].count() #count extracted months (non-null values)
print(f'Number of extracted months: {count_months}')

count_nan = df['Extracted_Month_NaN'].isna().sum() #count how many NaN values were assigned
print(f'Number of non-month values: {count_nan}')

Number of extracted months: 6449
Number of non-month values: 601


In [12]:
df['Extracted_Month_NaN'].value_counts()

Extracted_Month_NaN
Jul    784
Aug    673
Sep    611
Jan    566
Jun    549
Oct    503
Apr    493
Dec    491
Mar    458
May    452
Nov    451
Feb    418
Name: count, dtype: int64

In [13]:
def get_season(month):
    if month in ["Dec", "Jan", "Feb"]:
        return "Winter"
    elif month in ["Mar", "Apr", "May"]:
        return "Spring"
    elif month in ["Jun", "Jul", "Aug"]:
        return "Summer"
    elif month in ["Sep", "Oct", "Nov"]:
        return "Autumn"
    else:
        return np.nan

df["Season"] = df["Extracted_Month_NaN"].apply(get_season) #applying the season assigning as a column to the dataframe

In [14]:
df['Extracted_Month_NaN'].value_counts()

Extracted_Month_NaN
Jul    784
Aug    673
Sep    611
Jan    566
Jun    549
Oct    503
Apr    493
Dec    491
Mar    458
May    452
Nov    451
Feb    418
Name: count, dtype: int64

In [15]:
df.to_csv("../data/interim/shark_attacks_cleaned.csv", index=False)

#folder needs to be created


OSError: Cannot save file into a non-existent directory: '../data/interim'

## Summary Statistics

In [None]:
#summary for categorical variables
df[['Activity_group', 'Season']].describe()


Unnamed: 0,Activity_group,Season
count,7050,6449
unique,9,4
top,Swimming,Summer
freq,1667,2006


- **Swimming** is the top most activity, that's associated with shark attacks
- **Summer** is the season where most of the shark attacks happen

In [None]:
#summary for categorical variable - 'Activity_group'
df['Activity_group'].value_counts()

Activity_group
Swimming          1667
Surfing           1637
Fishing           1368
unknown            585
Diving             568
Wading             387
Boating            332
Other Activity     332
Catastrophe        174
Name: count, dtype: int64

In [None]:
#summary for categorical variable - 'Season'
df['Season'].value_counts()

Season
Summer    2006
Autumn    1565
Winter    1475
Spring    1403
Name: count, dtype: int64

## Visualizations

### Univariate Analysis

In [None]:
import plotly.express as px

#### 1. Activity groups related to shark attacks

In [47]:
#Attack per activity group
activity_counts = df['Activity_group'].value_counts().reset_index()
activity_counts.columns = ['Activity Group', 'Number of Attacks'] #convert into a dataframe because plotly needs a dataframe as input

#create an interactive bar chart
fig = px.bar(activity_counts,
             x='Activity Group',
             y='Number of Attacks',
             color='Activity Group',
             title='Number of Shark Attacks by Activity',
             text='Number of Attacks') #adds title for each bar
fig.update_traces(textposition='outside') #bar titles outside the bars
fig.update_layout(xaxis_title='Activity Group',
                  yaxis_title='Number of Attacks',
                  showlegend=False)
fig.update_layout(margin=dict(t=100, b=40, l=60, r=40))
fig.show()


#### 2. Seasons corresponding to shark attacks

In [None]:
#Number of attacks per season
season_counts = df['Season'].value_counts().reset_index()
season_counts.columns = ['Season', 'Number of Attacks']

#create an interactive bar chart
fig = px.bar(season_counts,
             x='Season',
             y='Number of Attacks',
             color='Season',
             title='Number of Shark Attacks by Season',
             text='Number of Attacks')

#Layout adjustments
fig.update_traces(textposition='outside')
fig.update_layout(xaxis_title='Season',
                  yaxis_title='Number of Attacks',
                  showlegend=False,
                  margin=dict(t=100, b=40, l=60, r=40))
fig.show()

### Bivariate Analysis

#### 1. Relationship between activity group and victim fatality

In [18]:
df["Fatal Y/N"].unique()

array(['N', 'Y', 'F', 'M', nan, 'n', 'Nq', 'UNKNOWN', 2017, 'Y x 2', ' N',
       'N ', 'y'], dtype=object)

In [21]:
#Fatal Y/N column cleaning

df['Fatal Y/N'] = df['Fatal Y/N'].astype(str).str.strip().str.upper()

df['Fatal Y/N'] = df['Fatal Y/N'].replace({
    'UNKNOWN': 'N',  
    'F': 'N',
    'M': 'N',
    'NAN': 'N',
    'NQ': 'N',
    '2017': 'N',
    'Y X 2': 'Y',
})

In [22]:
df["Fatal Y/N"].value_counts()

Fatal Y/N
N    5566
Y    1484
Name: count, dtype: int64

In [23]:
df['Fatal Y/N'] = df['Fatal Y/N'].replace({
    'N': 'Survived',  
    'Y': 'Did not survive'})

In [24]:
df["Fatal Y/N"].value_counts()

Fatal Y/N
Survived           5566
Did not survive    1484
Name: count, dtype: int64

In [48]:
import plotly.express as px
import pandas as pd

# Calculate proportions
activity_fatal = (
    df.groupby("Activity_group")["Fatal Y/N"]
    .value_counts(normalize=True)
    .rename("Proportion")
    .reset_index()
)

# Extract only fatal proportions for sorting
fatal_order = (
    activity_fatal[activity_fatal["Fatal Y/N"] == "Did not survive"]
    .sort_values("Proportion", ascending=False)["Activity_group"]
)

# Create the bar chart, sorted by fatality proportion
fig = px.bar(
    activity_fatal,
    x="Activity_group",
    y="Proportion",
    color="Fatal Y/N",
    barmode="stack",
    category_orders={"Activity_group": fatal_order},
    text=activity_fatal["Proportion"].apply(lambda x: f"{x:.0%}"),
    title="Proportion of Fatal vs Survived Incidents by Activity Group (Sorted by Fatality Rate)",
    labels={
        "Activity_group": "Activity Group",
        "Proportion": "Proportion",
        "Fatal Y/N": "Outcome",
    },
)

# Layout adjustments
fig.update_traces(textposition="inside")
fig.update_layout(
    xaxis_tickangle=-45,
    yaxis_tickformat=".0%",
    legend_title="Fatality Outcome",
    bargap=0.25,
)
fig.show()


- About 65% of the victims of **Catastrophes** like aircraft accidents over seas, drowning, Tsunamis, Floods and Hurricanes did not survive.
- **Boating** is activity that resulted in the second most fatality.
- **Surfing** is the activity that resulted in the least amount of fatal incidents.

#### 2. Relationship between Activity group and Season of the attack

In [51]:
fig = px.density_heatmap(
    heatmap_data,
    x='Season',
    y='Activity_group',
    z='Count',
    category_orders={'Season': season_order},
    color_continuous_scale='Blues',
    title='Activities related to shark attack across Seasons',
    labels={'Activity_group': 'Activity', 'Season': 'Season', 'Count': 'Number of Attacks'}
)

# Add annotations
for i, row in heatmap_data.iterrows():
    fig.add_annotation(
        x=row['Season'],
        y=row['Activity_group'],
        text=str(int(row['Count'])),
        showarrow=False,
        font=dict(color='black', size=10)
    )

fig.show()

- **Swimming** is the activity that resulted in the most shark attacks during **Summer**
- **Surfing** is activity that resulted in the most shark attacks during **Autumn**