----------------------------------------------------------------
 DATA ANALYSIS AND VISUALIZATION
----------------------------------------------------------------
----------------------------------------------------------------

### LIBRARIES NEEDED

In [6]:
# Install to run code in notebook

!pip install plotly.express
!pip install meteostat



In [7]:
# Libraries used for data analysis and visualization
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd


# Visualizations/Analysis

In [8]:
# Load the raw GSAF spreadsheet into a DataFrame and preview the first rows.
df = pd.read_excel(io='../data/raw/GSAF5.xls')
df.head(5)

Unnamed: 0,Date,Year,Type,Country,State,Location,Activity,Name,Sex,Age,...,Species,Source,pdf,href formula,href,Case Number,Case Number.1,original order,Unnamed: 21,Unnamed: 22
0,14th October,2025.0,Unprovoked,Columbia,"Bolivar, del Isolate",Catagena Province,Swimming with sharks,Male child,M,14,...,Nurse shark,Kevin McMurray Trackingsharks.com Andy Currie,,,,,,,,
1,11th October,2025.0,Unprovoked,Australia,Queensland,Cook Esplanade Thursday Island,Fishing/swimming,Samuel Nai,M,14,...,Tiger or Bull shark,Kevin McMurray Trackingsharks.com,,,,,,,,
2,7th October,2025.0,Unprovoked,Australia,South Australia,Kangaroo Island,Surfing,Lee Berryman,M,50+,...,Bronze whaler?,Kevin McMurray Trackingsharks.com,,,,,,,,
3,29th September,2025.0,Unprovoked,USA,Off California,Catalina Island,Swimming,Christopher Murray,M,54,...,unknown 1.2m shark,Todd Smith: Kevin McMurray Trackingsharks.com,,,,,,,,
4,27th September,2025.0,Provoked,Costa Rica,,Cocos Islands,Diving-Tagging sharks,Dr. Mauricio Hoyos,M,48,...,Tiger shark 4m,Todd Smith: Kevin McMurray Trackingsharks.com,,,,,,,,


## Injury Analysis

In [9]:
# Injury Class Distribution Analysis

# Import the helper BEFORE its first use so the cell also runs on a fresh kernel
# (previously the import came after the call, raising NameError on Restart & Run All).
from utils import classify_injury

# Map every value of the 'Injury' column to an injury class via classify_injury
# (defined in utils; its classification rules are not shown in this notebook).
df['Injury_Class'] = df['Injury'].apply(classify_injury)

# Count the injury classes
injury_counts = df['Injury_Class'].value_counts().reset_index()
injury_counts.columns = ['Injury_Class', 'Count']


fig = px.bar(injury_counts,
             x='Count',
             y='Injury_Class',
             color='Count',
             color_continuous_scale='Reds',
             text='Count',  # bar labels; without text, textposition below has nothing to place
             title='Injury Class Distribution',)
fig.update_traces(textposition='outside')  # draw the count labels outside the bars
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

# Show the chart
fig.show()

In [10]:
# - We can see that most reports are Severe Wounds (~3.2k), followed by Fatal Wounds (~1.6k),
# while ~1.8k were minor wounds or no wounds at all.
# - This shows that more than 50% of the reports resulted in severe or fatal wounds.

--------------------------------

## Fatal Analysis

In [11]:
# Fatal Cases Analysis

from utils import clean_fatal_column

# Normalise the 'Fatal Y/N' column with the project helper before counting.
df = clean_fatal_column(df)

# Tally each 'Fatal Y/N' value into a two-column frame: ['Fatal Y/N', 'Count'].
injury_counts = (
    df['Fatal Y/N']
      .value_counts()
      .rename_axis('Fatal Y/N')
      .reset_index(name='Count')
)

# Bar chart of the tallies, shaded by count.
fig = px.bar(injury_counts, x='Count', y='Fatal Y/N', color='Count',
             color_continuous_scale='Reds', title='Fatal Cases "Yes/No"')

# Print each count next to its bar, outside the bar itself.
fig.update_traces(text=injury_counts['Count'], textposition='outside')

fig.show()


In [12]:
# - From the Fatal Cases chart we can say that about 75% of the cases were not fatal,
# while about 15% were fatal and about 10% are unknown.
# - Although this seems good, we must keep in mind that about 3k non-fatal cases, out of the 5k total, were also
# reported as severe wounds.

-------------------------------------------------

# Relationship between Fatal Y/N and Injury_Class

In [13]:
# HEATMAP: Injury Class vs Fatal Y/N
#
# Cross-counts the derived 'Injury_Class' column against the 'Fatal Y/N' column
# and draws the counts as a heatmap with the number written in each cell.

# These helpers are already imported by the earlier cells and are not called here;
# the import is kept so the names stay available if this cell is run on its own.
from utils import classify_injury, clean_fatal_column

# One row per (Injury_Class, Fatal Y/N) combination with its number of reports.
heatmap_data = (
    df.groupby(['Injury_Class', 'Fatal Y/N'])
      .size()
      .reset_index(name='Count')
)

fig = px.density_heatmap(
    heatmap_data,
    x='Fatal Y/N',
    y='Injury_Class',
    z='Count',
    color_continuous_scale='Reds',  # Red gradient for the cells
    title='Relationship Between Injury Class and Fatality',
    labels={
        'Injury_Class': 'Injury Class',
        # Key must equal the column name exactly; the previous 'Fatal_Y/N'
        # (underscore) matched no column, so the label was silently ignored.
        'Fatal Y/N': 'Fatal Y/N',
        'Count': 'Number of Cases'
    }
)

# Write the count in the middle of each heatmap cell.
# Tuple order follows the column order of heatmap_data:
# ('Injury_Class', 'Fatal Y/N', 'Count').
for injury_class, fatal_flag, case_count in heatmap_data.itertuples(index=False, name=None):
    fig.add_annotation(
        x=fatal_flag,
        y=injury_class,
        text=str(int(case_count)),
        showarrow=False,
        font=dict(color='black', size=10)
    )

# Final layout polish
fig.update_layout(
    template='plotly_white',
    xaxis_title='Fatal Y/N',
    yaxis_title='Injury Class',
    coloraxis_colorbar=dict(title='Number of Cases'),
)

fig.show()

In [14]:
# - This heatmap shows the relationship between injury severity and fatality.
# - Why do we have "No" under Fatal Wounds? Possibly due to ambiguous text, data entry errors,
# or keyword rules catching phrases like “near-fatal” or “almost fatal”, because we mapped the keyword "fatal" to Fatal Wounds.
# - The difference of 214 cases between the Fatal Wounds total and the "Yes" count within Fatal Wounds
# means that there are 214 cases that were classified as Fatal Wounds but were not
# reported as fatal in the Fatal Y/N column.
# - We can therefore assume that we have a total of 1407 real fatal cases within Fatal Wounds.
# - This also happens in other Injury Classes but with smaller differences, which means this heatmap is useful
#  to spot potential data inconsistencies.
# - Should we clean these inconsistencies out of the dataset, or can we assume they are not needed for our app?

What percentage of ‘Severe Wounds’ were actually fatal then?

In [15]:
# Share (in %) of each 'Fatal Y/N' outcome within every injury class:
# cross-tabulate, normalise per Injury_Class row, scale to percent, then
# reshape to long format (one row per class/outcome pair) for Plotly.
heatmap_prop = (
    pd.crosstab(df['Injury_Class'], df['Fatal Y/N'], normalize='index')
      .mul(100)
      .reset_index()
      .melt(id_vars='Injury_Class', value_name='Percentage')
)

# Heatmap coloured by the percentage value of each cell.
fig = px.density_heatmap(
    heatmap_prop,
    x='Fatal Y/N',
    y='Injury_Class',
    z='Percentage',
    color_continuous_scale='Reds',
    title='Heatmap (%): Relationship Between Injury Class and Fatality',
    labels={'Injury_Class': 'Injury Class',
            'Fatal Y/N': 'Fatal Y/N',
            'Percentage': 'Percentage of Cases'},
)

# Label every cell with its percentage, rounded to one decimal place.
for fatal_flag, injury_class, pct in zip(heatmap_prop['Fatal Y/N'],
                                         heatmap_prop['Injury_Class'],
                                         heatmap_prop['Percentage']):
    fig.add_annotation(x=fatal_flag,
                       y=injury_class,
                       text=f"{pct:.1f}%",
                       showarrow=False,
                       font=dict(color='black', size=10))

# Axis titles, colour-bar title and a white template.
fig.update_layout(template='plotly_white',
                  xaxis_title='Fatal Y/N',
                  yaxis_title='Injury Class',
                  coloraxis_colorbar=dict(title='Percentage'))

fig.show()

In [16]:
# - The percentage heatmap confirms our previous assumptions.
# - The ‘Fatal Wounds’ category aligns strongly with fatal outcomes, while the ‘Minor’ and
# ‘No Injury’ classes are almost entirely non-fatal.