# Imports
- All library imports
- Original DataFrame import

In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw spreadsheet into `shark_df`; the cells below read and rebind this name.
shark_df = pd.read_excel('../shark-dataset.xls')
# Bare expression: presumably rendered as the cell output when run in a notebook.
shark_df

### Column cleaning

In [None]:
# Columns (named after normalisation) that clean_columns removes.
unused_columns = ['type', 'state', 'name', 'location', 'species', 'source', 'pdf', 'href_formula', 'href', 'case_number', 'case_number.1', 'original_order', 'unnamed:_21', 'unnamed:_22', 'time', 'injury']


def clean_columns(df):
    """Return a copy of *df* with normalised column labels and unused columns removed.

    Labels are lower-cased, stripped of surrounding whitespace and have inner
    spaces replaced by underscores. The column 'unnamed:_11' is renamed to
    'fatal', and every column listed in ``unused_columns`` is dropped (labels
    that are absent are ignored).

    The input frame is left untouched: relabelling happens on a copy, so
    calling this function never changes the caller's column names as a side
    effect.
    """
    normalized_labels = df.columns.str.lower().str.strip().str.replace(" ", "_", regex=False)
    cleaned = df.copy()
    cleaned.columns = normalized_labels
    cleaned = cleaned.rename(columns={'unnamed:_11': 'fatal'})
    return cleaned.drop(columns=unused_columns, errors='ignore')

# Normalise the column labels and drop the columns listed in `unused_columns`.
shark_df = clean_columns(shark_df)
# Inspect the last 50 rows of the cleaned frame.
shark_df.tail(50)

### Country formatting

In [None]:
# # capitalize names in 'country' except for 'USA', handle two-word countries
# def country_formatting(df):
#     df['country'] = df['country'].apply(lambda x: 
#     ' '.join(word.capitalize() for word in x.split()) if x.lower() != 'usa' else x)
#     return df

# shark_df = country_formatting(shark_df)
# shark_df.head(50)

### Year filtering

In [None]:
# Default analysis window (both bounds inclusive).
start_year = 2014
end_year = 2024


def filter_years(df, first_year=None, last_year=None):
    """Return the rows of *df* whose 'year' lies within an inclusive window.

    Parameters
    ----------
    df : DataFrame with a numeric 'year' column.
    first_year, last_year : optional bounds; when omitted, the module-level
        ``start_year`` / ``end_year`` are used.

    Returns
    -------
    A new DataFrame (the input is not modified) whose 'year' column is
    converted to int.
    """
    if first_year is None:
        first_year = start_year
    if last_year is None:
        last_year = end_year

    # Both bounds are evaluated on the frame that was passed in, not on a global.
    in_window = (df['year'] >= first_year) & (df['year'] <= last_year)

    # .copy() so the assignment below targets an independent frame rather
    # than a slice of the caller's data.
    filtered = df[in_window].copy()
    # Rows with a missing year fail both comparisons above, so fillna(0) is
    # only a safeguard before the integer cast.
    filtered['year'] = filtered['year'].fillna(0).astype(int)
    return filtered

# Apply the year window defined by filter_years in this cell (the cell
# previously re-ran clean_columns here, so the filter was never invoked).
shark_df = filter_years(shark_df)
shark_df.head()

## Years (Tung)

In [None]:
# Restrict the data to the analysis window (both bounds inclusive).
start_year = 2014
end_year = 2024

# .copy() makes the filtered frame independent, so the column assignment
# below does not write into a slice of the previous frame.
year_in_range = (shark_df["year"] >= start_year) & (shark_df["year"] <= end_year)
shark_df = shark_df[year_in_range].copy()

# convert float in year to int (rows with a missing year were already
# removed by the comparison above; fillna(0) only guards the cast)
shark_df["year"] = shark_df["year"].fillna(0).astype(int)

# remove invalid rows with "2014" as date
# NOTE(review): this drops the last two rows by position, so it depends on
# the current row order of the spreadsheet - confirm it still targets the
# intended rows whenever the source file changes.
shark_df = shark_df.drop(shark_df.index[-2:])

# NOTE(review): hard-coded seasonal totals; nothing in the visible notebook
# reads them - presumably copied from an earlier run, confirm before reuse.
total_count_spring = 298
total_count_summer = 405
total_count_autumn = 290
total_count_winter = 231

shark_df

## Date & Time (Tung)

In [None]:
# Function to parse different date formats
def parse_date(date_str):
    """Turn a raw 'date' cell into a datetime, or None when that is not possible.

    Strings are first handed to ``pd.to_datetime``. If that raises ValueError,
    the first substring shaped like ``YYYY-M-D``, ``D-Mon-YYYY`` or ``Mon-YYYY``
    is extracted and tried against the strptime layouts below, in order.
    datetime instances are returned unchanged; every other input gives None.
    """
    if isinstance(date_str, datetime):
        return date_str
    if not isinstance(date_str, str):
        return None

    try:
        return pd.to_datetime(date_str)
    except ValueError:
        pass  # fall through to the pattern-based extraction

    found = re.search(r'(\d{4}-\d{1,2}-\d{1,2}|\d{1,2}-[A-Za-z]{3}-\d{4}|\b[A-Za-z]{3}-\d{4}\b)', date_str)
    if found is None:
        return None

    candidate = found.group(0)
    for layout in ("%Y-%m-%d", "%d-%b-%Y", "%b-%Y"):
        try:
            return datetime.strptime(candidate, layout)
        except ValueError:
            continue
    return None

# Create datetime_column (parsed value or None) and string_column (the raw
# cell when it was text, else None).
shark_df["datetime_column"] = shark_df["date"].apply(parse_date)
shark_df["string_column"] = shark_df["date"].apply(lambda x: x if isinstance(x, str) else None)

# Drop rows whose date could not be parsed. .copy() makes the result an
# independent frame so the column assignments below do not write into a slice.
shark_df = shark_df[shark_df["datetime_column"].notna()].copy()

# Extract month and year from datetime_column. Note this overwrites the
# existing 'year' column with the year taken from the parsed date.
# parse_date can return both pandas and standard-library datetime objects,
# which is presumably why element-wise apply is used rather than `.dt`.
shark_df['month'] = shark_df["datetime_column"].apply(lambda x: x.month if pd.notnull(x) else None)
shark_df['year'] = shark_df["datetime_column"].apply(lambda x: x.year if pd.notnull(x) else None)

# Month numbers grouped by season name.
# NOTE(review): this grouping puts March-May in spring for every row,
# whatever the country - confirm that is intended.
season_mapping = {
    "Spring": [3, 4, 5],
    "Summer": [6, 7, 8],
    "Autumn": [9, 10, 11],
    "Winter": [12, 1, 2],
}


def what_season(month):
    """Return the season whose month list contains *month*, or None if none does."""
    candidates = (name for name, members in season_mapping.items() if month in members)
    return next(candidates, None)

# Label every row with the season returned by what_season for its 'month'.
shark_df['season'] = shark_df['month'].apply(what_season)

# Print the date-related columns side by side to check the parsing result.
print(shark_df[['date', 'datetime_column', 'string_column', 'year', 'month', 'season']])
# Bare expression: first rows of the full frame.
shark_df.head()

## Fatality rates (Bru)

In [None]:
# Make sure the fatality column is called 'fatal' (no effect if the old
# label is not present).
shark_df.rename(columns={'unnamed:_11': 'fatal'}, inplace=True)

# Raw code -> normalised label, grouped by target label.
replacement_dict = {
    'N': 'no',
    'n': 'no',
    'Y': 'yes',
    'M': 'unknown',
    'F': 'unknown',
    'Nq': 'unknown'
}

# Missing values count as 'unknown'; the listed raw codes are then mapped.
fatal_without_gaps = shark_df['fatal'].fillna('unknown')
shark_df['fatal'] = fatal_without_gaps.replace(replacement_dict)

fatality_counts = shark_df['fatal'].value_counts()
display("fatality counts", fatality_counts)

shark_df.head()

## Activity (Bru)

In [None]:
# Snapshot of the distinct raw activity values (kept for inspection).
unique_values_activity = shark_df['activity'].unique()

# Normalise the text step by step: trim, lower-case, then remove quote
# characters (both single and double).
activity_text = shark_df['activity'].str.strip()
activity_text = activity_text.str.lower()
shark_df['activity'] = activity_text.str.replace(r"[\"']", '', regex=True)

# Filled in by word_count().
most_common_words = []

def word_count():
    """Return the words of length >= 5 among the 50 most frequent activity words.

    Side effects: missing values in ``shark_df['activity']`` are replaced by
    empty strings (the column is cast to str), and the module-level
    ``most_common_words`` is rebound to the returned list.
    """
    global most_common_words

    # Empty strings instead of NaN so the column can be joined into one text.
    activities = shark_df['activity'].fillna('').astype(str)
    shark_df['activity'] = activities

    tokens = re.findall(r'\w+', ' '.join(activities).lower())
    ranked = Counter(tokens).most_common(50)
    most_common_words = [token for token, _ in ranked if len(token) >= 5]
    return most_common_words

# Compute the frequent activity words. word_count() also fills missing
# activities with '' as a side effect. The result is not read again in the
# visible notebook; it presumably served to choose the labels below.
most_common_words = word_count()

print('Top values before replacement function\n', shark_df['activity'].value_counts().head(10))

# Manually entered selected values: the canonical activity labels passed to
# replace_values.
# NOTE(review): some labels are substrings of others ('diving' is contained
# in 'scuba diving') - check how replace_values resolves such overlaps.
selected_values_to_replace = ['surfing', 'diving', 'fishing', 'swimming', 'wading', 'bathing', 'snorkeling', 'kayaking', 'body boarding', 'scuba diving']

def replace_values(shark_df, selected_values_to_replace):
    """Collapse free-text activities onto a fixed set of labels.

    Every row whose 'activity' text contains one of the labels
    (case-insensitive, pattern semantics of ``Series.str.contains``) has its
    'activity' replaced by that label. Rows matching no label are unchanged.

    Resolution when a row matches several labels:
    * a label that contains another label (e.g. 'scuba diving' contains
      'diving') takes precedence over the shorter one, so the more specific
      category is reachable regardless of list order;
    * otherwise the label listed first wins.

    Matching is done against the text as it was on entry, so a label written
    into a row is never re-matched by a later, shorter label.

    Parameters
    ----------
    shark_df : DataFrame with an 'activity' column; modified in place.
    selected_values_to_replace : iterable of label strings.

    Returns
    -------
    The same DataFrame object, for call chaining.
    """
    labels = list(selected_values_to_replace)

    def _is_general(label):
        # True when no other label is contained in this one.
        lowered = label.lower()
        return not any(other != label and other.lower() in lowered for other in labels)

    # Stable sort: specific labels (key False) first, caller's order otherwise.
    ordered_labels = sorted(labels, key=_is_general)

    original_text = shark_df['activity'].copy()
    claimed = np.zeros(len(shark_df), dtype=bool)  # rows that already got a label

    for label in ordered_labels:
        hits = original_text.str.contains(label, case=False, na=False).to_numpy(dtype=bool)
        hits &= ~claimed
        shark_df.loc[hits, 'activity'] = label
        claimed |= hits
    return shark_df

# Map the free-text activities onto the selected labels.
shark_df = replace_values(shark_df, selected_values_to_replace)

# The ten most frequent activity values after the replacement.
activity_frequencies = shark_df['activity'].value_counts()
top_10_activities = activity_frequencies.head(10).index

# Keep only the rows whose activity is one of those ten values.
is_top_activity = shark_df['activity'].isin(top_10_activities)
shark_df = shark_df[is_top_activity]

# Display results
print("Top values after replacement:\n", shark_df['activity'].value_counts().head(10))
print("Filtered DataFrame:\n", shark_df.head(10))

In [None]:
# Trim whitespace first, so variants such as ' M' collapse onto 'M'.
shark_df['sex'] = shark_df['sex'].str.strip()

# Map the remaining irregular codes. 'M' and 'F' need no entry, and no
# whitespace-padded key can occur after the strip above.
shark_df['sex'] = shark_df['sex'].replace({
    'N': np.nan,
    'M x 2': 'M',
    'lli': np.nan,
    '.': np.nan,
})

shark_df['sex'] = shark_df['sex'].fillna('unknown')

# Counts of the known values "M" and "F"
total_known = shark_df['sex'].value_counts()
m_count = total_known.get('M', 0)
f_count = total_known.get('F', 0)
total = m_count + f_count

# Share of "M" and "F" among the known values
if total > 0:
    m_percentage = m_count / total
    f_percentage = f_count / total
else:
    m_percentage = 0.5  # Default to equal distribution if no known values
    f_percentage = 0.5

# Rows still marked 'unknown'
unknown_indices = shark_df[shark_df['sex'] == 'unknown'].index
unknown_count = len(unknown_indices)

# How many unknown rows receive "M"; the remainder receives "F" so that
# every unknown row is assigned.
m_fill_count = int(m_percentage * unknown_count)
f_fill_count = unknown_count - m_fill_count

# Shuffle with a fixed seed so that re-running the notebook reproduces the
# same assignment (an unseeded shuffle gives a different dataset each run).
# NOTE(review): the imputed values are synthetic - they keep the observed
# M/F ratio but carry no information about the individual rows.
SEX_IMPUTATION_SEED = 42
sex_rng = np.random.default_rng(SEX_IMPUTATION_SEED)
shuffled_indices = sex_rng.permutation(unknown_indices.to_numpy())

# Split the shuffled labels into the "M" and "F" groups
m_indices = shuffled_indices[:m_fill_count]
f_indices = shuffled_indices[m_fill_count:]

# Assign "M" and "F" to the split indices
shark_df.loc[m_indices, 'sex'] = 'M'
shark_df.loc[f_indices, 'sex'] = 'F'

# Verify replacements by checking updated counts
print(shark_df['sex'].value_counts())

In [None]:
# Descriptive age terms (already stripped and lower-cased) -> approximate age.
_DESCRIPTIVE_AGES = {
    "teen": 15,
    "teens": 15,
    "adult": 30,
    "middle age": 45,
    '"middle-age"': 45,
    "elderly": 70,
    "a minor": 10,
    "young": 10,
    "infant": 1,
    "9 months": 1,
    "2 to 3 months": 1,
}


def convert_descriptive_age(value):
    """Replace a descriptive age term with an approximate number.

    Null input gives NaN. Otherwise the value is converted to a stripped,
    lower-cased string; known terms map to the ages in ``_DESCRIPTIVE_AGES``,
    any other text containing "month" maps to 1, and everything else is
    returned as that normalised string.
    """
    if pd.isnull(value):
        return np.nan

    text = str(value).strip().lower()
    if text in _DESCRIPTIVE_AGES:
        return _DESCRIPTIVE_AGES[text]
    # Any other age given in months is treated as an infant.
    return 1 if "month" in text else text

# Step 1 of the age clean-up: descriptive terms -> approximate numbers; all
# other non-null values become stripped, lower-cased strings.
shark_df['age'] = shark_df['age'].apply(convert_descriptive_age)

def convert_to_first_age(value):
    """Return the first run of digits in a string as an int.

    Non-string values, and strings without any digit, are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    first_number = re.search(r'\d+', value)
    return int(first_number.group()) if first_number else value

# Step 2: strings containing digits are reduced to their first number (as int).
shark_df['age'] = shark_df['age'].apply(convert_to_first_age)

def convert_half_age(value):
    """Convert a string containing "½" to a float, reading "½" as ".5".

    Values that are not strings, or strings without "½", are returned as-is.
    """
    if not (isinstance(value, str) and "½" in value):
        return value
    decimal_text = value.replace("½", ".5")
    return float(decimal_text)

# Step 3: convert "½" notation to a float.
# NOTE(review): convert_to_first_age ran just before and already turned
# every string with a \d match into an int, so only strings without such
# digits (e.g. a bare "½") can still reach the "½" branch - confirm the
# order of these two steps is intended.
shark_df['age'] = shark_df['age'].apply(convert_half_age)


#Convert any remaining irregular entries to NaN
def convert_irregular_entries(value):
    """Return NaN for strings that contain no digit character; pass everything else through."""
    if not isinstance(value, str):
        return value
    contains_digit = any(map(str.isdigit, value))
    return value if contains_digit else np.nan

# Blank out the remaining text entries, then coerce everything to numbers
# (anything still unparsable becomes NaN).
numeric_age = pd.to_numeric(
    shark_df['age'].apply(convert_irregular_entries),
    errors='coerce',
)

# Missing ages are filled with the most frequent age (first mode).
age_mode = numeric_age.mode()[0]

# Store the completed column as integers.
shark_df['age'] = numeric_age.fillna(age_mode).astype(int)

In [None]:
# Make sure the fatality column is called 'fatal' (no effect if the old
# label is not present).
shark_df.rename(columns={'unnamed:_11': 'fatal'}, inplace=True)

# Raw code -> normalised label. None of the output labels is itself a key,
# so applying the mapping to already-normalised values changes nothing.
replacement_dict = {
    'N': 'no',
    'Y': 'yes',
    'M': 'unknown',
    'F': 'unknown',
    'n': 'no',
    'Nq': 'unknown'
}

#fill NaN vals with 'unknown' and replace unique values
shark_df['fatal'] = shark_df['fatal'].fillna('unknown').replace(replacement_dict)

fatality_counts = shark_df['fatal'].value_counts()
display("fatality counts", fatality_counts)

# Call head() - without the parentheses only the bound method is shown,
# not the first rows.
shark_df.head()

In [None]:
# Function to parse different date formats
def parse_date(date_str):
    """Turn a raw 'date' cell into a datetime, or None when that is not possible.

    Strings go to ``pd.to_datetime`` first. When that raises ValueError, the
    first substring shaped like ``YYYY-M-D``, ``D-Mon-YYYY`` or ``Mon-YYYY`` is
    pulled out and tried against the strptime layouts below, in order.
    datetime instances come back unchanged; any other input gives None.
    """
    if isinstance(date_str, datetime):
        return date_str
    if not isinstance(date_str, str):
        return None

    try:
        return pd.to_datetime(date_str)
    except ValueError:
        pass  # fall through to the pattern-based extraction

    found = re.search(r'(\d{4}-\d{1,2}-\d{1,2}|\d{1,2}-[A-Za-z]{3}-\d{4}|\b[A-Za-z]{3}-\d{4}\b)', date_str)
    if found is None:
        return None

    fragment = found.group(0)
    for layout in ("%Y-%m-%d", "%d-%b-%Y", "%b-%Y"):
        try:
            return datetime.strptime(fragment, layout)
        except ValueError:
            continue
    return None

# Create datetime_column (parsed value or None) and string_column (the raw
# cell when it was text, else None).
shark_df["datetime_column"] = shark_df["date"].apply(parse_date)
shark_df["string_column"] = shark_df["date"].apply(lambda x: x if isinstance(x, str) else None)

# Drop rows whose date could not be parsed. .copy() makes the result an
# independent frame so the column assignments below do not write into a slice.
shark_df = shark_df[shark_df["datetime_column"].notna()].copy()

# Extract month and year from datetime_column. Note this overwrites the
# existing 'year' column with the year taken from the parsed date.
# parse_date can return both pandas and standard-library datetime objects,
# which is presumably why element-wise apply is used rather than `.dt`.
shark_df['month'] = shark_df["datetime_column"].apply(lambda x: x.month if pd.notnull(x) else None)
shark_df['year'] = shark_df["datetime_column"].apply(lambda x: x.year if pd.notnull(x) else None)

# Month numbers grouped by season name.
# NOTE(review): this grouping puts March-May in spring for every row,
# whatever the country - confirm that is intended.
season_mapping = {
    "Spring": [3, 4, 5],
    "Summer": [6, 7, 8],
    "Autumn": [9, 10, 11],
    "Winter": [12, 1, 2],
}


def what_season(month):
    """Return the season whose month list contains *month*, or None if none does."""
    matches = (label for label, month_numbers in season_mapping.items() if month in month_numbers)
    return next(matches, None)

# Label every row with the season returned by what_season for its 'month'.
shark_df['season'] = shark_df['month'].apply(what_season)

# Print the date-related columns side by side to check the parsing result.
print(shark_df[['date', 'datetime_column', 'string_column', 'year', 'month', 'season']])

In [None]:
# Preview the first rows of the frame at this point of the clean-up.
shark_df.head()

In [None]:
# The ten most frequent values of the 'country' column.
country_frequencies = shark_df['country'].value_counts()
top_10_countries = country_frequencies.head(10).index

# Keep only the rows whose country is one of those ten.
is_top_country = shark_df['country'].isin(top_10_countries)
shark_df = shark_df[is_top_country]

In [None]:
def _title_case_country(name):
    """Capitalise each word of a country name.

    Values equal to 'usa' in any letter case are returned unchanged, as are
    non-string values (e.g. NaN), which previously raised AttributeError.
    """
    if not isinstance(name, str) or name.lower() == 'usa':
        return name
    return ' '.join(word.capitalize() for word in name.split())


# Capitalize names in 'country' except for 'usa' and handle two-word countries
shark_df['country'] = shark_df['country'].apply(_title_case_country)

print(shark_df)

In [None]:
# Helper columns from the date parsing, plus the raw 'date' column, are
# removed here.
columns_to_remove = ['date', 'datetime_column', 'string_column', 'month']

# drop() returns a new frame without these columns; rebind shark_df to it.
shark_df = shark_df.drop(columns_to_remove, axis=1)

print(shark_df)

In [None]:
# Remove rows without an activity: first the missing values, then the
# strings that are empty or whitespace-only (the blank-string filter was
# previously applied twice with identical effect).
shark_df = shark_df[shark_df['activity'].notna()]
shark_df = shark_df[shark_df['activity'].str.strip() != '']

print(shark_df)


In [None]:
# to_csv() returns None when it is given a file path, so its result must not
# be used as the final frame. Bind the name to the DataFrame, then write it.
shark_final_df = shark_df
shark_final_df.to_csv('shark_final_df.csv', index=False)
shark_final_df