### Identifying data distribution and hyperparameter tuning for model development

In [1]:
# Import libraries and modules
import pandas as pd
import numpy as np
from datetime import datetime

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [2]:
# Load the shellfish landings records from the CSV file in the working directory
shellfish_landings = pd.read_csv(filepath_or_buffer='ca_shellfish_landings.csv')

In [3]:
# Preview the first five rows to confirm the file loaded with the expected columns
shellfish_landings.head(n=5)

Unnamed: 0,time,year,fish,port,landings
0,1928-01-16T00:00:00Z,1928,"abalone, black",All,0
1,1928-02-16T00:00:00Z,1928,"abalone, black",All,0
2,1928-03-16T00:00:00Z,1928,"abalone, black",All,0
3,1928-04-16T00:00:00Z,1928,"abalone, black",All,0
4,1928-05-16T00:00:00Z,1928,"abalone, black",All,0


In [4]:
# The raw file calls the species column 'fish'; use the name 'species' from here on
shellfish_landings = shellfish_landings.rename({'fish': 'species'}, axis='columns')

In [5]:
# Inspect the column dtypes; 'time' is still a plain object (string) column at this point
shellfish_landings.dtypes

time        object
year         int64
species     object
port        object
landings     int64
dtype: object

In [6]:
# Parse the 'time' strings (which carry a trailing 'Z') into timezone-aware UTC datetimes
shellfish_landings['time'] = pd.to_datetime(shellfish_landings['time'], utc=True)

# Derive the calendar month number from the parsed timestamp
shellfish_landings['month'] = shellfish_landings['time'].dt.month

# Keep only rows that have non-zero landings AND belong to an individual port
# (the 'All' port rows are excluded so the analysis focuses on single ports)
shellfish_landings = shellfish_landings[
    (shellfish_landings['landings'] != 0) & (shellfish_landings['port'] != 'All')
]

In [8]:
# Species names belonging to each of the more common fisheries under investigation.
# Entries use the "<family>, <variety>" spelling that appears in the 'species' column.
abalone = [
    f"abalone, {variety}"
    for variety in (
        "black", "flat", "green", "pink", "white",
        "pinto", "red", "threaded", "unspecified",
    )
]

clams = [
    f"clam, {variety}"
    for variety in (
        "gaper", "california jackknife", "native littleneck", "pismo",
        "purple", "rosy razor", "common washington", "unspecified",
    )
]

crabs = [
    f"crab, {variety}"
    for variety in (
        "dungeness", "rock", "brown rock", "red rock",
        "yellow rock", "king", "spider", "tanner",
    )
]

oysters = [
    f"oyster, {variety}"
    for variety in (
        "california", "eastern", "european flat", "giant pacific", "unspecified",
    )
]

# Shrimp and prawns share one fishery list, so each entry names its own family
shrimp_prawn = [
    f"{family}, {variety}"
    for family, variety in (
        ("shrimp", "pacific ocean"),
        ("shrimp", "bay"),
        ("shrimp", "red rock"),
        ("prawn", "ridgeback"),
        ("prawn", "spotted"),
        ("prawn", "golden"),
        ("shrimp", "unspecified"),
        ("prawn", "unspecified"),
    )
]

In [9]:
def get_species_group(species):
    """Return the fishery group label for a single species name.

    The species is looked up in the module-level lists ``abalone``, ``clams``,
    ``crabs``, ``oysters`` and ``shrimp_prawn`` in that order; the label of the
    first list containing it is returned.

    Parameters
    ----------
    species : str
        Species name exactly as it appears in the 'species' column.

    Returns
    -------
    str
        One of 'Abalone', 'Clams', 'Crabs', 'Oysters', 'Shrimp & Prawns',
        or 'Other' when the species is in none of the lists.
    """
    # Ordered (label, members) pairs; the first match wins
    group_lookup = (
        ('Abalone', abalone),
        ('Clams', clams),
        ('Crabs', crabs),
        ('Oysters', oysters),
        ('Shrimp & Prawns', shrimp_prawn),
    )
    for label, members in group_lookup:
        if species in members:
            return label
    return 'Other'

In [10]:
# Tag every record with the fishery group its species belongs to
shellfish_landings['species_group'] = shellfish_landings['species'].map(get_species_group)

In [15]:
# Mean landings over all records sharing the same year, month, species group and port,
# broadcast back onto each of those records
shellfish_landings['monthly_mean_landing'] = (
    shellfish_landings
    .groupby(by=['year', 'month', 'species_group', 'port'])['landings']
    .transform('mean')
)

In [16]:
def categorize_fishing_success(df, group_col='species', value_col='landings',
                               output_col='fishing_success'):
    """Label each record's landings relative to the other records in its group.

    Within every ``group_col`` group the ``value_col`` values are split into
    quartiles labelled 'poor', 'fair', 'good' and 'excellent' (lowest to
    highest). When a group's quartile edges are not all distinct (too few or
    heavily tied values) four labels cannot be assigned; that group falls back
    to a two-way split in which values strictly above the group median are
    'good' and all others are 'poor'. Missing values stay missing on both paths.

    Parameters
    ----------
    df : pandas.DataFrame
        Records to label. Modified in place: ``output_col`` is added/overwritten.
    group_col : str, default 'species'
        Column whose values define the groups that are ranked independently.
    value_col : str, default 'landings'
        Numeric column that is binned.
    output_col : str, default 'fishing_success'
        Name of the column that receives the labels.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``df``.
    """
    quartile_labels = ['poor', 'fair', 'good', 'excellent']

    def _label_group(values):
        try:
            # With four fixed labels, qcut only succeeds when all five quartile
            # edges are distinct; a dropped duplicate edge leaves fewer bins than
            # labels and raises ValueError, which sends the group to the fallback.
            return pd.qcut(values, q=4, labels=quartile_labels, duplicates='drop')
        except ValueError:
            # Fallback: simple above-median / not-above-median classification
            median = values.median()
            binary = pd.Series(
                np.where(values > median, 'good', 'poor'), index=values.index
            )
            # NaN > median is False, which would mark missing values 'poor';
            # keep them missing so both paths treat NaN the same way.
            return binary.where(values.notna())

    df[output_col] = df.groupby(group_col)[value_col].transform(_label_group)

    return df


# Label every record, then show how many records received each label per species
shellfish_landings = categorize_fishing_success(df=shellfish_landings)

print(shellfish_landings.groupby(by=['species', 'fishing_success']).size())

species              fishing_success
abalone, black       excellent          163
                     fair               163
                     good               162
                     poor               163
abalone, flat        good                 4
                                       ... 
shrimp, red rock     poor                35
shrimp, unspecified  excellent          162
                     fair               162
                     good               162
                     poor               162
Length: 143, dtype: int64


Unnamed: 0,time,year,species,port,landings,month,monthly_mean_landing,species_group,fishing_success
1604,1986-09-16 00:00:00+00:00,1986,"abalone, black",Eureka,5722,9,5722.0,Abalone,good
1605,1986-10-16 00:00:00+00:00,1986,"abalone, black",Eureka,198,10,198.0,Abalone,poor
1626,1988-07-16 00:00:00+00:00,1988,"abalone, black",Eureka,483,7,483.0,Abalone,poor
1637,1989-06-16 00:00:00+00:00,1989,"abalone, black",Eureka,560,6,560.0,Abalone,poor
1638,1989-07-16 00:00:00+00:00,1989,"abalone, black",Eureka,501,7,501.0,Abalone,poor


In [17]:
def save_fishing_success_summary(df, filename='fishing_success_summary.txt'):
    """Write a plain-text report of the fishing-success label counts.

    The report has three sections: record counts per
    ('species_group', 'fishing_success') pair, record counts per
    ('species', 'fishing_success') pair, and overall totals (number of
    records, distinct species groups and distinct species).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'species', 'species_group' and
        'fishing_success'.
    filename : str, default 'fishing_success_summary.txt'
        Path of the text file to create; an existing file is overwritten.

    Returns
    -------
    None
    """
    def _report_chunks():
        # Yielded lazily so each piece is computed and written in report order
        # Report title
        yield "Fishing Success Distribution Summary\n"
        yield "=================================\n\n"

        # Counts per species group
        yield "By Species Group:\n"
        yield "-----------------\n"
        yield df.groupby(['species_group', 'fishing_success']).size().to_string()

        # Counts per individual species
        yield "\n\nBy Individual Species:\n"
        yield "--------------------\n"
        yield df.groupby(['species', 'fishing_success']).size().to_string()

        # Overall totals
        yield "\n\nTotal Observations:\n"
        yield "-----------------\n"
        yield f"Total records: {len(df)}\n"
        yield f"Number of species groups: {df['species_group'].nunique()}\n"
        yield f"Number of individual species: {df['species'].nunique()}\n"

    with open(filename, 'w') as report_file:
        report_file.writelines(_report_chunks())

# Write the label-count report to the default output file and confirm on screen
save_fishing_success_summary(df=shellfish_landings)
print("Summary has been saved to 'fishing_success_summary.txt'")

Summary has been saved to 'fishing_success_summary.txt'
