In [17]:
import pandas as pd
from pathlib import Path

# Get the path to the data file
data_path = Path('..') / 'data' / 'raw' / 'california_fish_landings_1928_2002.csv'
ca_all_fish_landings = pd.read_csv(data_path)

# Get unique fish species and sort alphabetically
species = sorted(ca_all_fish_landings['fish'].unique())

# Print the list in a clean format
print(f"List of {len(species)} unique species:\n")
print("=" * 50)
for i, fish in enumerate(species, 1):
    print(f"{i:3d}. {fish}")

"""
# Moving forward exports to .csv files are commented out for future referencing
species_df = pd.DataFrame(species, columns=['Species'])
species_df.to_csv('california_fish_species_list.csv', index=True, index_label='Number')
print("Species list saved to 'california_fish_species_list.csv'") 
"""

List of 341 unique species:

  1. Abalone, Black
  2. Abalone, Flat
  3. Abalone, Green
  4. Abalone, Pink
  5. Abalone, Pinto
  6. Abalone, Red
  7. Abalone, Threaded
  8. Abalone, Unspecified
  9. Abalone, White
 10. Anchovy, Deepbody
 11. Anchovy, Northern
 12. Barnacle
 13. Barracuda, California
 14. Bass, Barred Sand
 15. Bass, Giant Sea
 16. Bass, Kelp
 17. Bass, Rock
 18. Bass, Spotted Sand
 19. Bass, Striped
 20. Blackfish, Sacramento
 21. Blacksmith
 22. Bonefish
 23. Bonito, Pacific
 24. Box Crab
 25. Butterfish, Pacific
 26. Cabezon
 27. Cabrilla, Spotted
 28. Carp
 29. Catfish
 30. Chiton
 31. Clam, California Jackknife
 32. Clam, Common Washington
 33. Clam, Gaper
 34. Clam, Native Littleneck
 35. Clam, Nothern Razor
 36. Clam, Pismo
 37. Clam, Purple
 38. Clam, Rosy Razor
 39. Clam, Soft Shelled
 40. Clam, Unspecified
 41. Cod, Pacific
 42. Corbina, California
 43. Corvina, Shortfin
 44. Crab, Brown Rock
 45. Crab, Claws
 46. Crab, Dungeness
 47. Crab, King
 48. Crab, Pel

'\n# Moving forward exports to .csv files are commented out for future referencing\nspecies_df = pd.DataFrame(species, columns=[\'Species\'])\nspecies_df.to_csv(\'california_fish_species_list.csv\', index=True, index_label=\'Number\')\nprint("Species list saved to \'california_fish_species_list.csv\'") \n'

In [18]:
# Create some basic statistics about species naming patterns
print("\nSpecies Categories Analysis:")
print("=" * 50)

# Count species by certain categories
rockfish_count = sum(1 for s in species if 'Rockfish' in s)
shark_count = sum(1 for s in species if 'Shark' in s)
tuna_count = sum(1 for s in species if 'Tuna' in s)
salmon_count = sum(1 for s in species if 'Salmon' in s)

print(f"\nRockfish species: {rockfish_count}")
print(f"Shark species: {shark_count}")
print(f"Tuna species: {tuna_count}")
print(f"Salmon species: {salmon_count}")


Species Categories Analysis:

Rockfish species: 58
Shark species: 23
Tuna species: 8
Salmon species: 7


In [19]:
ca_all_fish_landings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2148300 entries, 0 to 2148299
Data columns (total 5 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   time      object
 1   year      int64 
 2   fish      object
 3   port      object
 4   landings  int64 
dtypes: int64(2), object(3)
memory usage: 82.0+ MB


In [20]:
crustaceans = [
    "barnacle",
    "box crab",
    "crab, claws",
    "crab, dungeness",
    "crab, king",
    "crab, pelagic red",
    "crab, rock",
    "crab, brown rock",
    "crab, red rock",
    "crab, yellow rock",
    "crab, sand",
    "crab, shore",
    "crab, spider",
    "crab, tanner",
    "crayfish",
    "lobster, california spiny", 
    "prawn, golden",
    "prawn, ridgeback",
    "prawn, spotted",
    "prawn, unspecified",
    "shrimp, bay",
    "shrimp, brine",
    "shrimp, coonstripe",
    "shrimp, ghost",
    "shrimp, pacific ocean",
    "shrimp, red rock",
    "shrimp, unspecified",
    "crustacean, unspecified"
]

echinoderms = [
    "cucumber, sea",
    "echinoderm, unspecified",
    "sea stars",
    "urchin, lytechinus",
    "urchin, purple sea",
    "urchin, red sea",
    "urchin, unspecified"
]

mollusks = [
    "abalone, black",
    "abalone, flat",
    "abalone, green",
    "abalone, pink",
    "abalone, white", 
    "abalone, threaded",
    "abalone, pinto",
    "abalone, red",
    "abalone, unspecified",
    "chiton",
    "clam, gaper",
    "clam, california jackknife",
    "clam, native littleneck",
    "clam, northern razor",
    "clam, purple",
    "clam, pismo",
    "clam, rosy razor",
    "clam, soft-shelled",
    "clam, common washington", 
    "clam, unspecified",
    "limpet, unspecified",
    "mollusk, unspecified", 
    "mussel",
    "octopus, unspecified",
    "oyster, california",
    "oyster, eastern",
    "oyster, european flat",
    "oyster, giant pacific",
    "oyster, unspecified",
    "scallop, unspecified",
    "sea hares",
    "sea slug",
    "snail, sea",
    "squid, jumbo",
    "squid, market",
    "whelk"
]

In [21]:
# Combine all shellfish genera/phyla lists into one
shellfish = crustaceans + echinoderms + mollusks

# Convert shellfish list to lowercase
shellfish_lower = [x.lower() for x in shellfish]

# Create new DataFrame, converting 'fish' column to lowercase first
ca_all_fish_landings['fish'] = ca_all_fish_landings['fish'].str.lower()
ca_all_shellfish_landings = ca_all_fish_landings[ca_all_fish_landings['fish'].isin(shellfish_lower)].copy()

# Print info about the new DataFrame
print(f"Original DataFrame rows: {len(ca_all_fish_landings)}")
print(f"Shellfish DataFrame rows: {len(ca_all_shellfish_landings)}")
print("\nShellfish DataFrame info:")
print(ca_all_shellfish_landings.info())

Original DataFrame rows: 2148300
Shellfish DataFrame rows: 428400

Shellfish DataFrame info:
<class 'pandas.core.frame.DataFrame'>
Index: 428400 entries, 0 to 2104199
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   time      428400 non-null  object
 1   year      428400 non-null  int64 
 2   fish      428400 non-null  object
 3   port      428400 non-null  object
 4   landings  428400 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 19.6+ MB
None


In [22]:
# Check that just the shellfish species have been moved to the new Shellfish DataFrame 
# Rename 'fish' to 'species'

ca_all_shellfish_landings = ca_all_shellfish_landings.rename(columns={'fish':'species'})

ca_all_shellfish_landings['species'].unique()


array(['abalone, black', 'abalone, flat', 'abalone, green',
       'abalone, pink', 'abalone, pinto', 'abalone, red',
       'abalone, threaded', 'abalone, unspecified', 'abalone, white',
       'barnacle', 'box crab', 'chiton', 'clam, california jackknife',
       'clam, common washington', 'clam, gaper',
       'clam, native littleneck', 'clam, pismo', 'clam, purple',
       'clam, rosy razor', 'clam, unspecified', 'crab, brown rock',
       'crab, claws', 'crab, dungeness', 'crab, king',
       'crab, pelagic red', 'crab, red rock', 'crab, rock', 'crab, sand',
       'crab, shore', 'crab, spider', 'crab, tanner', 'crab, yellow rock',
       'crayfish', 'crustacean, unspecified', 'cucumber, sea',
       'limpet, unspecified', 'lobster, california spiny',
       'mollusk, unspecified', 'mussel', 'octopus, unspecified',
       'oyster, california', 'oyster, eastern', 'oyster, european flat',
       'oyster, giant pacific', 'oyster, unspecified', 'prawn, golden',
       'prawn, ridgebac

In [23]:
# Specify more common fisheries for investigation - 
abalone = [
    "abalone, black",
    "abalone, flat",
    "abalone, green",
    "abalone, pink",
    "abalone, white",
    "abalone, pinto",
    "abalone, red",
    "abalone, threaded",
    "abalone, unspecified"
]

clams = [
    "clam, gaper",
    "clam, california jackknife",
    "clam, native littleneck",
    "clam, pismo",
    "clam, purple",
    "clam, rosy razor",
    "clam, common washington",
    "clam, unspecified"
]

crabs = [
    "crab, dungeness",
    "crab, rock",
    "crab, brown rock",
    "crab, red rock",
    "crab, yellow rock",
    "crab, king",
    "crab, spider",
    "crab, tanner"
]

oysters = [
    "oyster, california",
    "oyster, eastern",
    "oyster, european flat",
    "oyster, giant pacific",
    "oyster, unspecified"
]

shrimp_prawn = [
    "shrimp, pacific ocean",
    "shrimp, bay",
    "shrimp, red rock",
    "prawn, ridgeback",
    "prawn, spotted",
    "prawn, golden",
    "shrimp, unspecified",
    "prawn, unspecified"
]

In [24]:
# Combine all common shellfish groups
common_shellfish = abalone + clams + crabs + oysters + shrimp_prawn

# Convert common shellfish list to lowercase
common_shellfish_lower = [x.lower() for x in shellfish]

In [25]:
# Rename the first DataFrame (with all shellfish species)
all_shellfish_landings = ca_all_fish_landings[ca_all_fish_landings['fish'].isin(shellfish_lower)].copy()

# Create new DataFrame with only common shellfish species
ca_shellfish_landings = ca_all_fish_landings[ca_all_fish_landings['fish'].isin(common_shellfish_lower)].copy()



# Print info about both DataFrames
print(f"Original DataFrame rows: {len(ca_all_fish_landings)}")
print(f"All Shellfish DataFrame rows: {len(all_shellfish_landings)}")
print(f"Common Shellfish DataFrame rows: {len(ca_shellfish_landings)}")
print("\nCommon Shellfish DataFrame info:")
print(ca_shellfish_landings.info())

# Optionally save to CSV
# all_shellfish_landings.to_csv('all_shellfish_landings.csv', index=False)
# ca_shellfish_landings.to_csv('ca_shellfish_landings2.csv', index=False)

#TODO:? issue with all_shellfish_landings not lowered and failing to match with the common_shellfish_lower? 
# Working off saved ca_shellfish_landings.csv for next notebook

Original DataFrame rows: 2148300
All Shellfish DataFrame rows: 428400
Common Shellfish DataFrame rows: 428400

Common Shellfish DataFrame info:
<class 'pandas.core.frame.DataFrame'>
Index: 428400 entries, 0 to 2104199
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   time      428400 non-null  object
 1   year      428400 non-null  int64 
 2   fish      428400 non-null  object
 3   port      428400 non-null  object
 4   landings  428400 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 19.6+ MB
None
