In [1]:
import re
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('Tier5_logentries-export-2024-08-20.csv',low_memory = False) 

In [3]:
# Columns to keep
columns_to_keep = ['DateTime', 'ID', 'LogBook ID', 'Latitude', 'Longitude', 'Ship Sightings']

# Drop columns except for the ones to keep
df = df[columns_to_keep]

In [7]:
# create a dataframe of usable data
df['usable'] = (df["Latitude"].notna() & df["Longitude"].notna() & df["Ship Sightings"].notna())
df_u = df[df['usable']==True].copy()

In [8]:
# convert 'Logbook ID' to string
ship_id = df_u['LogBook ID'].map(str)
# identify 
ship_names = ship_id.apply(lambda x : re.findall(r'\b[A-Z][a-z]+\b',x))

In [9]:
ship_names_flat = [item for sublist in ship_names for item in sublist]
unique_ships = list(pd.Series(ship_names_flat).unique()) 

stopwords = ['Bark', 'Ship',"St"]
for word in list(unique_ships):
    if word in stopwords:
        unique_ships.remove(word)
        
print(unique_ships)

['Abigail', 'Abraham', 'Barker', 'Addison', 'Adeline', 'Gibbs', 'Alaska', 'Albion', 'Alto', 'American', 'Arab', 'Arbella', 'Atlantic', 'Awashonks', 'Barclay', 'Barnstable', 'Bartholomew', 'Gosnold', 'Beaver', 'Benjamin', 'Rush', 'Bertha', 'Blackstone', 'Brandt', 'Brunswick', 'Comstock', 'Schooner', 'Cadmus', 'Cambria', 'Catalpa', 'Chariot', 'Charles', 'Drew', 'Phelps', 'Morgan', 'Henry', 'Clarice', 'Clarkson', 'Clifford', 'Wayne', 'Columbia', 'Coronet', 'Courier', 'Courser', 'Desdemona', 'Draco', 'Eagle', 'Emerald', 'Eunice', 'Adams', 'Brig', 'Express', 'Fabius', 'Fortune', 'Franklin', 'Gage', 'Phillips', 'General', 'Jackson', 'George', 'Clinton', 'Gideon', 'Howland', 'Good', 'Return', 'Governor', 'Carver', 'Greyhound', 'Harrison', 'Kneeland', 'Hercules', 'Hibernia', 'Homer', 'India', 'Isabella', 'Israel', 'Java', 'John', 'Dawson', 'Lafayette', 'Lancer', 'Leonidas', 'Lexington', 'Margaret', 'Maria', 'Martha', 'Mary', 'Frazier', 'Mitchell', 'Matilda', 'Sears', 'Mechanic', 'Medford', 'Me

In [12]:
#copy dataframe
df_u = df_u.copy()
# find maches in ship sightings from unique ships
df_u['matches'] = np.where(df_u['Ship Sightings'].str.contains('|'.join(unique_ships)), 'match', 'not match')
# isolate entries where there is no match within the dataset
df_nm = df_u[df_u['matches'] == 'not match'].copy()

In [25]:
# regex pattern to identify capitalized words as well as variations of the word Capt.
pattern = r"\b[A-Z][a-zA-Z]+\b|\bCapt\.\b|\bCapt\b|\bCaptain\b|\bCap\b"

# mark variations of "captain" to skip the word immediately following as to not confuse captain name with ship name
skip_next_trigger_words = {"Capt", "Capt.", "Captain", "Cap"}

# Words to skip explicitly (common capitalized words that show up in dataset that are unlikely to be ship names)
skip_words = {"Bark","Ship","Saw","Several","One","Spoke","Gammed"}


def clean_extracted(text):
    words = re.findall(pattern, text)  # Extract all words
    filtered_words = []
    skip_next = False

    for word in words:
        if skip_next:
            skip_next = False
            continue
        if word in skip_next_trigger_words: # skip words after captain
            skip_next = True
        elif word not in skip_words: # append words not in skip words
            filtered_words.append(word)

    return " ".join(filtered_words)  # Convert list to a single string before returning

# Apply function to extract ship names
df_nm.loc[: , "Extracted Ships"] = df_nm["Ship Sightings"].astype(str).apply(clean_extracted)

# Remove rows where Extracted Ships is empty
df_filtered = df_nm[df_nm["Extracted Ships"].str.strip() != ""]

# Display filtered DataFrame
df_filtered.head(25)

Unnamed: 0,DateTime,ID,LogBook ID,Latitude,Longitude,Ship Sightings,usable,matches,Extracted Ships
1036,1847-12-02 12:00:00,19631.0,Abigail (ship) 1847-1850,17.05,-25.466667,"""the Bark (Hope) in company""",True,not match,Hope
1037,1847-12-03 12:00:00,19632.0,Abigail (ship) 1847-1850,16.633333,-26.066667,"""the Bark (Hope) in company""",True,not match,Hope
1040,1847-12-06 12:00:00,19646.0,Abigail (ship) 1847-1850,14.866667,-24.733333,"""in company with the Bark (Hope)""",True,not match,Hope
1236,1848-06-29 12:00:00,20275.0,Abigail (ship) 1847-1850,30.806667,166.4,"""the Capt of the Jappan (Japan) of Nantucket.....",True,not match,Japan Nantucket
1259,1848-07-22 12:00:00,20336.0,Abigail (ship) 1847-1850,31.25,164.833333,"""spoke the ship Howard of Nantuckett Capt Bunk...",True,not match,Howard Nantuckett
1263,1848-07-26 12:00:00,20344.0,Abigail (ship) 1847-1850,30.458333,166.758333,"""the Narragansett of Nantuckett passed us""",True,not match,Narragansett Nantuckett
1267,1848-07-30 12:00:00,20372.0,Abigail (ship) 1847-1850,30.655556,169.488889,"""in company with the Narragansett""",True,not match,Narragansett
1269,1848-08-01 12:00:00,20377.0,Abigail (ship) 1847-1850,30.861111,170.827778,"""in company with the Narragansett""",True,not match,Narragansett
1270,1848-08-02 12:00:00,20379.0,Abigail (ship) 1847-1850,30.963889,171.497222,"""gamed with N (Narragansett)""",True,not match,Narragansett
1271,1848-08-03 12:00:00,20381.0,Abigail (ship) 1847-1850,31.066667,172.166667,"""in company with the N (Narragansett)""",True,not match,Narragansett


In [35]:
# Get value counts for each ship
value_counts = df_filtered['Extracted Ships'].value_counts()

# Filter ships mentioned more than 4 times
ships4 = value_counts[value_counts > 4]

# Print each ship and its count
for ship, count in ships4.items():
    print(f"{ship}: {count}")

# Print the total number of ships matching the filter
print(f"\nTotal ships mentioned more than 4 times: {len(ships4)}")

Lucas: 175
Maine: 33
BK Platina: 32
Milton: 30
Liverpool New Bedford: 21
BK Islander: 20
The Uncas: 18
Pacific: 18
Alpha: 16
London Packet: 16
China: 15
Logan: 14
Cherokee: 14
The Spartan: 13
BK Maine: 13
Liverpool: 13
Elizabeth: 12
Kathleen: 12
The Iris: 12
Narragansett: 12
Mercury New Bedford: 11
Mercator: 11
The Nickerson: 11
Alfred New Bedford: 11
Lucas Cicero: 11
Two: 11
Phoenix: 10
The Holmes: 10
BK Derwent Hunter: 10
Congress: 10
Gay Head: 10
Falcon: 10
Petrel: 10
The Christopher Mitchel: 10
Alexander Coffin: 10
Zephyr: 9
Lady Blackwood: 9
The Ganges: 9
Hope: 9
Edward New Bedford: 9
The Benj Tucker: 9
Milwood New Bedford: 9
Origon: 9
Griffin: 9
Lima: 8
Selma: 8
Wanderer: 8
Morning Star: 8
Elizabeth London: 8
Noble: 8
Platina: 8
Solon Mattapoisett: 8
Otranto: 8
Friendship: 8
Cortes: 8
Millwood: 8
Mariner: 8
Parker Cook Provincetown: 7
Florida: 7
Phocion: 7
Barque Kathleen: 7
Hope New Bedford: 7
Waverly New Bedford: 7
Herald: 7
Rose: 7
Simmons: 7
Columbus: 7
Andrew Hicks: 7
Draper

In [25]:
# Save the unique ship names to a text file
with open("unique_ships.txt", "w") as file:
    for ship in ships4:
        file.write(ship + "\n")