<a href="https://colab.research.google.com/github/calebarr/AIS/blob/ais_cleaning/AIS_Downloader_TeamShared.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [100]:
# AIS Data Downloader - Google Colab Version (Team-Ready)

# Mount Google Drive
from google.colab import drive
import os
from datetime import datetime, timedelta
import requests
import pandas as pd
import zipfile

print("Mounting Google Drive...")
drive.mount('/content/drive')

# Define the AIS downloader function
def download_ais_data(start_date_str, end_date_str, save_folder):
    os.makedirs(save_folder, exist_ok=True)
    print(f"Files will be saved to: {save_folder}")

    # Convert string dates to datetime objects
    start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
    end_date = datetime.strptime(end_date_str, "%Y-%m-%d")


    # Download each file in the date range
    for i in range((end_date - start_date).days + 1):
        date_obj = start_date + timedelta(days=i)
        filename = f"AIS_{date_obj.strftime('%Y_%m_%d')}.zip"
        url = f"https://coast.noaa.gov/htdata/CMSP/AISDataHandler/2020/{filename}"
        file_path = os.path.join(save_folder, filename)

        if not os.path.exists(file_path):
            print(f"Downloading {filename}...")
            response = requests.get(url)
            if response.status_code == 200:
                with open(file_path, "wb") as f:
                    f.write(response.content)
                print(f"Saved: {file_path}")
            else:
                print(f"Failed: {response.status_code}")
        else:
            print(f"Already downloaded: {filename}")

# Now call the function with your chosen parameters
download_ais_data(
    start_date_str="2020-01-01",
    end_date_str="2020-01-02",
    save_folder="/content/drive/My Drive/SIADS_593/assets/raw_data"
)


Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Files will be saved to: /content/drive/My Drive/SIADS_593/assets/raw_data
Already downloaded: AIS_2020_01_01.zip
Already downloaded: AIS_2020_01_02.zip


In [101]:
!ls -lh "/content/drive/My Drive/SIADS_593/assets/raw_data"


total 502M
-rw------- 1 root root 252M May 28 04:52 AIS_2020_01_01.zip
-rw------- 1 root root 250M May 28 04:52 AIS_2020_01_02.zip
drwx------ 2 root root 4.0K May 28 04:51 old_data


In [102]:
def process_zip_in_chunks(zip_path, chunksize=100_000):
    import zipfile, pandas as pd

    all_cleaned_chunks = []
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        csv_files = [name for name in zip_ref.namelist() if name.endswith('.csv')]
        if not csv_files:
            return pd.DataFrame()

        with zip_ref.open(csv_files[0]) as f:
            reader = pd.read_csv(f, chunksize=chunksize)
            for chunk in reader:
                cleaned = extract_first_arrivals_anywhere(chunk)
                all_cleaned_chunks.append(cleaned)

    return pd.concat(all_cleaned_chunks, ignore_index=True)


In [103]:
def extract_first_arrivals_anywhere(df):
    # Filter by relevant vessel types
    df = df[
        df["VesselType"].isin(range(70, 90)) | df["VesselType"].isin([30, 52])
    ].copy()

    # Drop rows with missing or invalid coordinates
    df = df.dropna(subset=["LAT", "LON"])
    df = df[(df["LAT"] != 0) & (df["LON"] != 0)]

    # Convert timestamps
    df["BaseDateTime"] = pd.to_datetime(df["BaseDateTime"], errors='coerce')

    df = df.dropna(subset=["BaseDateTime"])

    # Sort and get the first ping per MMSI
    first_arrivals = (
        df.sort_values(["MMSI", "BaseDateTime"])
          .drop_duplicates("MMSI", keep="first")
    )

    # Assign port names
    return assign_port_names(first_arrivals)


In [104]:
def assign_port_names(df, buffer=0.5):
    PORT_REGIONS = {
        "Los Angeles": (33.6, 33.9, -118.5, -118.0),
        "Long Beach": (33.7, 33.9, -118.25, -118.15),
        "Oakland": (37.7, 37.85, -122.35, -122.2),
        "Seattle": (47.5, 47.7, -122.4, -122.2),
        "New York": (40.6, 40.8, -74.1, -73.9),
        "Norfolk": (36.8, 37.1, -76.4, -76.2),
        "Savannah": (32.0, 32.2, -81.2, -80.8),
        "Charleston": (32.7, 32.9, -80.0, -79.8),
        "Miami": (25.75, 25.85, -80.2, -80.0),
        "Port Everglades": (26.05, 26.1, -80.15, -80.1),
        "Baltimore": (39.2, 39.3, -76.6, -76.5),
        "Philadelphia": (39.9, 40.0, -75.2, -75.1),
        "Houston": (29.6, 29.8, -95.2, -94.8),
        "New Orleans": (29.9, 30.1, -90.1, -89.9),
        "Jacksonville": (30.3, 30.5, -81.7, -81.3),
        "San Diego": (32.7, 32.8, -117.2, -117.1),
        "Boston": (42.3, 42.4, -71.1, -70.9),
        "Anchorage": (61.1, 61.3, -149.95, -149.8),
        "Honolulu": (21.3, 21.4, -157.9, -157.8),
        "Unknown": (None, None, None, None),  # fallback
    }

    def get_port_name(lat, lon):
        for port, bounds in PORT_REGIONS.items():
            if None in bounds:
                continue  # skip 'Unknown'
            min_lat, max_lat, min_lon, max_lon = bounds
            if (min_lat - buffer) <= lat <= (max_lat + buffer) and \
               (min_lon - buffer) <= lon <= (max_lon + buffer):
                return port
        return "Unknown"

    df["Port Name"] = df.apply(lambda row: get_port_name(row["LAT"], row["LON"]), axis=1)
    return df



In [105]:
import pandas as pd

test_df = pd.concat([
    process_zip_in_chunks("/content/drive/My Drive/SIADS_593/assets/raw_data/AIS_2020_01_01.zip"),
    process_zip_in_chunks("/content/drive/My Drive/SIADS_593/assets/raw_data/AIS_2020_01_02.zip")
])





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



In [106]:
first_arrivals_df = extract_first_arrivals_anywhere(test_df)
first_arrivals_df["Port Name"].value_counts()



Unnamed: 0_level_0,count
Port Name,Unnamed: 1_level_1
Unknown,2158
Houston,123
Seattle,107
Norfolk,63
Honolulu,62
New Orleans,57
Los Angeles,52
Oakland,46
New York,45
Miami,27


In [107]:
import plotly.express as px

# Sample 500 points to make rendering faster, or skip .sample() to show all
sample_df = first_arrivals_df.sample(n=500, random_state=42)

fig = px.scatter_geo(
    sample_df,
    lat='LAT',
    lon='LON',
    color='Port Name',
    hover_name='VesselType',
    title='First Arrivals: Vessel Locations by Port',
    projection="natural earth"
)

fig.update_layout(geo=dict(
    scope='world',
    showland=True,
    landcolor="lightgray",
))

fig.show()


In [108]:
def process_zip_in_chunks_filtered(zip_path, chunksize=100_000):
    import zipfile, pandas as pd

    all_filtered_chunks = []
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        csv_files = [name for name in zip_ref.namelist() if name.endswith('.csv')]
        if not csv_files:
            return pd.DataFrame()

        with zip_ref.open(csv_files[0]) as f:
            reader = pd.read_csv(f, chunksize=chunksize)
            for chunk in reader:
                # Filter by relevant vessel types
                filtered_chunk = chunk[
                    chunk["VesselType"].isin(range(70, 90)) | chunk["VesselType"].isin([30, 52])
                ].copy()

                # Drop rows with missing or invalid coordinates
                filtered_chunk = filtered_chunk.dropna(subset=["LAT", "LON"])
                filtered_chunk = filtered_chunk[(filtered_chunk["LAT"] != 0) & (filtered_chunk["LON"] != 0)]

                # Convert timestamps
                filtered_chunk["BaseDateTime"] = pd.to_datetime(filtered_chunk["BaseDateTime"], errors='coerce')
                filtered_chunk = filtered_chunk.dropna(subset=["BaseDateTime"])

                all_filtered_chunks.append(filtered_chunk)

    return pd.concat(all_filtered_chunks, ignore_index=True)

In [109]:
import pandas as pd

# Process each zip file to get filtered data
all_filtered_data = pd.concat([
    process_zip_in_chunks_filtered("/content/drive/My Drive/SIADS_593/assets/raw_data/AIS_2020_01_01.zip"),
    process_zip_in_chunks_filtered("/content/drive/My Drive/SIADS_593/assets/raw_data/AIS_2020_01_02.zip")
])

# Now find the true first arrivals from the combined filtered data
first_arrivals_df = (
    all_filtered_data.sort_values(["MMSI", "BaseDateTime"])
    .drop_duplicates("MMSI", keep="first")
)

# Finally, assign port names to the first arrivals
first_arrivals_df = assign_port_names(first_arrivals_df)

# Now you can analyze first_arrivals_df
first_arrivals_df["Port Name"].value_counts()


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



Unnamed: 0_level_0,count
Port Name,Unnamed: 1_level_1
Unknown,2158
Houston,123
Seattle,107
Norfolk,63
Honolulu,62
New Orleans,57
Los Angeles,52
Oakland,46
New York,45
Miami,27


In [110]:
import plotly.express as px

# Sample 500 points to make rendering faster, or skip .sample() to show all
sample_df = first_arrivals_df.sample(n=500, random_state=42)

fig = px.scatter_geo(
    sample_df,
    lat='LAT',
    lon='LON',
    color='Port Name',
    hover_name='VesselType',
    title='First Arrivals: Vessel Locations by Port',
    projection="natural earth"
)

fig.update_layout(geo=dict(
    scope='world',
    showland=True,
    landcolor="lightgray",
))

fig.show()

In [111]:
import pandas as pd

# 1. Create a sample DataFrame
data = {
    'MMSI': [1, 2, 3, 4, 5, 6, 7],
    'BaseDateTime': ['2023-01-01 10:00:00', '2023-01-01 10:05:00', '2023-01-01 10:10:00', '2023-01-01 10:15:00', '2023-01-01 10:20:00', '2023-01-01 10:25:00', '2023-01-01 10:30:00'],
    'LAT': [34.0, 34.1, 34.2, 34.3, 34.4, 34.5, 34.6],
    'LON': [-118.0, -118.1, -118.2, -118.3, -118.4, -118.5, -118.6],
    'VesselType': [70, 85, 30, 52, 20, 60, 91] # 20, 60, and 91 should be dropped
}
sample_df = pd.DataFrame(data)

# Convert BaseDateTime to datetime objects
sample_df["BaseDateTime"] = pd.to_datetime(sample_df["BaseDateTime"], errors='coerce')


# 2. Check unique values in the original DataFrame
print("Unique Vessel Types in Original DataFrame:")
print(sample_df["VesselType"].unique())

# Define the filtering logic
allowed_vessel_types = range(70, 90)
specific_vessel_types = [30, 52]

# 3. Apply the filtering logic
filtered_df = sample_df[
    sample_df["VesselType"].isin(allowed_vessel_types) | sample_df["VesselType"].isin(specific_vessel_types)
].copy()

# You can also include the other filtering steps from your function
# filtered_df = filtered_df.dropna(subset=["LAT", "LON"])
# filtered_df = filtered_df[(filtered_df["LAT"] != 0) & (filtered_df["LON"] != 0)]
# filtered_df = filtered_df.dropna(subset=["BaseDateTime"])


# 4. Check unique values in the filtered DataFrame
print("\nUnique Vessel Types in Filtered DataFrame:")
print(filtered_df["VesselType"].unique())

# You can also check the shape of the DataFrames to see how many rows were removed
print("\nOriginal DataFrame shape:", sample_df.shape)
print("Filtered DataFrame shape:", filtered_df.shape)

Unique Vessel Types in Original DataFrame:
[70 85 30 52 20 60 91]

Unique Vessel Types in Filtered DataFrame:
[70 85 30 52]

Original DataFrame shape: (7, 5)
Filtered DataFrame shape: (4, 5)


In [112]:
import plotly.express as px
import pandas as pd

# Plotting just the unknowns to see where they are going


if 'first_arrivals_df' in locals() and not first_arrivals_df.empty:

    # Filter the DataFrame to include only "Unknown" ports
    unknown_arrivals_df = first_arrivals_df[first_arrivals_df['Port Name'] == 'Unknown'].copy()

    # Check if there are any unknown arrivals to plot
    if not unknown_arrivals_df.empty:

        # Create the scatter geo plot for unknown arrivals
        fig = px.scatter_geo(
            unknown_arrivals_df,
            lat='LAT',
            lon='LON',
            title='First Arrivals with "Unknown" Port Names',
            projection="natural earth",
            hover_name='VesselType' # You can still hover to see vessel type
        )

        # Customize the map layout
        fig.update_layout(geo=dict(
            scope='world', # You can change this scope if you want to focus on a specific region
            showland=True,
            landcolor="lightgray",
        ))

        # Show the plot
        fig.show()
    else:
        print("No 'Unknown' port names found in the first_arrivals_df.")
else:
    print("first_arrivals_df is not available or is empty. Please run the previous cells to create it.")