<a href="https://colab.research.google.com/github/calebarr/AIS/blob/ais_cleaning/AIS_Downloader_TeamShared.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# AIS Data Downloader - Google Colab Version (Team-Ready)

# Mount Google Drive
from google.colab import drive
import os
from datetime import datetime, timedelta
import requests

print("Mounting Google Drive...")
drive.mount('/content/drive')

# Define the AIS downloader function
def download_ais_data(start_date_str, end_date_str, save_folder):
    os.makedirs(save_folder, exist_ok=True)
    print(f"Files will be saved to: {save_folder}")

    # Convert string dates to datetime objects
    start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
    end_date = datetime.strptime(end_date_str, "%Y-%m-%d")

    # Generate the date list
    dates_to_download = [
        (start_date + timedelta(days=i)).strftime("%Y-%m-%d")
        for i in range((end_date - start_date).days + 1)
    ]

    # Download each file in the date range
    for date_str in dates_to_download:
        date_obj = datetime.strptime(date_str, "%Y-%m-%d")
        filename = f"AIS_{date_obj.strftime('%Y_%m_%d')}.zip"
        url = f"https://coast.noaa.gov/htdata/CMSP/AISDataHandler/2020/{filename}"

        print(f"Downloading {filename}...")
        response = requests.get(url)

        if response.status_code == 200:
            file_path = os.path.join(save_folder, filename)
            with open(file_path, "wb") as f:
                f.write(response.content)
            print(f"Saved: {file_path}")
        else:
            print(f"Failed to download {filename} (HTTP {response.status_code})")

# Now call the function with your chosen parameters
download_ais_data(
    start_date_str="2020-01-05",
    end_date_str="2020-01-06",
    save_folder="/content/drive/My Drive/SIADS_593/assets/raw_data"
)


Mounting Google Drive...
Mounted at /content/drive
Files will be saved to: /content/drive/My Drive/SIADS_593/assets/raw_data
Downloading AIS_2020_01_05.zip...
Saved: /content/drive/My Drive/SIADS_593/assets/raw_data/AIS_2020_01_05.zip
Downloading AIS_2020_01_06.zip...
Saved: /content/drive/My Drive/SIADS_593/assets/raw_data/AIS_2020_01_06.zip


In [2]:
!ls -lh "/content/drive/My Drive/SIADS_593/assets/raw_data"


total 1.3G
-rw------- 1 root root 252M May 21 06:02 AIS_2020_01_01.zip
-rw------- 1 root root 250M May 21 06:02 AIS_2020_01_02.zip
-rw------- 1 root root 256M May 21 06:02 AIS_2020_01_03.zip
-rw------- 1 root root 245M May 27 06:55 AIS_2020_01_05.zip
-rw------- 1 root root 254M May 27 06:55 AIS_2020_01_06.zip


In [4]:
import zipfile
import pandas as pd

def inspect_zip_sample(zip_path, nrows=5):

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        csv_files = [name for name in zip_ref.namelist() if name.endswith('.csv')]
        print("Files inside zip:", csv_files)

        with zip_ref.open(csv_files[0]) as f:
            df = pd.read_csv(f, nrows=nrows)

    print("Columns:", df.columns.tolist())
    return df



In [6]:
sample_df = inspect_zip_sample("/content/drive/My Drive/SIADS_593/assets/raw_data/AIS_2020_01_05.zip")
sample_df.head()

Files inside zip: ['AIS_2020_01_05.csv']
Columns: ['MMSI', 'BaseDateTime', 'LAT', 'LON', 'SOG', 'COG', 'Heading', 'VesselName', 'IMO', 'CallSign', 'VesselType', 'Status', 'Length', 'Width', 'Draft', 'Cargo', 'TransceiverClass']


Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,VesselName,IMO,CallSign,VesselType,Status,Length,Width,Draft,Cargo,TransceiverClass
0,367005220,2020-01-05T00:00:00,30.06265,-93.35127,5.1,227.0,511.0,,,,,0,,,,,A
1,215560000,2020-01-05T00:00:00,34.64002,-121.01854,16.2,334.2,342.0,,,,,0,,,,,A
2,636018760,2020-01-05T00:00:00,28.78695,-116.24974,12.8,343.4,345.0,,,,,0,,,,,A
3,368099320,2020-01-05T00:00:00,30.05953,-93.38022,4.4,270.4,511.0,,,,,12,,,,,A
4,368071490,2020-01-05T00:00:00,41.67222,-88.02777,0.0,300.1,247.0,,,,,15,,,,,A


In [10]:
import zipfile
import pandas as pd
import os

def load_csvs_from_zips(folder_path="/content/drive/My Drive/SIADS_593/assets/raw_data/"):

    all_dfs = []

    for file in os.listdir(folder_path):
        if file.endswith('.zip'):
            zip_path = os.path.join(folder_path, file)
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                for csv_file in zip_ref.namelist():
                    if csv_file.endswith('.csv'):
                        with zip_ref.open(csv_file) as f:
                            df = pd.read_csv(f)
                            all_dfs.append(df)

    return pd.concat(all_dfs, ignore_index=True)


In [11]:
df = load_csvs_from_zips()  # uses the default folder



In [27]:
def extract_first_arrivals_near_port(df):
    # Define vessel type of interest
    vessel_filter = (
        df['VesselType'].isin(range(70, 90)) |  # Cargo + Tankers
        df['VesselType'].isin([30, 52])         # Fishing + Tug
    )

    # Define LAT/LON bounding box for Port of LA
    location_filter = (
        df['LAT'].between(33.6, 33.8) &
        df['LON'].between(-118.3, -118.1)
    )

    # Apply both filters
    port_df = df[vessel_filter & location_filter].copy()

    # Sort and get first record per MMSI (vessel)
    port_df['BaseDateTime'] = pd.to_datetime(port_df['BaseDateTime'])
    first_arrivals = (
        port_df.sort_values(by=['MMSI', 'BaseDateTime'])
               .drop_duplicates(subset='MMSI', keep='first')
    )

    return first_arrivals


In [28]:
import zipfile
import pandas as pd

def process_zip_in_chunks(zip_path, chunksize=100_000):
    all_cleaned_chunks = []

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Find the CSV file inside the zip
        csv_files = [name for name in zip_ref.namelist() if name.endswith('.csv')]
        if not csv_files:
            print(f"No CSV found in {zip_path}")
            return pd.DataFrame()  # Return empty DataFrame if nothing found

        # Open the first CSV
        with zip_ref.open(csv_files[0]) as f:
            reader = pd.read_csv(f, chunksize=chunksize)

            for i, chunk in enumerate(reader):
                print(f"Processing chunk {i} with {len(chunk)} rows")

                # Pull extract_first_arrivals_near_port function
                first_arrivals = extract_first_arrivals_near_port(chunk)

                # Append cleaned chunk to list
                all_cleaned_chunks.append(first_arrivals)

    # Combine all filtered chunks into one DataFrame
    return pd.concat(all_cleaned_chunks, ignore_index=True)



In [31]:
arrival_df_df = process_zip_in_chunks("/content/drive/My Drive/SIADS_593/assets/raw_data/AIS_2020_01_05.zip")
arrival_df.head()


Processing chunk 0 with 100000 rows
Processing chunk 1 with 100000 rows
Processing chunk 2 with 100000 rows
Processing chunk 3 with 100000 rows
Processing chunk 4 with 100000 rows
Processing chunk 5 with 100000 rows
Processing chunk 6 with 100000 rows
Processing chunk 7 with 100000 rows
Processing chunk 8 with 100000 rows
Processing chunk 9 with 100000 rows
Processing chunk 10 with 100000 rows
Processing chunk 11 with 100000 rows
Processing chunk 12 with 100000 rows
Processing chunk 13 with 100000 rows
Processing chunk 14 with 100000 rows
Processing chunk 15 with 100000 rows
Processing chunk 16 with 100000 rows
Processing chunk 17 with 100000 rows
Processing chunk 18 with 100000 rows
Processing chunk 19 with 100000 rows
Processing chunk 20 with 100000 rows
Processing chunk 21 with 100000 rows
Processing chunk 22 with 100000 rows
Processing chunk 23 with 100000 rows
Processing chunk 24 with 100000 rows
Processing chunk 25 with 100000 rows
Processing chunk 26 with 100000 rows
Processing 

Unnamed: 0,MMSI,VesselName,BaseDateTime,LAT,LON,SOG,COG,Heading,IMO,CallSign,VesselType,Status,Length,Width,Draft,Cargo,TransceiverClass
0,211327410,TOKYO EXPRESS,2020-01-05 00:04:00,33.76906,-118.23045,0.0,255.0,255.0,IMO9193290,DGTX,70.0,5.0,294.0,32.0,13.6,,A
1,212348000,HYUNDAI HONGKONG,2020-01-05 00:02:06,33.75465,-118.2315,0.0,39.0,70.0,IMO9305661,5BZL,70.0,5.0,303.0,40.0,14.0,70.0,A
2,218092000,HANOVER EXPRESS,2020-01-05 18:09:19,33.60215,-118.29297,9.6,83.8,85.0,IMO9343716,DFGX2,70.0,0.0,336.0,42.0,14.6,79.0,A
3,235060304,ENERGY CENTURION,2020-01-05 00:01:13,33.70358,-118.20347,0.0,3.0,266.0,IMO9387970,2AJU4,80.0,1.0,228.0,32.0,14.4,89.0,A
4,235076283,PARAMOUNT HYDRA,2020-01-05 00:02:32,33.68998,-118.12927,0.0,335.7,251.0,IMO9453999,2CWC3,80.0,1.0,249.0,44.0,14.8,,A
