In [1]:
# Import packages
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from threading import Thread
from queue import Queue
import os
import re
import polars as pl
from googleapiclient.http import MediaIoBaseDownload
from io import BytesIO
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
import pickle


In [None]:
pip install google-api-python-client google-auth google-auth-oauthlib google-auth-httplib2

In [2]:
# Scopes for accessing Google Drive
SCOPES = ['https://www.googleapis.com/auth/drive']

# Authenticate and create the service object
def authenticate_drive_api(token_path='token.pickle',
                           client_secrets_path='client_secrets.json',
                           scopes=None):
    """Build an authenticated Google Drive v3 service object.

    Credentials are loaded from ``token_path`` when that file exists. If they
    are missing or invalid they are refreshed when possible, otherwise the
    interactive OAuth flow is run in a local browser. Whenever new or
    refreshed credentials are obtained they are written back to
    ``token_path``.

    Args:
        token_path: Pickle file used to cache the credentials between runs.
        client_secrets_path: OAuth client secrets JSON file used by the
            interactive flow.
        scopes: OAuth scopes to request; defaults to the module-level
            ``SCOPES``.

    Returns:
        The object returned by ``build('drive', 'v3', credentials=creds)``.
    """
    # Local import keeps this cell self-contained; google.auth is already a
    # dependency of the notebook (see the Request import).
    from google.auth.exceptions import RefreshError

    if scopes is None:
        scopes = SCOPES

    creds = None
    # Token file for saving the authentication
    if os.path.exists(token_path):
        # NOTE(security): pickle.load can execute arbitrary code. Only use a
        # token file that this notebook wrote itself.
        with open(token_path, 'rb') as token:
            creds = pickle.load(token)

    # If there are no (valid) credentials, refresh or re-authenticate
    if not creds or not creds.valid:
        refreshed = False
        if creds and creds.expired and creds.refresh_token:
            try:
                creds.refresh(Request())
                refreshed = True
            except RefreshError:
                # The refresh token was rejected (e.g. revoked or expired);
                # fall through to the interactive flow instead of crashing.
                refreshed = False
        if not refreshed:
            # The client secrets file is downloaded from the Google API Console
            flow = InstalledAppFlow.from_client_secrets_file(
                client_secrets_path, scopes)
            creds = flow.run_local_server(port=0)
        # Save the credentials for future use
        with open(token_path, 'wb') as token:
            pickle.dump(creds, token)
    return build('drive', 'v3', credentials=creds)

# Initialize the service object
service = authenticate_drive_api()


In [3]:
# Get the list of plain-text files in the folder.
# A single files.list call returns at most `pageSize` entries, so keep
# requesting pages until the API stops returning a nextPageToken; otherwise
# large folders are silently truncated.
folder_id = '14idmMBbM5xXZg4b61iINHbBTl2z4yLeD'
files = []
page_token = None
while True:
    response = service.files().list(
        q=f"'{folder_id}' in parents and mimeType='text/plain'",
        # nextPageToken must be requested explicitly when `fields` is set
        fields='nextPageToken, files(id, name)',
        pageSize=1000,
        pageToken=page_token
    ).execute()
    files.extend(response.get('files', []))
    page_token = response.get('nextPageToken')
    if not page_token:
        break

In [6]:
# Display the list of {'id', 'name'} entries collected from the Drive folder
files

[{'id': '1nXoFmhH9wB3HMXAxKKEERcATzAMrV43H',
  'name': 'WPAC_20_25_Vamco_WWLLN_Locations.txt'},
 {'id': '1kCWLJWnz9P0RC57l8g4fYhQYq34Sq06N',
  'name': 'WPAC_20_25_Vamco_Reduced_Trackfile.txt'},
 {'id': '12y-vgX3hMEnqCmXR4KGPLcMZeY-NkJug',
  'name': 'WPAC_20_22_Goni_WWLLN_Locations.txt'},
 {'id': '13K0Lw1cq8Os5IIW3zaawAt0ScmXI6yJ7',
  'name': 'WPAC_20_22_Goni_Reduced_Trackfile.txt'},
 {'id': '1Iq8MnfupqMQtDm92cBq7WxMUjEYiEj1v',
  'name': 'WPAC_20_14_Dolphin_Reduced_Trackfile.txt'},
 {'id': '1EC_xVORvUtkhnzossRBqNuDS8C4mKPXd',
  'name': 'WPAC_20_14_Dolphin_WWLLN_Locations.txt'},
 {'id': '10o9jVYWMkS-HKfom5EHLfwWCJYR1x_sH',
  'name': 'WPAC_20_5_Jangmi_WWLLN_Locations.txt'},
 {'id': '1bocWVsx8GBnS9F9nN0IPBnwArELyiMse',
  'name': 'WPAC_20_13_Noul_WWLLN_Locations.txt'},
 {'id': '1RrpzlFCS23BQH4ZosV5wLTN_OBg-s966',
  'name': 'WPAC_20_5_Jangmi_Reduced_Trackfile.txt'},
 {'id': '1w2FeFhEeIQ5woj15mgAUCtizElykqApt',
  'name': 'WPAC_20_13_Noul_Reduced_Trackfile.txt'},
 {'id': '1WdQ9EIXHt-dZOQxzOwui

In [8]:
# Process each file to add cyclone id
import os
import polars as pl
from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload
from io import BytesIO

# Directory to save the processed files locally
output_dir = "processed_files"
os.makedirs(output_dir, exist_ok=True)

# Process each file: download it, append the cyclone ID as an extra column,
# and write the result to `output_dir` under the same file name.
for file in files:
    file_id = file['id']
    file_name = file['name']

    # Cyclone ID = first three underscore-separated tokens of the file name,
    # e.g. 'WPAC_20_25_Vamco_WWLLN_Locations.txt' -> 'WPAC_20_25'
    prefix = '_'.join(file_name.split('_')[:3])

    # Download the file content into an in-memory buffer, chunk by chunk
    request = service.files().get_media(fileId=file_id)
    file_stream = BytesIO()
    downloader = MediaIoBaseDownload(file_stream, request)
    done = False
    while not done:
        _, done = downloader.next_chunk()  # progress status is not used
    file_stream.seek(0)

    # Parse the downloaded bytes directly (no decode/encode round trip) and
    # append the cyclone ID as a named constant column.
    df = pl.read_csv(file_stream, separator='\t', has_header=False)
    df = df.with_columns(pl.lit(prefix).alias("cyclone_id"))

    # Save the modified DataFrame locally. basename() keeps a remote name
    # containing path separators from escaping `output_dir`.
    output_file_path = os.path.join(output_dir, os.path.basename(file_name))
    df.write_csv(output_file_path, separator='\t', include_header=False)

    print(f"Processed and saved: {output_file_path}")



Processed and saved: processed_files\WPAC_20_25_Vamco_WWLLN_Locations.txt
Processed and saved: processed_files\WPAC_20_25_Vamco_Reduced_Trackfile.txt
Processed and saved: processed_files\WPAC_20_22_Goni_WWLLN_Locations.txt
Processed and saved: processed_files\WPAC_20_22_Goni_Reduced_Trackfile.txt
Processed and saved: processed_files\WPAC_20_14_Dolphin_Reduced_Trackfile.txt
Processed and saved: processed_files\WPAC_20_14_Dolphin_WWLLN_Locations.txt
Processed and saved: processed_files\WPAC_20_5_Jangmi_WWLLN_Locations.txt
Processed and saved: processed_files\WPAC_20_13_Noul_WWLLN_Locations.txt
Processed and saved: processed_files\WPAC_20_5_Jangmi_Reduced_Trackfile.txt
Processed and saved: processed_files\WPAC_20_13_Noul_Reduced_Trackfile.txt
Processed and saved: processed_files\WPAC_20_2_Nuri_WWLLN_Locations.txt
Processed and saved: processed_files\WPAC_20_2_Nuri_Reduced_Trackfile.txt
Processed and saved: processed_files\WPAC_20_12_Twelve_WWLLN_Locations.txt
Processed and saved: processe

In [9]:
import glob

# Directories for processed files and output
input_dir = "processed_files"
output_dir = "combined_files"
os.makedirs(output_dir, exist_ok=True)

# File patterns to combine
patterns = {
    "Reduced_Trackfile": os.path.join(input_dir, "*Reduced_Trackfile*.txt"),
    "WWLLN_Locations": os.path.join(input_dir, "*WWLLN_Locations*.txt")
}


def combine_files(file_paths, output_file_path):
    """Concatenate `file_paths`, in the given order, into `output_file_path`.

    Files are copied as raw bytes, so neither the text encoding nor the line
    endings are altered. A newline is appended after a part only when that
    part does not already end with one, so rows from consecutive files never
    run together and no blank lines are inserted between files. Empty parts
    contribute nothing. The output file is created even when `file_paths`
    is empty.

    Args:
        file_paths: Iterable of paths of the files to concatenate.
        output_file_path: Path of the combined file to (over)write.

    Returns:
        The number of input files read.
    """
    count = 0
    with open(output_file_path, "wb") as output_file:
        for file_path in file_paths:
            with open(file_path, "rb") as part:
                content = part.read()
            count += 1
            if not content:
                continue
            output_file.write(content)
            if not content.endswith(b"\n"):
                output_file.write(b"\n")
    return count


# Combine files based on patterns
for pattern_name, pattern_path in patterns.items():
    output_file_path = os.path.join(output_dir, f"Combined_{pattern_name}.txt")

    # Find all matching files; sort so the combined row order is reproducible
    # (glob returns files in arbitrary order).
    matching_files = sorted(glob.glob(pattern_path))
    print(f"Combining {len(matching_files)} files for pattern '{pattern_name}'...")

    # Write the combined content to a single file
    combine_files(matching_files, output_file_path)

    print(f"Combined file saved: {output_file_path}")


Combining 499 files for pattern 'Reduced_Trackfile'...
Combined file saved: combined_files\Combined_Reduced_Trackfile.txt
Combining 501 files for pattern 'WWLLN_Locations'...
Combined file saved: combined_files\Combined_WWLLN_Locations.txt
