In [58]:
from googleapiclient.discovery import build
from google.oauth2.credentials import Credentials
from typing import Any, List, Dict, Union
import pandas as pd

In [20]:
# Drive folder ID for each year, keyed by the year as a string.
# NOTE(review): there is no '2023' entry and the keys are not in
# chronological order — confirm both are intentional.
drive_folders = {
    "2021": "0AEyh-w2iKkdPUk9PVA",
    "2018": "0ADODK3buTEqsUk9PVA",
    "2019": "0AKpWxivcQgJbUk9PVA",
    "2020": "0AGqVTkC6YXbFUk9PVA",
    "2022": "0AFNNPsrSaDEGUk9PVA",
    "2024": "0ADYtJk1I0uR_Uk9PVA",
    "2025": "0AGhLXRXVGCy1Uk9PVA",
}

In [116]:
# Authorized-user token file and the read-only Drive scope requested with it.
token_path = "token.json"
scopes = ["https://www.googleapis.com/auth/drive.readonly"]

# Folder to crawl: the entry registered for 2018.
folder_id = drive_folders['2018']

# Load the stored user credentials, then build a Drive v3 client from them.
creds = Credentials.from_authorized_user_file(token_path, scopes)
service = build("drive", "v3", credentials=creds)

In [117]:
# MIME types that are kept when listing Drive files; any other type is skipped.
ALLOWED_TYPES = {
    # Google-native editor formats
    "application/vnd.google-apps.presentation",
    "application/vnd.google-apps.spreadsheet",
    "application/vnd.google-apps.document",
    # Uploaded binary formats (PDF, .xlsx, .docx)
    "application/pdf",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
}

In [118]:
# Sanity check: binary PDFs are among the MIME types that will be kept.
"application/pdf" in ALLOWED_TYPES

True

In [119]:
def fetch_files_recursive(
    service: Any, folder_id: str, include_trashed: bool = True
) -> List[Dict[str, Union[str, List[str]]]]:
    """Collect every allowed file below a Drive folder, descending into subfolders.

    The children of ``folder_id`` are listed page by page (following
    ``nextPageToken`` until the listing is exhausted), so folders with more
    children than fit in one response are no longer truncated.

    Args:
        service: Drive v3 client exposing ``files().list(...).execute()``.
        folder_id: ID of the folder whose descendants are collected.
        include_trashed: When ``False``, ``trashed = false`` is added to the
            query so trashed items are not returned. The default ``True``
            keeps the previous behaviour of not filtering on trash state.

    Returns:
        A flat list of file resources (``id``, ``name``, ``mimeType``,
        ``parents``, ``trashed``) whose MIME type is in ``ALLOWED_TYPES``.
        Folders themselves are never included, only their contents.

    Side effects:
        Prints every file that is kept.
    """
    query = f"'{folder_id}' in parents"
    if not include_trashed:
        query += " and trashed = false"

    collected = []
    page_token = None
    while True:
        response = (
            service.files()
            .list(
                q=query,
                pageSize=1000,
                includeItemsFromAllDrives=True,
                supportsAllDrives=True,
                fields="nextPageToken, files(id, name, mimeType, parents, trashed)",
                # None on the first request; afterwards the token of the next page.
                pageToken=page_token,
            )
            .execute()
        )

        for file in response.get("files", []):
            if file["mimeType"] == "application/vnd.google-apps.folder":
                # Subfolder: its contents are appended in place, depth-first.
                collected.extend(
                    fetch_files_recursive(service, file["id"], include_trashed)
                )
            elif file["mimeType"] in ALLOWED_TYPES:
                print(file)
                collected.append(file)

        # A missing/empty token means this was the last page of the listing.
        page_token = response.get("nextPageToken")
        if not page_token:
            break

    return collected

In [120]:
# Crawl the selected Drive folder; the helper also prints every file it keeps.
files_2018 = fetch_files_recursive(service, folder_id)

{'mimeType': 'application/vnd.google-apps.presentation', 'parents': ['15hSlZp-4v3RSI_P5QWcdlTfeibofLlFi'], 'id': '1Vv66Zx0prJoAm06gDjzKD-Hwu29FwFh2T1qGFcsP8sk', 'name': 'Aaltoes | Deck', 'trashed': False}
{'mimeType': 'application/vnd.google-apps.presentation', 'parents': ['15hSlZp-4v3RSI_P5QWcdlTfeibofLlFi'], 'id': '15dLirx6WhNDIFZMbD0pURwnq-S-L0S8nHeTBvXh3EZU', 'name': 'Ecosystem| Deck', 'trashed': False}
{'mimeType': 'application/vnd.google-apps.presentation', 'parents': ['15hSlZp-4v3RSI_P5QWcdlTfeibofLlFi'], 'id': '1jwyWTCQhrmQ7B-dTWv-nkmXQaovxGYa-R-TVECHTqho', 'name': 'Aaltoes | Deck', 'trashed': False}
{'mimeType': 'application/vnd.google-apps.document', 'parents': ['15hSlZp-4v3RSI_P5QWcdlTfeibofLlFi'], 'id': '1o4YOUlHO9ie_Yp0PA4UwctosNURURJtUuQhG90rJIds', 'name': 'SUTD', 'trashed': False}
{'mimeType': 'application/vnd.google-apps.spreadsheet', 'parents': ['12cj81Zqs4cSmX3U3yppirfUOo182m-Hx'], 'id': '1PpqeYYsbyoo84mtdYohPOFgwIyLJB4KehlezWnnRepM', 'name': 'Aaltoes trip to Californ

In [121]:
import pandas as pd

# One row per file record returned by the crawl above (not sample data).
df = pd.DataFrame(files_2018.copy())

# Extensions that are accepted as-is when a Drive name already ends with them.
_KNOWN_EXTENSIONS = frozenset({'docx', 'xlsx', 'pptx', 'pdf'})

# Extension appended for each supported MIME type. Google-native types map to
# the Office format with the same role.
_MIME_TO_EXT = {
    'application/vnd.google-apps.document': 'docx',
    'application/vnd.google-apps.spreadsheet': 'xlsx',
    'application/vnd.google-apps.presentation': 'pptx',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
    'application/pdf': 'pdf',
}


def get_extension(file):
    """Build the local filename ``<id>_<name>[.<ext>]`` for a Drive file record.

    Despite its name, this returns a complete filename, not just an extension.

    Args:
        file: Mapping-like record (dict or DataFrame row) with the keys
            ``'id'``, ``'name'`` and ``'mimeType'``.

    Returns:
        ``id + "_" + name``, where ``name`` is the part of the Drive name after
        the last ``/``. If that name does not already end in ``.docx``,
        ``.xlsx``, ``.pptx`` or ``.pdf`` (compared case-insensitively), the
        extension mapped from the MIME type is appended.

    Raises:
        KeyError: If an extension must be appended and the MIME type has no
            entry in the mapping.
    """
    # Drive names may contain '/', which would otherwise be read as a path
    # separator on disk; only the last segment is kept.
    base_name = file['name'].split('/')[-1]

    # A real extension needs a dot with text on both sides. A bare name such
    # as "pdf" must not be mistaken for a file that already has one.
    stem, dot, suffix = base_name.rpartition('.')
    if dot and stem and suffix.lower() in _KNOWN_EXTENSIONS:
        return file['id'] + "_" + base_name

    return file['id'] + "_" + base_name + "." + _MIME_TO_EXT[file['mimeType']]

# Apply get_extension to each row and overwrite the 'name' column with the
# local filename it returns (the file id, an underscore, then the Drive name).
df['name'] = df.apply(get_extension, axis=1)


In [None]:
import io
import os
from googleapiclient.http import MediaIoBaseDownload
from googleapiclient.errors import HttpError

# Google-native MIME types referenced by the export tables below.
_GOOGLE_DOC = 'application/vnd.google-apps.document'
_GOOGLE_SHEET = 'application/vnd.google-apps.spreadsheet'
_GOOGLE_SLIDES = 'application/vnd.google-apps.presentation'

# Primary export target per native type, as (export MIME type, file extension).
# Presentations are deliberately absent: they are always exported as PDF.
export_mime_types = dict([
    (
        _GOOGLE_DOC,
        ('application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'docx'),
    ),
    (
        _GOOGLE_SHEET,
        ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'xlsx'),
    ),
])

# Alternative export target: every native type can be exported as PDF.
alternative_export_mime_types = {
    native_type: ('application/pdf', 'pdf')
    for native_type in (_GOOGLE_DOC, _GOOGLE_SHEET, _GOOGLE_SLIDES)
}

def _save_media(request, destination, label, verb="Download"):
    """Run a Drive media/export request to completion and write it to `destination`.

    The payload is buffered in memory and written only after the last chunk
    arrived, so a failed transfer never leaves a partial file on disk.
    `label` and `verb` are only used in the progress messages.
    """
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print(f"{verb} {int(status.progress() * 100)}% for file '{label}'.")

    # Create the directory structure if it doesn't exist (race-free).
    directory = os.path.dirname(destination)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(destination, "wb") as f:
        f.write(buffer.getvalue())


# Base output folder. It is derived from the year whose Drive folder was
# crawled (`folder_id`), so files are not stored under another year's folder.
# The previously hard-coded "2021" is kept only as a fallback for a
# `folder_id` that is not registered in `drive_folders`.
output_dir = next(
    (year for year, drive_id in drive_folders.items() if drive_id == folder_id),
    "2021",
)

for _, row in df.iterrows():
    file_id = row['id']
    output_filename = row['name']  # Local filename prepared for this Drive file.
    mimeType = row['mimeType']

    try:
        if mimeType == 'application/vnd.google-apps.presentation':
            # Always export presentations as PDF.
            export_mime, ext = alternative_export_mime_types[mimeType]
            # Remove any existing extension and add PDF extension.
            base_name = os.path.splitext(output_filename)[0]
            output_filename = f"{base_name}.{ext}"
            request = service.files().export_media(fileId=file_id, mimeType=export_mime)
        elif mimeType in export_mime_types:
            export_mime, ext = export_mime_types[mimeType]
            # Append the extension unless the name already ends with ".<ext>".
            # The dot is part of the check so that a name merely ending in the
            # letters of the extension is not mistaken for one.
            if not output_filename.lower().endswith(f".{ext}"):
                output_filename = f"{output_filename}.{ext}"
            request = service.files().export_media(fileId=file_id, mimeType=export_mime)
        else:
            # For non-native (binary) files.
            request = service.files().get_media(fileId=file_id)

        full_path = os.path.join(output_dir, output_filename)
        # If the file already exists, skip downloading.
        if os.path.exists(full_path):
            print(f"File '{full_path}' already exists. Skipping download.")
            continue

        _save_media(request, full_path, output_filename)

    except HttpError as error:
        # Decode leniently: a malformed error body must not mask the HTTP error.
        error_content = (
            error.content.decode(errors="replace") if hasattr(error, 'content') else str(error)
        )
        size_limit_hit = "exportSizeLimitExceeded" in error_content
        # Presentations were already requested as PDF, so they have no fallback.
        can_fall_back = (
            mimeType in alternative_export_mime_types
            and mimeType != 'application/vnd.google-apps.presentation'
        )
        if size_limit_hit and can_fall_back:
            print(f"Primary export failed for '{output_filename}' due to size limit. Trying alternative export as PDF.")
            alt_export_mime, alt_ext = alternative_export_mime_types[mimeType]
            base_name = os.path.splitext(output_filename)[0]
            output_filename = f"{base_name}.{alt_ext}"
            alt_full_path = os.path.join(output_dir, output_filename)
            if os.path.exists(alt_full_path):
                print(f"File '{alt_full_path}' already exists. Skipping alternative export.")
                continue
            try:
                request = service.files().export_media(fileId=file_id, mimeType=alt_export_mime)
                _save_media(request, alt_full_path, output_filename, verb="Alternative download")
            except HttpError as alt_error:
                print(f"Alternative export also failed for file '{output_filename}' (ID: {file_id}): {alt_error}")
        else:
            print(f"Error downloading file '{output_filename}' (ID: {file_id}): {error}")
