In [13]:
import io
import os
import shutil
import tarfile
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google.cloud import storage
from google.oauth2 import service_account

In [2]:
def download_from_drive(file_id, destination_file_name):
    """Download a Google Drive file to a local path using a service account.

    Credentials are loaded from ``credentials.json`` (resolved relative to the
    current working directory) with the read-only Drive scope.

    Args:
        file_id: Google Drive ID of the file to fetch.
        destination_file_name: Local path the file content is written to.
            Opened in ``'wb'`` mode, so an existing file is overwritten.
    """
    credentials = service_account.Credentials.from_service_account_file(
        'credentials.json',
        scopes=['https://www.googleapis.com/auth/drive.readonly']
    )

    drive_service = build('drive', 'v3', credentials=credentials)
    request = drive_service.files().get_media(fileId=file_id)

    # The context manager closes the handle even if a chunk request raises,
    # so later cells never read the archive through a leaked, still-open file.
    with io.FileIO(destination_file_name, 'wb') as file_stream:
        downloader = MediaIoBaseDownload(file_stream, request)

        done = False
        while not done:
            # next_chunk() returns (status, done); only the flag is needed.
            _, done = downloader.next_chunk()

In [3]:
def _is_within_directory(root, candidate):
    """Return True when ``candidate`` is ``root`` itself or located below it."""
    try:
        return os.path.commonpath([root, candidate]) == root
    except ValueError:
        # Raised e.g. for paths on different drives: cannot be inside root.
        return False


def extract_tar_bz2(source_file, destination_folder):
    """Extract a bzip2-compressed tar archive into ``destination_folder``.

    Every member path is validated before anything is written, so an archive
    containing absolute names or ``..`` components cannot place files outside
    the destination.

    Args:
        source_file: Path to the ``.tar.bz2`` archive.
        destination_folder: Directory the members are extracted into.

    Raises:
        tarfile.TarError: If a member would be extracted outside
            ``destination_folder`` (or the archive cannot be read).
    """
    dest_root = os.path.realpath(destination_folder)

    with tarfile.open(source_file, 'r:bz2') as tar:
        members = tar.getmembers()
        for member in members:
            target = os.path.realpath(os.path.join(dest_root, member.name))
            if not _is_within_directory(dest_root, target):
                raise tarfile.TarError(
                    'Refusing to extract %r outside of %r'
                    % (member.name, destination_folder)
                )

        if hasattr(tarfile, 'data_filter'):
            # Interpreters with extraction filters: also reject unsafe links,
            # device files, and similar special members.
            tar.extractall(destination_folder, members=members, filter='data')
        else:
            tar.extractall(destination_folder, members=members)

In [4]:
def upload_to_gcs(bucket_name, source_folder, destination_folder):
    """Upload every file under ``source_folder`` to a GCS bucket.

    The directory layout below ``source_folder`` is preserved beneath
    ``destination_folder`` in the bucket.

    NOTE(review): a later cell in this notebook defines another
    ``upload_to_gcs`` with a (bucket_name, source_file, destination_file)
    signature; once that cell runs, it replaces this definition.

    Args:
        bucket_name: Name of the target GCS bucket.
        source_folder: Local directory that is walked recursively.
        destination_folder: Object-name prefix inside the bucket.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    for root, _, files in os.walk(source_folder):
        for file in files:
            file_path = os.path.join(root, file)
            relative_path = os.path.relpath(file_path, source_folder)
            # Object names use '/' as the delimiter regardless of the local
            # OS; the replace is a no-op wherever os.sep is already '/'.
            blob_name = os.path.join(destination_folder, relative_path).replace(os.sep, '/')

            blob = bucket.blob(blob_name)
            blob.upload_from_filename(file_path)

In [11]:
def upload_to_gcs(bucket_name, source_file, destination_file):
    """Upload a single local file to a GCS bucket.

    Args:
        bucket_name: Name of the target GCS bucket.
        source_file: Path of the local file to upload.
        destination_file: Object name the file is stored under in the bucket.
    """
    target_bucket = storage.Client().bucket(bucket_name)
    target_bucket.blob(destination_file).upload_from_filename(source_file)

def extract_and_upload_tar_bz2(source_file, destination_folder, bucket_name):
    """Stream a ``.tar.bz2`` archive to GCS one file at a time.

    Each regular file in the archive is extracted into ``destination_folder``,
    uploaded via ``upload_to_gcs`` and then deleted locally, so only one
    extracted file is staged on disk at any moment.

    Directory components are dropped: every object is named after the
    member's base name only, so archive members that share a base name map
    to the same object name.

    Args:
        source_file: Path to the ``.tar.bz2`` archive.
        destination_folder: Local staging directory. It is deleted and
            recreated at the start of the call.
        bucket_name: Name of the target GCS bucket.
    """
    # Start from an empty staging folder.
    if os.path.exists(destination_folder):
        shutil.rmtree(destination_folder)
    os.makedirs(destination_folder)

    with tarfile.open(source_file, 'r:bz2') as tar:
        for member in tar:
            if not member.isfile():
                continue

            # Keep only the file name so the file lands directly in the
            # staging folder and the object name carries no directories.
            member.name = os.path.basename(member.name)
            tar.extract(member, path=destination_folder)

            local_file = os.path.join(destination_folder, member.name)
            try:
                upload_to_gcs(bucket_name, local_file, member.name)
            finally:
                # Remove the staged copy even when the upload fails, so a
                # failed run does not leave extracted data behind.
                os.remove(local_file)

In [5]:
# Google Drive file ID of the archive and the local file name it is downloaded to
file_id = '1cjY6HsHaSZuLVHywIxD5xQqng33J5S2b'
destination_file_name = 'downloaded_data.tar.bz2'

# Target GCS bucket name and the intended destination folder path in the bucket.
# NOTE(review): `destination_folder` is not passed to any call in the cells
# visible here — confirm whether uploads were meant to be placed under it.
bucket_name = 'fake-news-data'
destination_folder = 'fakeddit'

# Local folder handed to extract_and_upload_tar_bz2 as its extraction target
extracted_folder = 'extracted_files'

In [None]:
# Step 1: fetch the archive from Google Drive to the local file.
download_from_drive(
    file_id=file_id,
    destination_file_name=destination_file_name,
)

# Step 2: extract the archive file by file and upload each one to the bucket.
extract_and_upload_tar_bz2(
    source_file=destination_file_name,
    destination_folder=extracted_folder,
    bucket_name=bucket_name,
)