In [2]:
!pip install google-cloud-storage

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [3]:
import os
from io import StringIO
import pandas as pd
from google.cloud import storage
from glob import glob
from tqdm.notebook import tqdm as bar

# Fallback location of the credentials file for the Google Cloud client.
# setdefault keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already set in
# the environment, so the notebook also works where this local path does not exist.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    r"/home/coler/projects/covid/covid_proj/creds.json",
)

In [4]:
def list_buckets():
    """Return the names of the buckets listed by a default storage client.

    Returns:
        list: The ``name`` attribute of every bucket yielded by
        ``storage.Client().list_buckets()``.
    """
    client = storage.Client()
    return [bucket.name for bucket in client.list_buckets()]

# Show the bucket names returned for the configured credentials.
list_buckets()

['covid-proj-data']

In [5]:
def load_file_to_pandas(bucket_name, file_name, **read_csv_kwargs):
    """Download a CSV object from a bucket and parse it into a DataFrame.

    Args:
        bucket_name: Name of the bucket that holds the object.
        file_name: Name (path inside the bucket) of the CSV object.
        **read_csv_kwargs: Optional keyword arguments forwarded unchanged to
            ``pandas.read_csv`` (for example ``index_col=0``). Omitting them
            gives the same result as before.

    Returns:
        pandas.DataFrame: The parsed contents of the object.
    """
    storage_client = storage.Client()
    blob = storage_client.bucket(bucket_name).blob(file_name)

    # download_as_string() is a deprecated alias; download_as_bytes() is the
    # supported call and returns the same raw bytes.
    raw_bytes = blob.download_as_bytes()

    # The payload is decoded as UTF-8 text before being handed to the parser.
    return pd.read_csv(StringIO(raw_bytes.decode("utf-8")), **read_csv_kwargs)

# Example: load one aggregated CSV from the bucket and preview its first rows.
load_file_to_pandas("covid-proj-data",'aggregated/Spain.csv').head()

Unnamed: 0,country,nuts_2_region,year,week,sick_talk_proportion,num_tweets
0,Spain,0,2016,W44,0.0,2
1,Spain,0,2016,W45,0.0,7
2,Spain,0,2016,W46,0.0,4
3,Spain,0,2016,W47,0.0,7
4,Spain,0,2016,W48,0.0,2


In [6]:
def upload_cs_file(bucket_name, source_file_name, destination_file_name):
    """Upload a local file to a bucket.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_name: Path of the local file to upload.
        destination_file_name: Object name to write inside the bucket.

    Returns:
        bool: ``True`` once the upload call has returned. Exceptions are not
        caught here.
    """
    client = storage.Client()
    target_blob = client.bucket(bucket_name).blob(destination_file_name)
    target_blob.upload_from_filename(source_file_name)
    return True

def upload_wrapper(source_file_name):
    """Upload one local file into the ``unaggregated/`` folder of the project bucket.

    The object keeps the base name of the local file. Any error is printed and
    swallowed so that a batch loop can carry on with the next file.

    Args:
        source_file_name: Path of the local file to upload.
    """
    # os.path.basename honours the platform's path separators, whereas
    # splitting on "/" keeps directory parts on backslash-separated paths.
    destination_file_name = f"unaggregated/{os.path.basename(source_file_name)}"
    try:
        upload_cs_file("covid-proj-data", source_file_name, destination_file_name)
    except Exception as e:
        # Best effort: name the file so a failure is traceable in a long batch.
        print(f"Failed to upload {source_file_name}: {e}")
    
if False:  # One-off step: flip to True to push the local scored CSV files to the GCS bucket
    # glob already returns a list, so it can be iterated directly.
    source_file_names = glob("./data/twitter_data_scored/*.csv")
    for source_file_name in bar(source_file_names, desc="Progress Uploading Files: "):
        upload_wrapper(source_file_name)
        

In [9]:
# Example download

def list_cs_files(bucket_name):
    """Return the names of the objects listed in a bucket.

    Args:
        bucket_name: Name of the bucket to list.

    Returns:
        list: The ``name`` attribute of every blob yielded by ``list_blobs()``.
    """
    bucket = storage.Client().bucket(bucket_name)
    return [blob.name for blob in bucket.list_blobs()]

source_file_names = list_cs_files("covid-proj-data")

# Keep the names that start with "unaggregated", leaving out the bare
# "unaggregated/" entry itself.
unaggregated_source_file_names = [
    name
    for name in source_file_names
    if name != "unaggregated/" and name.startswith("unaggregated")
]
unaggregated_source_file_names[:10]

['unaggregated/France_tweets_2016_11_01.csv',
 'unaggregated/France_tweets_2016_11_02.csv',
 'unaggregated/France_tweets_2016_11_06.csv',
 'unaggregated/France_tweets_2016_11_07.csv',
 'unaggregated/France_tweets_2016_11_08.csv',
 'unaggregated/France_tweets_2016_11_09.csv',
 'unaggregated/France_tweets_2016_11_10.csv',
 'unaggregated/France_tweets_2016_11_12.csv',
 'unaggregated/France_tweets_2016_11_14.csv',
 'unaggregated/France_tweets_2016_11_15.csv']

In [20]:
def download_cs_file(bucket_name, cloud_file_name, local_file_name):
    """Download one object from a bucket to a local file.

    Args:
        bucket_name: Name of the bucket that holds the object.
        cloud_file_name: Object name inside the bucket.
        local_file_name: Local path the object is written to.

    Returns:
        bool: ``True`` once the download call has returned. Exceptions are not
        caught here.
    """
    client = storage.Client()
    source_blob = client.bucket(bucket_name).blob(cloud_file_name)
    source_blob.download_to_filename(local_file_name)
    return True

def download_wrapper(source_file_name, local_download_dir):
    """Download one object of the project bucket into a local directory.

    The leading ``unaggregated/`` folder of the object name is dropped and the
    remainder is placed under ``local_download_dir``. The directory may be
    given with or without a trailing slash and is created when missing. Any
    error is printed and swallowed so that a batch loop can carry on with the
    next file.

    Args:
        source_file_name: Object name in the bucket, e.g.
            ``"unaggregated/France_tweets_2016_11_01.csv"``.
        local_download_dir: Local directory that receives the file.
    """
    prefix = "unaggregated/"
    if source_file_name.startswith(prefix):
        relative_name = source_file_name[len(prefix):]
    else:
        relative_name = source_file_name
    # A leading "/" would make os.path.join discard local_download_dir.
    relative_name = relative_name.lstrip("/")

    # os.path.join adds the separator when local_download_dir has no trailing
    # slash; plain string replacement glued directory and file name together.
    local_file_name = os.path.join(local_download_dir, relative_name)
    try:
        # Create the destination folder up front so writing the file does not
        # fail on a missing directory.
        os.makedirs(os.path.dirname(local_file_name) or ".", exist_ok=True)
        download_cs_file("covid-proj-data", source_file_name, local_file_name)
    except Exception as e:
        # Best effort: name the file so a failure is traceable in a long batch.
        print(f"Failed to download {source_file_name}: {e}")

if False: # Set to True to run the download; as written only the first 3 files are fetched
    # Downloading every file takes a couple of hours.
    for source_file_name in bar(unaggregated_source_file_names[:3], desc="Progress downloading"): # remove [:3] to download everything
        download_wrapper(source_file_name, "example_download/") # change this to "data/scored_data/" (keep the trailing slash) to download into the data (not example) directory

Progress downloading:   0%|          | 0/3 [00:00<?, ?it/s]

In [21]:
# Check downloads: print the path of every CSV file found in example_download/
for x in glob("example_download/*.csv"):
    print(x)

example_download/France_tweets_2016_11_02.csv
example_download/France_tweets_2016_11_06.csv
example_download/France_tweets_2016_11_01.csv
