In [18]:
# !pip3 install dropbox
import ktrain
import numpy as np
import dropbox
import pandas as pd
# !pip install tqdm
import glob
from tqdm import tqdm as prog
import traceback

In [2]:
# Class labels; run_predict uses them as the column names for the
# predict_proba output (index 0 = healthy talk, index 1 = sick talk).
CATS=['HEALTHY_TALK', 'SICK_TALK']
import os
# Dropbox access token, read from the environment so it is never stored in
# the notebook. Raises KeyError if DB_TOKEN is not set.
token = os.environ["DB_TOKEN"]
# dropbox_connect() refers to this value as TOKEN; bind that name as well so
# the call does not fail with a NameError.
TOKEN = token

'/home/coler/projects/covid/covid_proj/creds.json'

In [12]:
# Smoke test for the saved classifier. Disabled by the `if False:` guard, so
# on a fresh kernel `predictor` is NOT defined by this cell; run_predict()
# reads `predictor` as a global and needs this block enabled first.
if False:
    model_path = "./save_model/models/xlm-roberta-large"
    predictor = ktrain.load_predictor(model_path)
    # Two sample sentences to sanity-check the loaded predictor.
    test_lst = ["I feel terrible","Have you seen the latest Game of Thrones?"]
    probs = predictor.predict_proba(test_lst)
    print(probs)
    print(predictor.predict(test_lst))

In [93]:
def dropbox_connect():
    """Create a connection to Dropbox.

    The access token is read from the DB_TOKEN environment variable.

    Returns:
        dropbox.Dropbox: a client built with the access token.

    Raises:
        KeyError: if DB_TOKEN is not set in the environment.
        dropbox.exceptions.AuthError: re-raised (after logging) if it is
            raised while constructing the client.
    """
    access_token = os.environ["DB_TOKEN"]

    try:
        return dropbox.Dropbox(access_token)
    except dropbox.exceptions.AuthError as e:
        print('Error connecting to Dropbox with access token: ' + str(e))
        # Re-raise instead of falling through: there is no client to return.
        raise


def dropbox_list_files(path):
    """List the entries of a Dropbox folder.

    Args:
        path (str): Folder path passed to the Dropbox API.

    Returns:
        tuple: ``(df, files)``. ``df`` is a DataFrame with the columns
        ``name``, ``path_display``, ``client_modified`` and
        ``server_modified``, one row per file entry, sorted by
        ``server_modified`` (newest first). ``files`` is the raw list of
        all entries returned for the folder (files and non-files).
        If anything fails, the error is printed and an empty DataFrame with
        the same columns plus an empty list is returned, so callers can
        always unpack the result.
    """
    columns = ['name', 'path_display', 'client_modified', 'server_modified']

    try:
        dbx = dropbox_connect()

        # A folder listing can be paginated; keep following the cursor until
        # the API reports there is nothing more to fetch.
        result = dbx.files_list_folder(path)
        files = list(result.entries)
        while result.has_more:
            result = dbx.files_list_folder_continue(result.cursor)
            files.extend(result.entries)

        # Only file entries carry the modification timestamps used below.
        files_list = [
            {
                'name': file.name,
                'path_display': file.path_display,
                'client_modified': file.client_modified,
                'server_modified': file.server_modified
            }
            for file in files
            if isinstance(file, dropbox.files.FileMetadata)
        ]

        # Passing the columns explicitly keeps sort_values working when the
        # folder contains no files at all.
        df = pd.DataFrame.from_records(files_list, columns=columns)
        return df.sort_values(by='server_modified', ascending=False), files

    except Exception as e:
        print('Error getting list of files from Dropbox: ' + str(e))
        return pd.DataFrame(columns=columns), []
        
def dropbox_download_file(dropbox_file_path, local_file_path):
    """Download a file from Dropbox to the local machine.

    Args:
        dropbox_file_path (str): Path of the file in Dropbox.
        local_file_path (str): Local path the content is written to.

    Returns:
        None. Errors are printed rather than raised.
    """

    try:
        dbx = dropbox_connect()

        # Fetch the content before opening the destination, so a failed
        # request does not create or truncate the local file.
        _, response = dbx.files_download(path=dropbox_file_path)

        with open(local_file_path, 'wb') as f:
            f.write(response.content)
    except Exception as e:
        print('Error downloading file from Dropbox: ' + str(e))
        
# List the contents of the /twitter_data Dropbox folder at cell execution
# time. `df` is the file table, `files` the raw entries.
df, files = dropbox_list_files('/twitter_data')
# dropbox_download_file(files[0].path_lower, "db_twitter_data_processed.zip")
# df,files

In [19]:
def loader(fn,
           text_col = "tweet_text"):
    """Read a CSV file and keep only the rows that have tweet text.

    Args:
        fn: Path of the CSV file to read.
        text_col: Name of the column holding the text; rows where it is
            missing or an empty string are dropped.

    Returns:
        DataFrame with the remaining rows and a fresh 0..n-1 index.
    """
    read_options = {"low_memory": False}
    try:
        frame = pd.read_csv(fn, **read_options)
    except Exception:
        # Second attempt: treat only '\n' as the line terminator.
        frame = pd.read_csv(fn, lineterminator='\n', **read_options)

    # Discard rows whose text is NaN, then rows whose text is "".
    with_text = frame.dropna(subset=[text_col])
    non_empty = with_text[with_text[text_col].ne("")]
    return non_empty.reset_index(drop=True)
        
def run_predict(fns,
                dest_dir):
    """Score every input file with the global `predictor` and write the result.

    For each path in `fns` the file is loaded with loader(), the class
    probabilities for its `tweet_text` column are appended as one column per
    entry of CATS, and two label columns are derived from the CATS[1]
    probability using a 0.5 threshold. The scored frame is written to
    `dest_dir` formatted with the input file's base name.

    Args:
        fns: Iterable of input CSV paths.
        dest_dir: Format string with one `{}` placeholder for the file name.

    Returns:
        List of `(path, exception, traceback_text)` tuples, one per file
        that failed; failures are printed and do not stop the loop.
    """
    failures = []

    for path in prog(fns,desc="Progress running apply: "):
        try:
            tweets = loader(path)
            probabilities = pd.DataFrame(
                predictor.predict_proba(tweets.tweet_text.to_list()),
                columns=CATS,
            )
            # loader() resets the index, so both frames line up row by row.
            scored = pd.concat([tweets, probabilities], axis=1)
            is_sick = scored[CATS[1]] > 0.5
            scored["predicted_cat"] = np.where(is_sick, 1, 0)
            scored["predicted_cat_text"] = np.where(is_sick, CATS[1], CATS[0])
            out_name = path.split("/")[-1]
            scored.to_csv(dest_dir.format(out_name), index=False)
        except Exception as exc:
            failures.append((path, exc, traceback.format_exc()))
            print(exc)
    return failures

def apply_predictor(
    input_dir,
    dest_dir
    ):
    """Score every file matching a glob pattern.

    Args:
        input_dir: Glob pattern selecting the input files.
        dest_dir: Output path format string forwarded to run_predict().

    Returns:
        The list of per-file errors reported by run_predict().
    """
    matched_files = glob.glob(input_dir)
    return run_predict(matched_files, dest_dir)
    

# Full scoring run over all processed files. Disabled by the `if False:`
# guard; when enabled, `errors` holds the per-file failures.
if False:
    errors = apply_predictor(
        "./data/original/twitter_data_processed/*.csv",
        "./data/twitter_data_scored/{}"
        )

In [105]:
# Catch errors: find processed input files that have no scored counterpart
# (matched by base file name) so they can be re-run.
orig = [fn.split("/")[-1] for fn in glob.glob("./data/original/twitter_data_processed/*.csv")]
scored = [fn.split("/")[-1] for fn in glob.glob("./data/twitter_data_scored/*.csv")]
# Set lookup keeps the comparison linear in the number of files; `scored`
# itself stays a list because later cells take its length.
scored_names = set(scored)
diff = [fn for fn in orig if fn not in scored_names]
fns = [f"./data/original/twitter_data_processed/{fn}" for fn in diff]
# Re-run of the missing files, disabled by the guard.
if False:
    errors_two = run_predict(fns,"./data/twitter_data_scored/{}")

In [None]:
# Row count per processed file, counted after loader() has dropped rows with
# missing or empty tweet text. Every file is read in full just to get its
# length, so this cell is expensive.
orig = [fn.split("/")[-1] for fn in glob.glob("./data/original/twitter_data_processed/*.csv")]
num_tweets = [loader(f"./data/original/twitter_data_processed/{fn}").shape[0] for fn in orig]


In [111]:
# Total number of tweets across all files, with thousands separators.
f'{sum(num_tweets):,}'

'169,233,509'

In [26]:
# True only when no input file is missing a scored output (`fns` is empty)
# and the scored and original file counts match. Relies on `fns`, `scored`
# and `orig` as built by the "catch errors" cell.
print(f"All files processed: {len(fns) == 0 and len(scored) == len(orig)}")

All files processed: True


In [81]:
import gc

# Aggregate by country, nuts_2_region, year, and week.
def _country_from_path(fn):
    """Return the country part of a scored file name: every `_`-separated token before the last four."""
    return "_".join(fn.split("/")[-1].split("_")[:-4])


def load_data(
    in_dir,
    country=None
):
    """Load all scored files matching a glob pattern into one DataFrame.

    Each file is read with loader() and tagged with `date`, `week` and
    `year` columns derived from the last three `_`-separated tokens of its
    file name (parsed as year, month, day).

    Args:
        in_dir: Glob pattern selecting the files to load.
        country: Optional country name. When given, only files whose country
            prefix is exactly this value are loaded, so a pattern such as
            `<country>*` cannot pull in another country sharing the prefix.

    Returns:
        The concatenated DataFrame (original per-file indexes are kept), or
        an empty DataFrame when no file matches.
    """
    scored_fns = glob.glob(in_dir)
    if country is not None:
        scored_fns = [fn for fn in scored_fns if _country_from_path(fn) == country]

    frames = []
    for fn in prog(scored_fns,desc=f"Progress loading data for {in_dir.split('/')[-1].split('*')[0]}: "):
        in_dat = loader(fn)
        date = "-".join(fn.split("/")[-1].split(".")[0].split("_")[-3:])
        # The date is constant for the whole file, so derive the labels once
        # from the scalar instead of once per row.
        stamp = pd.to_datetime(date,format="%Y-%m-%d")
        in_dat["date"] = stamp
        in_dat["week"] = f"W{stamp.isocalendar()[1]}"
        # NOTE(review): `year` is the calendar year while `week` is the ISO
        # week number; around New Year the two can disagree (e.g. 2019-12-30
        # falls in ISO week 1). Confirm this pairing is intended.
        in_dat["year"] = stamp.year
        frames.append(in_dat)

    # Concatenate once at the end; growing a frame inside the loop copies
    # all previously loaded rows on every iteration.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


def aggregate(in_dir="./data/twitter_data_scored/*.csv",
              out_dir="./data/twitter_data_aggregated/{}.csv"):
    """Aggregate scored tweets per country, NUTS-2 region, year and week.

    For every country found in the file names matching `in_dir`, the
    country's files are loaded and grouped; the mean of `predicted_cat`
    (`sick_talk_proportion`) and the count of `tweet_text` (`num_tweets`)
    are written to one CSV per country.

    Args:
        in_dir: Glob pattern of the scored files; must contain a `*`, which
            is prefixed with the country name to select that country's files.
        out_dir: Output path format string with one `{}` placeholder for the
            country name.

    Returns:
        None. One CSV file is written per country.
    """
    fns = glob.glob(in_dir)
    # Sorted so countries are processed in a reproducible order.
    countries = sorted({_country_from_path(fn) for fn in fns})
    in_dir_ct = in_dir.replace("*","{}*")
    for ct in countries:
        # `country=ct` restricts the prefix glob to exact country matches.
        dat = load_data(in_dir = in_dir_ct.format(ct), country=ct)
        dat["country"] = ct
        agg_dat = (
            dat.groupby(["country","nuts_2_region","year","week"])
            .agg(sick_talk_proportion=('predicted_cat', 'mean'),
                 num_tweets=('tweet_text', 'count'))
            .reset_index()
        )
        agg_dat.to_csv(out_dir.format(ct),index=False)
        # Drop the references first so the collection can reclaim them.
        del dat
        del agg_dat
        gc.collect()

if False:
    aggregate()

In [84]:
import shutil
# Names of the two output folders under ./data and the path template used to
# address them.
dir_name = "twitter_data_scored"
dir_name_agg = "twitter_data_aggregated"
fp = "./data/{}"

# Zip each folder into ./data/<folder name>.zip. Disabled by the guard.
if False:
    shutil.make_archive(fp.format(dir_name_agg), 'zip', fp.format(dir_name_agg))
    shutil.make_archive(fp.format(dir_name), 'zip', fp.format(dir_name))

In [104]:
import pathlib
import os

def dropbox_upload(local_path, local_file, dropbox_file_path):
    """Upload a file from the local machine to a path in the Dropbox app directory.

    Files up to CHUNK_SIZE (50 MiB) are sent in a single request; larger
    files are sent through an upload session in CHUNK_SIZE pieces. In both
    cases an existing file at the destination is overwritten.

    Args:
        local_path (str): The directory containing the local file.
        local_file (str): The name of the local file.
        dropbox_file_path (str): The path to the file in the Dropbox app directory.

    Example:
        dropbox_upload('.', 'test.csv', '/stuff/test.csv')

    Returns:
        meta: The Dropbox file metadata, or None if the upload failed
        (the error is printed rather than raised).
    """
    try:
        CHUNK_SIZE = 50 * 1024 * 1024
        overwrite = dropbox.files.WriteMode("overwrite")
        dbx = dropbox_connect()
        local_file_path = pathlib.Path(local_path) / local_file
        file_size = os.path.getsize(local_file_path)

        with open(local_file_path, "rb") as f:
            if file_size <= CHUNK_SIZE:
                return dbx.files_upload(f.read(), dropbox_file_path, mode=overwrite)

            print("Uploading via session")
            session = dbx.files_upload_session_start(f.read(CHUNK_SIZE))
            # The cursor tracks how many bytes the session has received.
            cursor = dropbox.files.UploadSessionCursor(session_id=session.session_id,
                                                       offset=f.tell())
            # Same write mode as the single-request path, so re-uploading
            # over an existing file behaves the same for any file size.
            commit = dropbox.files.CommitInfo(path=dropbox_file_path, mode=overwrite)

            meta = None
            while f.tell() < file_size:
                if (file_size - f.tell()) <= CHUNK_SIZE:
                    # Last piece: finishing the session commits the file.
                    meta = dbx.files_upload_session_finish(f.read(CHUNK_SIZE),
                                                           cursor,
                                                           commit)
                    print(meta)
                else:
                    dbx.files_upload_session_append_v2(f.read(CHUNK_SIZE), cursor)
                    cursor.offset = f.tell()
            return meta
    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))
        return None

# Upload both archives from ./data to the /twitter_data Dropbox folder.
# NOTE: this rebinds the global `fns`, which earlier cells use for the list
# of unscored input paths.
if True:
    fns = ["twitter_data_aggregated.zip","twitter_data_scored.zip"]
    for fn in fns:
        dropbox_upload("./data",fn,f"/twitter_data/{fn}")

Uploading via session
FileMetadata(client_modified=datetime.datetime(2023, 8, 3, 19, 11, 23), content_hash='0d8f4c4e677474259e122e639a52b1b50ab1b06899896d7a67cbffe66026a6f5', export_info=NOT_SET, file_lock_info=NOT_SET, has_explicit_shared_members=NOT_SET, id='id:gl16Bl3R7zwAAAAAAAAAHQ', is_downloadable=True, media_info=NOT_SET, name='twitter_data_scored.zip', parent_shared_folder_id=NOT_SET, path_display='/twitter_data/twitter_data_scored.zip', path_lower='/twitter_data/twitter_data_scored.zip', preview_url=NOT_SET, property_groups=NOT_SET, rev='602098c0f09db68e90343', server_modified=datetime.datetime(2023, 8, 3, 19, 11, 28), sharing_info=NOT_SET, size=20449001289, symlink_info=NOT_SET)
