In [None]:
from google.cloud import storage
from io import BytesIO

import pandas as pd

## Read data from GCS

In [2]:
def read_data_from_gcs(bucket_name, file_name):
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = storage.Blob(file_name, bucket)
    content = blob.download_as_text()  # downloads the file as text
    data = pd.read_csv(BytesIO(content), sep='\t')  # convert to dataframe
    return data

In [3]:
bucket_name = 'fake-news-data'

comments_df = pd.read_csv(f'gs://{bucket_name}/all_comments.tsv', sep='\t')
train_df = pd.read_csv(f'gs://{bucket_name}/multimodal_train.tsv', sep='\t')
validate_df = pd.read_csv(f'gs://{bucket_name}/multimodal_validate.tsv', sep='\t')
test_df = pd.read_csv(f'gs://{bucket_name}/multimodal_test_public.tsv', sep='\t')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
def list_blobs(bucket_name, prefix):
    storage_client = storage.Client()
    blobs = storage_client.list_blobs(bucket_name, prefix=prefix)
    return [blob.name.replace(prefix, '') for blob in blobs]

downloaded_images = list_blobs('fake-news-data', 'images/public_image_set/')

In [5]:
downloaded_image_ids = [img.split('.')[0] for img in downloaded_images]

## Keep only rows that have a corresponding image associated with them

In [6]:
train_df = train_df[train_df['id'].isin(downloaded_image_ids)]
validate_df = validate_df[validate_df['id'].isin(downloaded_image_ids)]
test_df = test_df[test_df['id'].isin(downloaded_image_ids)]

## Check for NULL values

In [7]:
train_df.isna().sum()

author                   20276
clean_title                  0
created_utc                  0
domain                  118222
hasImage                     0
id                           0
image_url                 1091
linked_submission_id    278239
num_comments            118222
score                        0
subreddit                    0
title                        0
upvote_ratio            118222
2_way_label                  0
3_way_label                  0
6_way_label                  0
dtype: int64

## Add comments related features to train, test and validation data frames

In [10]:
def merge_df(df, features):
    for feature in features:
        df = pd.merge(df, feature, left_on='id', right_on='submission_id', how='left')
    return df.drop(columns=['submission_id_x', 'submission_id_y'])

In [11]:
# total upvotes
total_upvotes = comments_df.groupby('submission_id')['ups'].sum().reset_index(name='total_upvotes')

# average upvotes
avg_upvotes = comments_df.groupby('submission_id')['ups'].mean().reset_index(name='avg_upvotes')

# most upvoted comment
most_upvoted_comment = comments_df.loc[comments_df.groupby('submission_id')['ups'].idxmax()][['submission_id', 'body', 'ups']]
most_upvoted_comment.columns = ['submission_id', 'most_upvoted_comment', 'upvotes_most_upvoted_comment']

# merge these features with the main dataframes
features = [total_upvotes, avg_upvotes, most_upvoted_comment]

train_df = merge_df(train_df, features)
test_df = merge_df(test_df, features)
validate_df = merge_df(validate_df, features)

In [12]:
train_df.head()

Unnamed: 0,author,clean_title,created_utc,domain,hasImage,id,image_url,linked_submission_id,num_comments,score,...,title,upvote_ratio,2_way_label,3_way_label,6_way_label,total_upvotes,avg_upvotes,submission_id,most_upvoted_comment,upvotes_most_upvoted_comment
0,Alexithymia,my walgreens offbrand mucinex was engraved wit...,1551641000.0,i.imgur.com,True,awxhir,https://external-preview.redd.it/WylDbZrnbvZdB...,,2.0,12,...,My Walgreens offbrand Mucinex was engraved wit...,0.84,1,0,0,6.0,3.0,awxhir,Does it help with Dyslexia?,3.0
1,VIDCAs17,this concerned sink with a tiny hat,1534727000.0,i.redd.it,True,98pbid,https://preview.redd.it/wsfx0gp0f5h11.jpg?widt...,,2.0,119,...,This concerned sink with a tiny hat,0.99,0,2,2,6.0,3.0,98pbid,“Does this hat make my head look too big?”,4.0
2,,puppy taking in the view,1471341000.0,i.imgur.com,True,4xypkv,https://external-preview.redd.it/HLtVNhTR6wtYt...,,26.0,250,...,PsBattle: Puppy taking in the view,0.95,1,0,0,119.0,4.576923,4xypkv,[EVERYTHING THE LIGHT TOUCHES IS OUR KINGDOM](...,22.0
3,3rikR3ith,i found a face in my sheet music too,1525318000.0,i.redd.it,True,8gnet9,https://preview.redd.it/ri7ut2wn8kv01.jpg?widt...,,2.0,13,...,I found a face in my sheet music too!,0.84,0,2,2,3.0,1.5,8gnet9,That crescendo part at the top also kind of lo...,2.0
4,CrimsonBlue90,bride and groom exchange vows after fatal shoo...,1423681000.0,independent.ie,True,2vkbtj,https://external-preview.redd.it/FQ-J9OIPFRpqi...,,7.0,6,...,Bride and groom exchange vows after fatal shoo...,0.64,1,0,0,7.0,1.0,2vkbtj,"""We planned this weding for more than a year. ...",5.0


## Upload processed dfs to GCS

In [13]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    # bucket_name = "your-bucket-name"
    # source_file_name = "local/path/to/file"
    # destination_blob_name = "storage-object-name"

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

    print(
        "File {} uploaded to {}.".format(
            source_file_name, destination_blob_name
        )
    )

# save the dataframes to CSVs
train_df.to_csv('train_df.csv', index=False)
validate_df.to_csv('validate_df.csv', index=False)
test_df.to_csv('test_df.csv', index=False)

# upload the CSVs to GCS
upload_blob('fake-news-data', 'train_df.csv', 'multimodal_train_processed.csv')
upload_blob('fake-news-data', 'validate_df.csv', 'multimodal_validate_processed.csv')
upload_blob('fake-news-data', 'test_df.csv', 'multimodal_test_public_processed.csv')

File train_df.csv uploaded to multimodal_train_processed.csv.
File validate_df.csv uploaded to multimodal_validate_processed.csv.
File test_df.csv uploaded to multimodal_test_public_processed.csv.
