# Stage 0 prepare raw data

The objectives are to:
1. Extract the columns relevant for the project.
2. Parse into Parquet format into the GCP Storage where it is ingested into the pipeline.

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [32]:
# Mount to Google Drive to save results
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/MSc/2020-21/Research\ Project/Colab/
%ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/MSc/2020-21/Research Project/Colab
 bharadwaj.csv                             reuse_content_domain_df.csv
 domain_count_df.csv                       reuse_content_url_df.csv
'fake_news_eda.ipynb - Colaboratory.pdf'   risdal.csv
 label_count_df.csv                        template.yml
 reuse_content_df.csv


In [3]:
# Connect to GCP Bucket
from google.colab import auth
auth.authenticate_user()

In [4]:
# Set GCP project ID and region to Europe West 2 - London
PROJECT = 'detect-fake-news-313201'
!gcloud config set project $PROJECT
REGION = 'europe-west2'
CLUSTER = '{}-cluster'.format(PROJECT)
!gcloud config set compute/region $REGION
!gcloud config set dataproc/region $REGION

!gcloud config list # show some information

Updated property [core/project].


To take a quick anonymous survey, run:
  $ gcloud survey

Updated property [compute/region].
Updated property [dataproc/region].
[component_manager]
disable_update_check = True
[compute]
gce_metadata_read_timeout_sec = 0
region = europe-west2
[core]
account = aaron.altrock@gmail.com
project = detect-fake-news-313201
[dataproc]
region = europe-west2

Your active configuration is: [default]


## Read in from Google Drive the original file(s)

In [59]:
# Read in from the Google Drive at mount point
src_file_nm = 'risdal.csv'
parquet_file_nm = 'risdal.parquet'
src_df = pd.read_csv(src_file_nm)
print('Dimension of {}: {} x {}'.format(src_file_nm, src_df.shape[0], src_df.shape[1]))

Dimension of risdal.csv: 12999 x 20


In [60]:
src_df.tail()

Unnamed: 0,uuid,ord_in_thread,author,published,title,text,language,crawled,site_url,country,domain_rank,thread_title,spam_score,main_img_url,replies_count,participants_count,likes,comments,shares,type
12994,f1b5d0e44803f48732bde854a9fdf95837219b12,2,replaceme,2016-10-26T23:58:00.000+03:00,,It DOES allow you to put a dog face on top of ...,english,2016-10-27T00:37:46.194+03:00,zerohedge.com,US,2435.0,"Snapchat To Raise Up To $4 Billion In IPO, Val...",0.0,,40,32,0,0,0,bs
12995,36011ceba3647e1bea78299b68b6fb705a1fc1ad,3,Freedumb,2016-10-27T00:02:00.000+03:00,,Wait till you see what happens to the valuatio...,english,2016-10-27T00:37:46.220+03:00,zerohedge.com,US,2435.0,"Snapchat To Raise Up To $4 Billion In IPO, Val...",0.0,,40,32,0,0,0,bs
12996,6995d1aa9ac99926106489b14b5530e85358059a,4,major major maj...,2016-10-27T00:06:00.000+03:00,,I'm waiting for the one that puts a pussy on m...,english,2016-10-27T00:37:46.244+03:00,zerohedge.com,US,2435.0,"Snapchat To Raise Up To $4 Billion In IPO, Val...",0.0,,40,32,0,0,0,bs
12997,7de8ae90eee164eb756db6c8a3772288e11d7a94,5,beemasters,2016-10-27T00:09:00.000+03:00,,$4 Billion even after they are known to be kee...,english,2016-10-27T00:37:46.247+03:00,zerohedge.com,US,2435.0,"Snapchat To Raise Up To $4 Billion In IPO, Val...",0.0,,40,32,0,0,0,bs
12998,dabef7095b7d9dae6eb0d83c4cbb40b85efd7ae5,6,i&#039;m-confused,2016-10-27T00:09:00.000+03:00,,of course - how else would they disceminate te...,english,2016-10-27T00:37:46.260+03:00,zerohedge.com,US,2435.0,"Snapchat To Raise Up To $4 Billion In IPO, Val...",0.0,,40,32,0,0,0,bs


## Simple data profiling

In [61]:
# Profile of the data set
src_df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
uuid,12999,12999.0,14c638b93f5bf350a3941b985ec1108cbe8405f1,1.0,,,,,,,
ord_in_thread,12999,,,,0.89153,6.48682,0.0,0.0,0.0,0.0,100.0
author,10575,2259.0,admin,247.0,,,,,,,
published,12999,11973.0,2016-10-27T03:00:00.000+03:00,59.0,,,,,,,
title,12319,11698.0,Get Ready For Civil Unrest: Survey Finds That ...,7.0,,,,,,,
text,12953,12431.0,,106.0,,,,,,,
language,12999,16.0,english,12403.0,,,,,,,
crawled,12999,12997.0,2016-10-26T22:16:26.842+03:00,2.0,,,,,,,
site_url,12999,244.0,pravdareport.com,100.0,,,,,,,
country,12823,24.0,US,10367.0,,,,,,,


In [62]:
# Distribution of the classification labels
src_df[['uuid', 'type']].groupby('type').count()\
  .reset_index()\
  .sort_values(by='uuid', ascending=False)

Unnamed: 0,type,uuid
1,bs,11492
0,bias,443
2,conspiracy,430
4,hate,246
6,satire,146
7,state,121
5,junksci,102
3,fake,19


## Transform data

Simple transformation to make it easier to ingest by GCP DataFlow pipeline.

In [63]:
# Remove any articles with no text
parsed_df = src_df[[True if pd.notnull(txt) else False for txt in src_df['text']]].copy()
print('Before: Dimension of {}: {} x {}'.format('src_df', src_df.shape[0], src_df.shape[1]))
print('After: Dimension of {}: {} x {}'.format('parsed_df', parsed_df.shape[0], parsed_df.shape[1]))
parsed_df.head()

Before: Dimension of src_df: 12999 x 20
After: Dimension of parsed_df: 12953 x 20


Unnamed: 0,uuid,ord_in_thread,author,published,title,text,language,crawled,site_url,country,domain_rank,thread_title,spam_score,main_img_url,replies_count,participants_count,likes,comments,shares,type
0,6a175f46bcd24d39b3e962ad0f29936721db70db,0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,english,2016-10-27T01:49:27.168+03:00,100percentfedup.com,US,25689.0,Muslims BUSTED: They Stole Millions In Gov’t B...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
1,2bdc29d12605ef9cf3f09f9875040a7113be5d5b,0,reasoning with facts,2016-10-29T08:47:11.259+03:00,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,english,2016-10-29T08:47:11.259+03:00,100percentfedup.com,US,25689.0,Re: Why Did Attorney General Loretta Lynch Ple...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
2,c70e149fdd53de5e61c29281100b9de0ed268bc3,0,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,english,2016-10-31T01:41:49.479+02:00,100percentfedup.com,US,25689.0,BREAKING: Weiner Cooperating With FBI On Hilla...,0.0,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,0,1,0,0,0,bias
3,7cf7c15731ac2a116dd7f629bd57ea468ed70284,0,Fed Up,2016-11-01T05:22:00.000+02:00,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,english,2016-11-01T15:46:26.304+02:00,100percentfedup.com,US,25689.0,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,0.068,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias
4,0206b54719c7e241ffe0ad4315b808290dbe6c0f,0,Fed Up,2016-11-01T21:56:00.000+02:00,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,english,2016-11-01T23:59:42.266+02:00,100percentfedup.com,US,25689.0,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,0.865,http://100percentfedup.com/wp-content/uploads/...,0,0,0,0,0,bias


## Prepare Config YAML file
Create a config file for the data set to give instruction to the data ingestion pipeline on GCP

In [64]:
import yaml

# Import the template YAML file
with open('./template.yml', 'r') as f_read:
  try:
    template_config_dct = yaml.safe_load(f_read)
  except Exception as e:
    print('Error: {}'.format(e))

In [65]:
# Preview the template YAML
template_config_dct

{'custom_columns': None,
 'mandatory_columns': {'body': {'column_name': ''},
  'label': {'column_name': ''},
  'title': {'column_name': ''},
  'url': {'column_name': ''}},
 'source': '',
 'supplementary_columns': {'author': {'column_name': ''},
  'classification_date': {'column_name': ''},
  'country_of_origin': {'column_name': ''},
  'detailed_news_label': {'column_name': ''},
  'language': {'column_name': ''},
  'publication_date': {'column_name': ''}}}

In [66]:
# Create a new dict based on the template to configure for the data set
parquet_config_dct = template_config_dct.copy()

# Mandatory columns
parquet_config_dct.update({'mandatory_columns': 
                            {
                                'body': {'column_name': 'text'},
                                'label': {'column_name': 'type'},
                                'title': {'column_name': 'title'},
                                'url': {'column_name': 'site_url'}
                             }
                           })

# Source of the data
parquet_config_dct.update({'source': 'https://www.kaggle.com/mrisdal/fake-news'})

# Supplementary columns
parquet_config_dct.update({'supplementary_columns': 
                            {
                                'author': {'column_name': 'author'},
                                'classification_date': {'column_name': 'crawled'},
                                'detailed_news_label': {'column_name': ''},
                                'language': {'column_name': 'language'},
                                'publication_date': {'column_name': 'published'},
                                'country_of_origin': {'column_name': 'country'}
                             }
                           })

# Custom columns
parquet_config_dct.update({'custom_columns': ['likes', 'comments', 'shares', 'replies_count', 'participants_count', 'spam_score', 'main_img_url']})

parquet_config_dct

{'custom_columns': ['likes',
  'comments',
  'shares',
  'replies_count',
  'participants_count',
  'spam_score',
  'main_img_url'],
 'mandatory_columns': {'body': {'column_name': 'text'},
  'label': {'column_name': 'type'},
  'title': {'column_name': 'title'},
  'url': {'column_name': 'site_url'}},
 'source': 'https://www.kaggle.com/mrisdal/fake-news',
 'supplementary_columns': {'author': {'column_name': 'author'},
  'classification_date': {'column_name': 'crawled'},
  'country_of_origin': {'column_name': 'country'},
  'detailed_news_label': {'column_name': ''},
  'language': {'column_name': 'language'},
  'publication_date': {'column_name': 'published'}}}

## Export files to Google GCP where Data Flow will ingest the data

In [82]:
# Set destinations
dest_gcp_bucket_nm = 'gs://src_fake_news_bs/to_add'
dest_yaml_file_nm = parquet_file_nm + '.yml'  # YAML file name must match the same as the corresponding parquet file name, including the file type suffix
dest_yaml_path = dest_gcp_bucket_nm + '/' + dest_yaml_file_nm
dest_parquet_path = dest_gcp_bucket_nm + '/' + parquet_file_nm

In [83]:
# Copy YAML file
# Save to Google Drive mount point then copy to the GCP bucket
with open(dest_yaml_file_nm, 'w') as outfile:
    yaml.dump(parquet_config_dct, outfile, default_flow_style=False)

# Gsutil to copy to GCP Cloud Storage
!gsutil cp $dest_yaml_file_nm $dest_yaml_path

Copying file://risdal.parquet.yml [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/  582.0 B]                                                / [1 files][  582.0 B/  582.0 B]                                                
Operation completed over 1 objects/582.0 B.                                      


In [85]:
# Copy Parquet file
# Save to Google Drive mount point then copy to the GCP bucket
parsed_df.to_parquet(parquet_file_nm)

# Gsutil to copy to GCP Cloud Storage
!gsutil cp $parquet_file_nm $dest_parquet_path

Copying file://risdal.parquet [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/31.2 MiB.                                     
