# Phase Two - Create a set of raw data of different sample sizes for benchmarking

The objectives are to:
1. Extract the columns relevant for the project.
2. Convert the data to Parquet format and upload it to GCP Cloud Storage, from where it is ingested into the pipeline.

In [None]:
import pandas as pd
# Show every column and every row when a DataFrame is displayed (None = no limit)
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# Mount Google Drive so that the source file can be read from it and results saved to it
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the project folder on the mounted Drive;
# relative file names used in later cells are resolved against this folder
%cd /content/drive/MyDrive/MSc/2020-21/Research\ Project/Colab/
# List the contents of the project folder
%ls

In [None]:
# Authenticate this Colab session with the user's Google account.
# Later cells run gcloud and gsutil commands against the GCP project and bucket.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Set GCP project ID and region to Europe West 2 - London
PROJECT = 'fake-news-bs-detector'
!gcloud config set project $PROJECT
REGION = 'europe-west2'
CLUSTER = '{}-cluster'.format(PROJECT)
!gcloud config set compute/region $REGION
!gcloud config set dataproc/region $REGION

!gcloud config list # show some information

## Read the original file(s) from Google Drive

In [None]:
# Load the original CSV; the file name is relative to the Google Drive mount point
src_file_nm = 'risdal.csv'
# parquet_file_nm = 'risdal.parquet'
src_df = pd.read_csv(src_file_nm)
# shape is (rows, columns), unpacked straight into the message
print('Dimension of {}: {} x {}'.format(src_file_nm, *src_df.shape))

In [None]:
# Preview the last rows of the raw data
src_df.tail()

## Simple data profiling

In [None]:
# Profile of the data set: summary statistics for all columns (include='all'),
# transposed so that each source column is displayed as one row
src_df.describe(include='all').T

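## Randomly sample the raw data to create new data sets of 1,250, 2,500, 5,000, 7,500 and 10,000 observations

The sampling itself is performed in the final cell of this notebook, after the data has been transformed and the config has been prepared.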

## Transform data

Simple transformations to make the data easier to ingest by the GCP Dataflow pipeline.

In [None]:
# Remove any articles with no text.
# Series.notna() builds the boolean mask in a single vectorised call instead of a
# Python-level loop over every row; .copy() returns an independent frame so that
# columns added to parsed_df later do not touch src_df.
parsed_df = src_df[src_df['text'].notna()].copy()
print('Before: Dimension of {}: {} x {}'.format('src_df', src_df.shape[0], src_df.shape[1]))
print('After: Dimension of {}: {} x {}'.format('parsed_df', parsed_df.shape[0], parsed_df.shape[1]))
parsed_df.head()

In [None]:
# Add file name column required for ingestion.
# parquet_file_nm must be assigned before it is used: without this assignment the
# cell raises NameError on a fresh kernel (Restart & Run All). The sampling loop
# further down reassigns parquet_file_nm for each sample file it writes.
parquet_file_nm = 'risdal.parquet'
parsed_df['file_name'] = parquet_file_nm  # a scalar is broadcast to every row

## Prepare Config YAML file
Create a config file for the data set to give instructions to the data ingestion pipeline on GCP.

In [None]:
import yaml

# Import the template YAML file.
# A parse error is reported and then re-raised: swallowing it would leave
# template_config_dct undefined (or stale from an earlier run) and the failure
# would only surface in a later cell.
with open('./template.yml', 'r') as f_read:
  try:
    template_config_dct = yaml.safe_load(f_read)
  except yaml.YAMLError as e:
    print('Error: {}'.format(e))
    raise

# An empty file loads as None and a non-mapping document as a list/scalar; the
# following cells call dict methods on the template, so fail here with a clear message.
if not isinstance(template_config_dct, dict):
  raise ValueError('template.yml must contain a YAML mapping, got {}'.format(type(template_config_dct).__name__))

In [None]:
# Preview the template YAML (the object loaded from template.yml)
template_config_dct

In [None]:
# Start from a (shallow) copy of the template and overwrite the top-level entries
# that describe this data set
parquet_config_dct = template_config_dct.copy()

# Mandatory columns: each config entry is mapped to {'column_name': <column in the data set>}
parquet_config_dct['mandatory_columns'] = {
    entry: {'column_name': col_nm}
    for entry, col_nm in [
        ('body', 'text'),
        ('label', 'type'),
        ('title', 'title'),
        ('url', 'site_url'),
        ('file_name', 'file_name'),
    ]
}

# Source of the data
parquet_config_dct['source'] = 'https://www.kaggle.com/mrisdal/fake-news'

# Supplementary columns, same layout as the mandatory ones
# (detailed_news_label is given an empty column name)
parquet_config_dct['supplementary_columns'] = {
    entry: {'column_name': col_nm}
    for entry, col_nm in [
        ('author', 'author'),
        ('classification_date', 'crawled'),
        ('detailed_news_label', ''),
        ('language', 'language'),
        ('publication_date', 'published'),
        ('country_of_origin', 'country'),
    ]
}

# Custom columns
parquet_config_dct['custom_columns'] = [
    'likes',
    'comments',
    'shares',
    'replies_count',
    'participants_count',
    'spam_score',
    'main_img_url',
]

# Display the finished config
parquet_config_dct

In [None]:
# Sample according to the required sizes for the benchmarking
sample_size_ls = [1250, 2500, 5000, 7500, 10000]

for sample_sz in sample_size_ls:
  # Set destinations
  __df = parsed_df.copy()
  __df = __df.sample(sample_sz).copy()
  print('Dimension: {} x {}'.format(__df.shape[0], __df.shape[1]))
  dest_gcp_bucket_nm = 'gs://src_fake_news_bs/to_add'
  parquet_file_nm = 'risdal' + '_' + str(sample_sz) + '.parquet'
  dest_yaml_file_nm = parquet_file_nm + '_' + str(sample_sz) + '.yml'  # YAML file name must match the same as the corresponding parquet file name, including the file type suffix
  dest_yaml_path = dest_gcp_bucket_nm + '/' + dest_yaml_file_nm
  dest_parquet_path = dest_gcp_bucket_nm + '/' + parquet_file_nm

  # Copy YAML file
  # Save to Google Drive mount point then copy to the GCP bucket
  with open(dest_yaml_file_nm, 'w') as outfile:
      yaml.dump(parquet_config_dct, outfile, default_flow_style=False)

  # Gsutil to copy to GCP Cloud Storage
  !gsutil cp $dest_yaml_file_nm $dest_yaml_path

  # Copy Parquet file
  # Save to Google Drive mount point then copy to the GCP bucket
  __df.to_parquet(parquet_file_nm)

  # Gsutil to copy to GCP Cloud Storage
  !gsutil cp $parquet_file_nm $dest_parquet_path
