#### Prerequisites

* Switch instance type to `ml.m5.large`

### I. Imports 

In [None]:
from sagemaker import Session
from pandas import DataFrame
from time import sleep
import pandas as pd
import sagemaker
import boto3

##### Essentials

In [None]:
# Key prefix under the bucket where the prepared datasets are uploaded.
S3_DATA_FOLDER = 'data'

# SageMaker session; its default bucket is the upload destination used below.
session = Session()
S3_BUCKET = session.default_bucket()

# boto3 S3 resource; its low-level client performs the file uploads.
s3 = boto3.resource('s3')

### II. Load and prepare data 

In [None]:
%%time

# Load the COVID-19 articles CSV directly from S3 into a DataFrame.
# NOTE(review): bucket and key are hard-coded to one account/region; reading
# an s3:// URL through pandas presumably requires s3fs — confirm it is installed.
df = pd.read_csv('s3://sagemaker-us-east-1-119174016168/datasets/covid19_articles_20220420.csv')

In [None]:
# Show (rows, columns) of the raw frame before any columns are dropped.
df.shape

In [None]:
# Remove every column whose name contains "Unnamed" (presumably index
# columns left behind by an earlier CSV export — verify against the data).
unnamed_columns = df.filter(regex='Unnamed').columns
df.drop(columns=unnamed_columns, inplace=True)

In [None]:
# Metadata columns that neither the MLM nor the classification dataset reads.
unused_columns = ['author', 'date', 'domain', 'companies', 'locations', 'sdgs']
df.drop(columns=unused_columns, inplace=True)

In [None]:
def clean(article: str) -> str:
    """Turn a stringified list of text pieces into one newline-joined string.

    Args:
        article: Text holding a Python list literal of strings,
            e.g. ``"['first', 'second']"``.

    Returns:
        The list items joined with ``'\\n'``.

    Raises:
        ValueError, SyntaxError: If ``article`` is not a valid Python literal.
        TypeError: If the parsed items are not strings.
    """
    # Imported locally so the cell works without touching the import cell.
    import ast

    # literal_eval accepts only Python literals. The previous eval() would
    # execute any expression embedded in the CSV cell, which is unsafe for
    # data loaded from an external file.
    return '\n'.join(ast.literal_eval(article))

In [None]:
# Replace each stringified list in 'content' with newline-separated text.
cleaned_content = df['content'].apply(clean)
df['content'] = cleaned_content

In [None]:
# The classification section reads the label column as 'category',
# so rename the raw 'datatype' column in place.
df.rename(columns=dict(datatype='category'), inplace=True)

In [None]:
# Preview the first rows after dropping, cleaning and renaming columns.
df.head()

### III. Prepare dataset for BERT MLM training 

In [None]:
# MLM training only needs the raw text, so keep just title and content.
mlm_df = df.loc[:, ['title', 'content']]

In [None]:
# Preview the two text columns selected for MLM training.
mlm_df.head()

In [None]:
def save_as_txt(df: DataFrame, path: str = './data/covid_articles.txt') -> None:
    """Write every article's title and content to a plain-text file.

    Each row contributes its title, a newline, its content and a closing
    newline, so one article's content no longer runs into the next title.

    Args:
        df: Frame with string ``title`` and ``content`` columns.
        path: Destination file. Defaults to the location the S3 upload
            step reads from.

    Raises:
        TypeError: If a title or content value is not a string.
    """
    # Imported locally so the cell works without touching the import cell.
    import os

    # open() does not create missing directories, so make the folder first.
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)

    # The context manager closes the file even when a row fails to serialize.
    # Explicit UTF-8 keeps the output independent of the machine's locale.
    with open(path, 'w', encoding='utf-8') as fout:
        for title, content in zip(df.title.values, df.content.values):
            fout.write('\n'.join([title, content]) + '\n')

In [None]:
%%time

# Pass the frame prepared for MLM training. save_as_txt reads only the
# 'title' and 'content' columns, so the written file is the same as with df.
save_as_txt(mlm_df)

In [None]:
# Pause for five seconds before uploading.
# NOTE(review): save_as_txt closes the file before it returns, so this wait
# looks unnecessary — confirm before removing.
sleep(5)

##### Copy dataset from local to S3

In [None]:
# Upload the MLM text file to the session's default bucket under the data folder.
mlm_local_path = './data/covid_articles.txt'
mlm_s3_key = f'{S3_DATA_FOLDER}/covid_articles.txt'
s3.meta.client.upload_file(mlm_local_path, S3_BUCKET, mlm_s3_key)

### IV. Prepare dataset for sequence classification 

In [None]:
# Keep only rows without missing values. The explicit copy makes clf_df an
# independent frame, so the column assignment and in-place drop that follow
# do not raise SettingWithCopyWarning or depend on df.
clf_df = df.dropna().copy()

In [None]:
def combine(title: str, content: str) -> str:
    """Return the title followed by the content, separated by one newline."""
    parts = (title, content)
    return '\n'.join(parts)

In [None]:
# Build the classifier input text: title and content merged row by row.
articles = clf_df.apply(
    lambda row: combine(row['title'], row['content']),
    axis=1,
)
clf_df['article'] = articles

In [None]:
# 'article' now carries both texts, so the source columns are dropped.
clf_df.drop(columns=['title', 'content'], inplace=True)

In [None]:
# Preview the classification frame after merging title and content.
clf_df.head()

In [None]:
# Sanity check: the label column must contain exactly five distinct categories.
distinct_categories = set(clf_df.category.values)
assert len(distinct_categories) == 5

##### Copy dataset from local to S3 

In [None]:
# Write the classification dataset locally as CSV, without the index column.
clf_local_path = './data/covid_articles_clf_data.csv'
clf_df.to_csv(clf_local_path, index=False)

In [None]:
# Pause for five seconds before uploading.
# NOTE(review): presumably meant to let the CSV write settle; to_csv is
# expected to finish before returning, so confirm whether this is needed.
sleep(5)

In [None]:
# Upload the classification dataset written by to_csv. The file on disk is
# ./data/covid_articles_clf_data.csv; the former '.txt' path pointed at a file
# this notebook never creates, so the upload failed.
# NOTE(review): the S3 key now ends in '.csv' to match the content — confirm
# that downstream training code reads this key.
s3.meta.client.upload_file(
    './data/covid_articles_clf_data.csv',
    S3_BUCKET,
    f'{S3_DATA_FOLDER}/covid_articles_clf_data.csv',
)