#### Prerequisites

* Choose `Switch instance type` above
* Toggle `Fast launch only` to select more types of instances
* Change instance type to `ml.m5.2xlarge`
* For Kernel, choose `Python 3 (Data Science)`

### I. Imports 

In [2]:
from sklearn.preprocessing import LabelEncoder
from sagemaker import Session
from pandas import DataFrame
from time import sleep
import pandas as pd
import sagemaker
import boto3

##### Essentials

In [3]:
# SageMaker session; in this notebook it is used to look up the default bucket.
session = Session()
# boto3 S3 resource; its low-level client (s3.meta.client) performs the uploads below.
s3 = boto3.resource('s3')

# Bucket name returned by the session's default_bucket() call; upload destination.
S3_BUCKET = session.default_bucket()
# Key prefix under which the prepared dataset files are stored in the bucket.
S3_DATA_FOLDER = 'data'

### II. Load and prepare data 

In [4]:
%%time

df = pd.read_csv('s3://sagemaker-us-east-1-119174016168/datasets/covid19_articles_20220420.csv')

CPU times: user 31.6 s, sys: 7.6 s, total: 39.2 s
Wall time: 2min 11s


In [5]:
df.shape

(477551, 10)

In [6]:
df.drop(df.filter(regex='Unnamed'), axis=1, inplace=True)

In [7]:
df.drop(['author', 'date', 'domain', 'companies', 'locations', 'sdgs'], axis=1, inplace=True)

In [8]:
def clean(article: str) -> str:
    """Turn the text form of a list of paragraphs into newline-joined text.

    Args:
        article: A Python list literal stored as a string,
            e.g. "['first paragraph', 'second paragraph']".

    Returns:
        The list items joined with a newline character.

    Raises:
        ValueError: If `article` contains anything other than Python literals.
        SyntaxError: If `article` is not syntactically valid Python.
    """
    # Local import keeps this cell runnable on its own.
    import ast

    # literal_eval only accepts Python literals, so text coming from the CSV
    # cannot execute arbitrary code the way eval() would allow.
    return '\n'.join(ast.literal_eval(article))

In [9]:
df['content'] = df['content'].apply(clean)

In [10]:
df.rename(columns={'datatype': 'category'}, inplace=True)

In [11]:
df.head()

Unnamed: 0,title,content,category
0,Looking into the truth about modern workplace ...,"Hi, what are you looking for?\nBy\nPublished\n...",general
1,Hexo refiles financial statements,"New York reported a record 90,132 new Covid-19...",general
2,"Japan raid, Turkey arrests in widening Ghosn p...","Hi, what are you looking for?\nBy\nPublished\n...",general
3,Pope's bodyguards criticised over slapping inc...,"Hi, what are you looking for?\nBy\nPublished\n...",general
4,Lebanon denies president welcomed fugitive Ghosn,"Hi, what are you looking for?\nBy\nPublished\n...",general


### III. Prepare dataset for BERT MLM training 

In [12]:
mlm_df = df[['title', 'content']]

In [13]:
mlm_df.head()

Unnamed: 0,title,content
0,Looking into the truth about modern workplace ...,"Hi, what are you looking for?\nBy\nPublished\n..."
1,Hexo refiles financial statements,"New York reported a record 90,132 new Covid-19..."
2,"Japan raid, Turkey arrests in widening Ghosn p...","Hi, what are you looking for?\nBy\nPublished\n..."
3,Pope's bodyguards criticised over slapping inc...,"Hi, what are you looking for?\nBy\nPublished\n..."
4,Lebanon denies president welcomed fugitive Ghosn,"Hi, what are you looking for?\nBy\nPublished\n..."


In [14]:
def save_as_txt(df: DataFrame, path: str = './data/covid_articles.txt') -> None:
    """Write each article as its title followed by its content to a text file.

    Args:
        df: Frame with string `title` and `content` columns.
        path: Destination file. Missing parent directories are created.
            Defaults to the location the notebook uploads from.

    Raises:
        TypeError: If a title or content value is not a string.
    """
    # Local import keeps this cell runnable on its own.
    from pathlib import Path

    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)

    # The context manager closes the file even if a write fails part-way.
    with target.open('w', encoding='utf-8') as fout:
        for title, content in zip(df['title'].values, df['content'].values):
            # End every record with a newline so the next title does not run
            # onto the last line of the previous article's content.
            fout.write('\n'.join((title, content)) + '\n')

In [15]:
%%time

# Pass the frame prepared for MLM training (title + content only); it was
# built for this step but the full `df` was being passed instead.
save_as_txt(mlm_df)

CPU times: user 9.67 s, sys: 1.27 s, total: 10.9 s
Wall time: 30.3 s


In [16]:
sleep(5)

##### Copy dataset from local to S3

In [17]:
%%time

s3.meta.client.upload_file('./data/covid_articles.txt', S3_BUCKET, f'{S3_DATA_FOLDER}/covid_articles.txt')

CPU times: user 18.3 s, sys: 15.7 s, total: 34.1 s
Wall time: 12.7 s


### IV. Prepare dataset for sequence classification 

In [18]:
clf_df = df.copy()
clf_df.dropna(inplace=True)

In [19]:
def combine(title: str, content: str) -> str:
    """Return the title and the content as one string, separated by a newline."""
    parts = (title, content)
    return '\n'.join(parts)

In [20]:
# Build the combined text by zipping the two columns instead of a row-wise
# DataFrame.apply(axis=1), which constructs a Series for every row and is
# much slower on a frame of this size. The resulting values are the same.
clf_df['article'] = [
    combine(title, content)
    for title, content in zip(clf_df['title'], clf_df['content'])
]

In [21]:
clf_df.drop(['title', 'content'], axis=1, inplace=True)

In [22]:
clf_df.head()

Unnamed: 0,category,article
0,general,Looking into the truth about modern workplace ...
1,general,Hexo refiles financial statements\nNew York re...
2,general,"Japan raid, Turkey arrests in widening Ghosn p..."
3,general,Pope's bodyguards criticised over slapping inc...
4,general,Lebanon denies president welcomed fugitive Gho...


In [23]:
assert len(set(clf_df.category.values)) == 5

##### Label encode `category` column

In [24]:
label_encoder = LabelEncoder()
clf_df['category'] = label_encoder.fit_transform(clf_df['category'])

##### Get label mapping

In [25]:
label_map = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
label_map

{'business': 0, 'esg': 1, 'general': 2, 'science': 3, 'tech': 4}

##### Copy dataset from local to S3 

In [26]:
%%time 

clf_df.to_csv('./data/covid_articles_clf_data.csv', index=False, header=False)

CPU times: user 49.4 s, sys: 975 ms, total: 50.3 s
Wall time: 1min 14s


In [27]:
sleep(5)

In [28]:
%%time 

s3.meta.client.upload_file('./data/covid_articles_clf_data.csv', S3_BUCKET, f'{S3_DATA_FOLDER}/covid_articles_clf_data.csv')

CPU times: user 18.5 s, sys: 17.7 s, total: 36.3 s
Wall time: 13.9 s
