#### Prerequisites

* Choose `Switch instance type` above
* Toggle `Fast launch only` off to list more instance types
* Change instance type to `ml.m5.2xlarge`
* For Kernel, choose `Python 3 (Data Science)`

In [1]:
%%capture 

!pip install pandas==1.0.1
!pip install sklearn==0.22.1
!pip install boto3==1.24.12
!pip install sagemaker==2.100.0

### I. Imports 

In [2]:
import ast
import logging
import pickle
from time import sleep

import boto3
import pandas as pd
import sagemaker
import sklearn
from pandas import DataFrame
from sagemaker import Session
from sklearn.preprocessing import LabelEncoder

##### Setup logging

In [3]:
# Send the 'sagemaker' logger's messages to the notebook output at DEBUG level.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# Attach the stream handler only once: re-running this cell would otherwise
# stack another handler on the same logger and print every message again.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies

In [4]:
# Record the version of every pinned dependency in the log output.
for label, module in (
    ('SageMaker', sagemaker),
    ('Sklearn', sklearn),
    ('Boto3', boto3),
    ('Pandas', pd),
):
    logger.info(f'[Using {label} version: {module.__version__}]')

[Using SageMaker version: 2.100.0]
[Using Sklearn version: 0.22.1]
[Using Boto3 version: 1.24.12]
[Using Pandas version: 1.0.1]


##### Essentials

In [5]:
# Key prefix inside the bucket under which the prepared datasets are uploaded.
S3_DATA_FOLDER = 'data'

# boto3 S3 resource and SageMaker session used by the rest of the notebook.
s3 = boto3.resource('s3')
session = Session()

# The session's default bucket is the upload destination.
S3_BUCKET = session.default_bucket()
logger.info(f'S3 bucket = {S3_BUCKET}')

S3 bucket = sagemaker-us-east-1-119174016168


### II. Load and prepare data 

In [6]:
# S3 URI of the raw COVID-19 articles CSV.
# NOTE(review): the bucket name is hard-coded and account-specific; other
# accounts must point this at their own copy of the file.
RAW_INPUT_DATA_S3_LOCATION = (
    's3://sagemaker-us-east-1-119174016168'
    '/datasets/covid19_articles_20220420.csv'
)

In [7]:
%%time

df = pd.read_csv(RAW_INPUT_DATA_S3_LOCATION)
df.shape

CPU times: user 31.8 s, sys: 7.92 s, total: 39.7 s
Wall time: 1min 51s


(477551, 10)

In [8]:
# Remove every column whose name matches 'Unnamed' together with the metadata
# fields that are not used by the rest of the notebook.
unnamed_columns = list(df.filter(regex='Unnamed').columns)
metadata_columns = ['author', 'date', 'domain', 'companies', 'locations', 'sdgs']
df.drop(columns=unnamed_columns + metadata_columns, inplace=True)

In [9]:
def clean(article: str) -> str:
    """Join the paragraphs of a serialized article into one string.

    Args:
        article: Text holding a Python list literal of paragraph strings,
            e.g. "['first paragraph', 'second paragraph']".

    Returns:
        The paragraphs joined with newline characters.

    Raises:
        ValueError: If `article` contains anything other than literals.
        SyntaxError: If `article` is not syntactically valid Python.
    """
    # literal_eval only parses literals, so text loaded from the CSV cannot
    # execute arbitrary code the way eval() would.
    return '\n'.join(ast.literal_eval(article))

In [10]:
# Flatten each serialized paragraph list into plain text, then expose the
# `datatype` column under the name `category`.
df['content'] = df['content'].map(clean)
column_renames = {'datatype': 'category'}
df.rename(columns=column_renames, inplace=True)
df.head()

Unnamed: 0,title,content,category
0,Looking into the truth about modern workplace ...,"Hi, what are you looking for?\nBy\nPublished\n...",general
1,Hexo refiles financial statements,"New York reported a record 90,132 new Covid-19...",general
2,"Japan raid, Turkey arrests in widening Ghosn p...","Hi, what are you looking for?\nBy\nPublished\n...",general
3,Pope's bodyguards criticised over slapping inc...,"Hi, what are you looking for?\nBy\nPublished\n...",general
4,Lebanon denies president welcomed fugitive Ghosn,"Hi, what are you looking for?\nBy\nPublished\n...",general


### III. Prepare dataset for BERT MLM training 

In [11]:
# Keep only the two text columns for the MLM dataset.
mlm_columns = ['title', 'content']
mlm_df = df[mlm_columns]
mlm_df.head()

Unnamed: 0,title,content
0,Looking into the truth about modern workplace ...,"Hi, what are you looking for?\nBy\nPublished\n..."
1,Hexo refiles financial statements,"New York reported a record 90,132 new Covid-19..."
2,"Japan raid, Turkey arrests in widening Ghosn p...","Hi, what are you looking for?\nBy\nPublished\n..."
3,Pope's bodyguards criticised over slapping inc...,"Hi, what are you looking for?\nBy\nPublished\n..."
4,Lebanon denies president welcomed fugitive Ghosn,"Hi, what are you looking for?\nBy\nPublished\n..."


In [12]:
def save_as_txt(df: DataFrame, path: str = '.././data/covid_articles.txt') -> None:
    """Write every article's title and content to a UTF-8 text file.

    Each article is written as its title, a newline, its content and a
    terminating newline, so consecutive articles never share a line.

    Args:
        df: Frame exposing string-valued `title` and `content` columns.
        path: Destination file. Defaults to the location the notebook
            uploads from.

    Raises:
        TypeError: If a title or content value is not a string.
        OSError: If the destination cannot be opened or written.
    """
    # `with` closes the file even when a write fails part-way through.
    with open(path, 'w', encoding='utf-8') as fout:
        for title, content in zip(df.title.values, df.content.values):
            fout.write('\n'.join([title, content]))
            # Terminate the record; without this the next title is glued to
            # the last line of this article's content.
            fout.write('\n')

In [13]:
%%time

save_as_txt(df)

CPU times: user 9.52 s, sys: 1.15 s, total: 10.7 s
Wall time: 32.9 s


In [14]:
sleep(5)  # NOTE(review): 5 s pause before the upload; the preceding step writes a local file, not S3 — confirm the wait is needed

##### Copy dataset from local to S3

In [15]:
%%time

s3.meta.client.upload_file('.././data/covid_articles.txt', S3_BUCKET, f'{S3_DATA_FOLDER}/covid_articles.txt')

CPU times: user 19.3 s, sys: 17.4 s, total: 36.6 s
Wall time: 18.1 s


### IV. Prepare dataset for sequence classification 

In [16]:
# Independent copy of the articles with every row containing a missing value removed.
clf_df = df.dropna().copy()

In [17]:
def combine(title: str, content: str) -> str:
    """Return the title followed by the content, separated by one newline."""
    separator = '\n'
    return separator.join((title, content))

In [18]:
# Build the `article` column: title and content merged into one text field.
clf_df['article'] = [
    combine(title, content)
    for title, content in zip(clf_df['title'], clf_df['content'])
]
clf_df.head()

Unnamed: 0,title,content,category,article
0,Looking into the truth about modern workplace ...,"Hi, what are you looking for?\nBy\nPublished\n...",general,Looking into the truth about modern workplace ...
1,Hexo refiles financial statements,"New York reported a record 90,132 new Covid-19...",general,Hexo refiles financial statements\nNew York re...
2,"Japan raid, Turkey arrests in widening Ghosn p...","Hi, what are you looking for?\nBy\nPublished\n...",general,"Japan raid, Turkey arrests in widening Ghosn p..."
3,Pope's bodyguards criticised over slapping inc...,"Hi, what are you looking for?\nBy\nPublished\n...",general,Pope's bodyguards criticised over slapping inc...
4,Lebanon denies president welcomed fugitive Ghosn,"Hi, what are you looking for?\nBy\nPublished\n...",general,Lebanon denies president welcomed fugitive Gho...


In [19]:
# Sanity check: exactly five distinct categories are expected.
distinct_categories = set(clf_df['category'].values)
assert len(distinct_categories) == 5

##### Label encode `category` column

In [20]:
# Fit the encoder on the category names, then replace the names with their codes.
label_encoder = LabelEncoder()
label_encoder.fit(clf_df['category'])
clf_df['category'] = label_encoder.transform(clf_df['category'])

##### Get label mapping

In [21]:
# Map every category name to the code the fitted encoder assigns to it.
category_names = label_encoder.classes_
label_map = {
    name: code
    for name, code in zip(category_names, label_encoder.transform(category_names))
}
label_map

{'business': 0, 'esg': 1, 'general': 2, 'science': 3, 'tech': 4}

##### Save label mapping to be used during inference

In [22]:
# Persist the name-to-code mapping with the newest pickle protocol available.
pickle_protocol = pickle.HIGHEST_PROTOCOL
with open('.././data/label_map', 'wb') as label_map_file:
    pickle.dump(label_map, label_map_file, protocol=pickle_protocol)

##### Save dataset locally, then copy it to S3

In [23]:
%%time 

clf_df.to_csv('.././data/covid_articles_clf_data.csv',  encoding='utf-8', index=False, header=False)

CPU times: user 1min 37s, sys: 1.64 s, total: 1min 39s
Wall time: 2min 27s


In [24]:
sleep(5)  # NOTE(review): 5 s pause before the upload; the preceding step writes a local CSV, not S3 — confirm the wait is needed

In [25]:
%%time 

s3.meta.client.upload_file('.././data/covid_articles_clf_data.csv', S3_BUCKET, f'{S3_DATA_FOLDER}/covid_articles_clf_data.csv')

CPU times: user 38.2 s, sys: 34.6 s, total: 1min 12s
Wall time: 30.9 s


#### Clean up local copies of data 

In [26]:
# Delete the local MLM text file; an earlier cell uploads it to S3.
! rm .././data/covid_articles.txt

In [27]:
# Delete the local classification CSV; an earlier cell uploads it to S3.
! rm .././data/covid_articles_clf_data.csv