<a href="https://colab.research.google.com/github/arashms/DL-project/blob/main/COVID_Regulations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install and Import libraries and define parameters

### Install & Import Libraries

In [46]:
!pip install transformers



In [47]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.nn.functional import softmax
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import csv 
import random
from transformers import BertTokenizer, RobertaTokenizer
from transformers import BertForNextSentencePrediction, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from scipy import stats
import time
import datetime
import pickle
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import gc
import itertools
import nltk
import transformers
# nltk.download('punkt')

### Setting Parameters

In [48]:
# Project locations on the mounted Google Drive.
data_folder = '/content/drive/MyDrive/DL-project/'
save_path = '/content/drive/MyDrive/DL-project/models/'

# Training hyper-parameters.
# NOTE(review): none of these is referenced in the cells visible in this part
# of the notebook; the embedding DataLoader uses a literal batch size of 32.
epochs = 1
batch_size = 16
learning_rate = 1e-6

# NOTE(review): presumably the fraction of samples held out for validation - confirm at the split.
validation_ratio = 0.2

# Token-length limits. NOTE: the tokenizing cell later rebinds max_length to 64.
max_length = 128
max_sentence_length = 64

### Setting device, random seed, and runtime parameters

In [49]:
# Two fixed seeds: `seed2` drives the Python, NumPy and torch generators,
# `seed` is applied to the CUDA generators afterwards.
seed = 204920
seed2 = 293652

# Pick the GPU when one is visible, otherwise fall back to the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
print("Cuda available: ", has_cuda)
if has_cuda:
    print("Current device: ", torch.cuda.current_device())

for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed2)

if has_cuda:
    torch.cuda.manual_seed_all(seed)

Cuda available:  True
Current device:  0


# 2. Reading the Dataset

In [50]:
# Mount Google Drive at /content/drive so the project files under
# /content/drive/MyDrive/ can be read by the following cells.
# NOTE(review): google.colab is expected to be importable only inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
from sklearn.preprocessing import LabelEncoder as le
!cp  /content/drive/MyDrive/DL-project/Combined.xlsx Combined.xlsx
!cp  /content/drive/MyDrive/DL-project/acaps_covid19_government_measures_dataset_0.xlsx acaps_covid19_government_measures_dataset_0.xlsx

xl_file = pd.ExcelFile('acaps_covid19_government_measures_dataset_0.xlsx')

sheets = {sheet_name: xl_file.parse(sheet_name) 
          for sheet_name in xl_file.sheet_names}

dataframe = sheets['Dataset']

print('Sheets in the dataset:   ', sheets.keys())
print('Number of regulations in the dataset: ', len(dataframe), '\n')

print(dataframe.head())
print('\n', dataframe.info(), '\n')

# print(list(dataframe['ID'][0:10]))
# print(dataframe.iloc[0])

dataset = {}
for key in dataframe:
    # dataframe[key] = le.fit_transform(dataframe[key].astype(str))
    dataframe[key]=dataframe[key].astype('str')
    dataset[key] = list(dataframe[key])


Sheets in the dataset:    dict_keys(['About', 'Dataset', 'Dictionary'])
Number of regulations in the dataset:  23923 

     ID  ISO  ... ENTRY_DATE Alternative source
0  4245  AFG  ... 2020-04-07                NaN
1  4246  AFG  ... 2020-04-07                NaN
2  4247  AFG  ... 2020-04-07                NaN
3  4248  AFG  ... 2020-04-07                NaN
4    23  AFG  ... 2020-03-14                NaN

[5 rows x 18 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23923 entries, 0 to 23922
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ID                  23923 non-null  int64         
 1   ISO                 23923 non-null  object        
 2   COUNTRY             23923 non-null  object        
 3   REGION              23923 non-null  object        
 4   ADMIN_LEVEL_NAME    3682 non-null   object        
 5   PCODE               0 non-null      float64       
 6   LOG_TYPE

# 3. Processing the dataset

### Some statistics of the dataset

In [52]:
def get_frequency_stat(attr, data=None):
    """Print the (up to) ten most frequent values of one column and their share.

    Args:
        attr: Name of the column to summarise.
        data: Optional mapping from column name to an iterable of hashable
            values. When omitted, the notebook-level ``dataset`` is used, so
            existing ``get_frequency_stat(attr)`` calls behave as before.

    Returns:
        None. The report is written to stdout: a header, the number of
        distinct values, then one ``value : % share`` line per value.

    Raises:
        KeyError: if ``attr`` is not a key of the mapping.
    """
    from collections import Counter

    column = (dataset if data is None else data)[attr]
    counts = Counter(column)

    # Most frequent first; equal frequencies are ordered by value, descending.
    ranked = sorted(counts.items(), key=lambda pair: (pair[1], pair[0]), reverse=True)
    total = sum(counts.values())

    print('\nFrequency Stat of ', attr)
    print('Number of different values: ', len(ranked), '\n')

    # An empty column yields an empty ranking, so nothing is printed here and
    # no division by a zero total can occur.
    for value, freq in ranked[:10]:
        print(value, ': %', 100 * freq / total)


# print(dataframe['SOURCE'].value_counts())
# Print the frequency report for every column held in `dataset`.
for key in dataset:
    get_frequency_stat(key)



Frequency Stat of  ID
Number of different values:  23923 

9999 : % 0.00418007774944614
9998 : % 0.00418007774944614
9997 : % 0.00418007774944614
9996 : % 0.00418007774944614
9995 : % 0.00418007774944614
9994 : % 0.00418007774944614
9993 : % 0.00418007774944614
9992 : % 0.00418007774944614
9991 : % 0.00418007774944614
9990 : % 0.00418007774944614

Frequency Stat of  ISO
Number of different values:  193 

GBR : % 2.7379509258872217
AUS : % 2.3157630731931613
USA : % 2.03569786398027
PHL : % 1.910295531496886
DNK : % 1.5006479120511642
CAN : % 1.379425657317226
NZL : % 1.2665635580821804
LKA : % 1.2665635580821804
PRT : % 1.228942858337165
MYS : % 1.2164026250888267

Frequency Stat of  COUNTRY
Number of different values:  193 

United Kingdom : % 2.7379509258872217
Australia : % 2.3157630731931613
United States : % 2.03569786398027
Philippines : % 1.910295531496886
Denmark : % 1.5006479120511642
Canada : % 1.379425657317226
Sri Lanka : % 1.2665635580821804
New Zealand : % 1.266563558082

### Classifying regulations based on Measure and Category

In [53]:
# Group row positions first by CATEGORY and then by MEASURE:
#   regulation_types[category][measure] -> list of row indices
regulation_types = {}

for row, (category, measure) in enumerate(zip(dataset['CATEGORY'], dataset['MEASURE'])):
    regulation_types.setdefault(category, {}).setdefault(measure, []).append(row)

# For each category print how many measures it contains, followed by the share
# of all rows that falls into each (category, measure) pair.
total_rows = len(dataset['ID'])

for category, by_measure in regulation_types.items():

    print(len(by_measure))

    for measure, rows in by_measure.items():
        print('CATEGORY: ', category, ' MEASURE: ', measure, ' %',
              100 * len(rows) / total_rows)


12
CATEGORY:  Public health measures  MEASURE:  Awareness campaigns  % 3.260460644567989
CATEGORY:  Public health measures  MEASURE:  Health screenings in airports and border crossings  % 1.617690089035656
CATEGORY:  Public health measures  MEASURE:  Strengthening the public health system  % 7.733143836475358
CATEGORY:  Public health measures  MEASURE:  Isolation and quarantine policies  % 5.726706516741212
CATEGORY:  Public health measures  MEASURE:  Other public health measures enforced  % 3.7369895080048487
CATEGORY:  Public health measures  MEASURE:  General recommendations  % 3.9125527734815866
CATEGORY:  Public health measures  MEASURE:  Requirement to wear protective gear in public  % 2.98875559085399
CATEGORY:  Public health measures  MEASURE:  Testing policy  % 2.5038665719182376
CATEGORY:  Public health measures  MEASURE:  Amendments to funeral and burial regulations  % 0.568490573924675
CATEGORY:  Public health measures  MEASURE:  Mass population testing  % 0.589390962671905

# 4. Comment embeddings from the DeBERTa model

### Loading Comments

In [63]:
# Re-read the ACAPS workbook. From here on `dataset` is the 'Dataset' sheet as
# a DataFrame (it replaces the dict of string columns built earlier).
xl_file = pd.ExcelFile("acaps_covid19_government_measures_dataset_0.xlsx")

dfs = {}
for sheet_name in xl_file.sheet_names:
    dfs[sheet_name] = xl_file.parse(sheet_name)

dataset = dfs['Dataset']

# Rows whose COMMENTS cell is filled in.
has_comment = dataset['COMMENTS'].notna()

# IDs of those rows; not_null_ids[k] is the ID belonging to comments[k].
not_null_ids = list(dataset.loc[has_comment, "ID"])

# The non-missing comments themselves, in the same order.
comments = list(dataset.loc[has_comment, "COMMENTS"])

print(f'number of relugations that the their comment is not null {len(comments)}')

number of relugations that the their comment is not null 23799


### Tokenizing

In [55]:
# Upper bound on tokens per comment for the embedding pass
# (this rebinds the max_length set in the parameters cell).
max_length = 64

tokenizer = transformers.DebertaTokenizer.from_pretrained('microsoft/deberta-base')

# Tokenize all comments in one call, with truncation and padding enabled.
train_encodings = tokenizer(
    comments,
    add_special_tokens=True,
    return_token_type_ids=False,
    truncation=True,
    padding=True,
    max_length=max_length,
)

### Creating pytorch Dataset

In [56]:
class NSPDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of tokenizer outputs.

    `encodings` maps a field name (it must include 'input_ids') to a sequence
    with one entry per sample. No labels are attached to the samples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One sample per row of 'input_ids'.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Build the sample field by field, converting each entry to a tensor.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample

In [57]:
# Wrap the tokenized comments so they can be fed to the model in batches.
# shuffle=False keeps the batches in dataset order; the batch size is the
# literal 32 here, not the `batch_size` value from the parameters cell.
train_dataset = NSPDataset(train_encodings)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)

### Creating DeBERTa Model

In [58]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Pretrained DeBERTa encoder, used below only as a frozen feature extractor.
model = transformers.DebertaModel.from_pretrained('microsoft/deberta-base')

# Spread batches over all GPUs when more than one is available.
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

model.to(device)
# Evaluation mode: the model is only used for inference (embedding extraction
# under torch.no_grad()), so the dropout layers must be disabled. Leaving it in
# train mode makes the embeddings noisy and different on every run.
model.eval()

DebertaModel(
  (embeddings): DebertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=0)
    (LayerNorm): DebertaLayerNorm()
    (dropout): StableDropout()
  )
  (encoder): DebertaEncoder(
    (layer): ModuleList(
      (0): DebertaLayer(
        (attention): DebertaAttention(
          (self): DisentangledSelfAttention(
            (in_proj): Linear(in_features=768, out_features=2304, bias=False)
            (pos_dropout): StableDropout()
            (pos_proj): Linear(in_features=768, out_features=768, bias=False)
            (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
            (dropout): StableDropout()
          )
          (output): DebertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): DebertaLayerNorm()
            (dropout): StableDropout()
          )
        )
        (intermediate): DebertaIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bia

### Generating Comment Embeddings

In [59]:
all_cls = []

# tqdm wraps the loader itself (not enumerate(...)) so it can read
# len(train_loader) and display the total and an ETA.
for batch in tqdm(train_loader):

    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():

        last_hidden_state = model(input_ids, attention_mask=attention_mask).last_hidden_state

        # Keep the hidden state of the first token of every sequence.
        # .clone() copies just that slice; a bare slice is a view that would
        # keep the whole batch hidden state alive for as long as all_cls exists.
        cls_tokens = last_hidden_state[:, 0, :].clone()

        all_cls.append(cls_tokens)


# out_cls is a matrix of size number_of_not_null_comments (23799) X hidden_size_of_DeBERTa (768).
# Row k is the embedding vector of comments[k].
# Use the "not_null_ids" list to map each row to the ID of its regulation.
out_cls = torch.cat(all_cls, 0)

print("shape of output matrix :", out_cls.shape)

744it [01:48,  6.86it/s]

shape of output matrix : torch.Size([23799, 768])





# 5. Preparing Input and Output Features

### Creating feature maps

In [66]:
# Integer codes for the categorical columns: every distinct value gets the
# next free integer, in order of first appearance.
category_map = {}
measure_map = {}
log_type_map = {}

# Iterate over the columns positionally instead of doing one label-based
# Series lookup per row and per column.
for category, measure, log_type in zip(dataset['CATEGORY'],
                                       dataset['MEASURE'],
                                       dataset['LOG_TYPE']):
    # setdefault only inserts when the value has not been seen yet; the code
    # is the size of the map before the insertion.
    category_map.setdefault(category, len(category_map))
    measure_map.setdefault(measure, len(measure_map))
    log_type_map.setdefault(log_type, len(log_type_map))


# Row position of every regulation ID (for a repeated ID the last row wins).
# A comprehension keeps the loop variables local, so the builtin `id` is not
# shadowed at notebook scope.
id2index_map = {record_id: position
                for position, record_id in enumerate(dataset['ID'])}

### Preparing input features

In [67]:
def _one_hot(position, size):
    """Return a list of `size` zeros with a single 1 at index `position`."""
    vector = [0] * size
    vector[position] = 1
    return vector


# One embedding vector (list of floats) per non-null comment.
message_embeddings = out_cls.tolist()
categories = []
measures = []
log_types = []

# id2index_map stores row *positions*, so read the columns positionally from
# plain lists rather than through label-based Series lookups.
category_values = list(dataset['CATEGORY'])
measure_values = list(dataset['MEASURE'])
log_type_values = list(dataset['LOG_TYPE'])

n_categories = len(category_map)
n_measures = len(measure_map)

# Build the features in the order of not_null_ids so that row k of every
# feature list describes the same regulation.
for record_id in not_null_ids:
    index = id2index_map[record_id]

    # CATEGORY and MEASURE are one-hot encoded.
    categories.append(_one_hot(category_map[category_values[index]], n_categories))
    measures.append(_one_hot(measure_map[measure_values[index]], n_measures))

    # LOG_TYPE is kept as its integer code.
    log_types.append(log_type_map[log_type_values[index]])


### Reading combined dataset & Preparing output labels

In [68]:
# Load every sheet of the combined workbook and work with the 'Policy' sheet.
xl_file = pd.ExcelFile('Combined.xlsx')

sheets = dict((sheet_name, xl_file.parse(sheet_name))
              for sheet_name in xl_file.sheet_names)

dataframe = sheets['Policy']

print('Sheets in the dataset:   ', sheets.keys())
print('Number of regulations in the dataset: ', len(dataframe), '\n')

print(dataframe.head())

# Cast every column to str (in place) and expose the sheet as a dict of lists:
#   combined_dataset[column_name] -> list of string values, one per row.
combined_dataset = {}
for column_name in dataframe:
    as_text = dataframe[column_name].astype('str')
    dataframe[column_name] = as_text
    combined_dataset[column_name] = list(as_text)

Sheets in the dataset:    dict_keys(['Cases', 'Policy', 'HyperParam'])
Number of regulations in the dataset:  23923 

      ID  ISO            COUNTRY  ... Label_delta Label_percent  Label_3class
0   1448  TUR             Turkey  ...         0.0           0.0          -1.0
1  15613  GNQ  Equatorial Guinea  ...         0.0           0.0          -1.0
2   9941  TJK         Tajikistan  ...         0.0           0.0          -1.0
3  12714  GNQ  Equatorial Guinea  ...         0.0           0.0          -1.0
4  12715  GNQ  Equatorial Guinea  ...         0.0           0.0          -1.0

[5 rows x 30 columns]


In [70]:
def _parse_binary_label(raw):
    """Turn a stringified label into 0 or 1, or -1 when it is not a binary label.

    Every column of the combined sheet was cast to str, so a numeric label
    arrives as text such as '0.0', '1.0' or 'nan'. Comparing that text with
    '0' / '1' never matches a float-formatted value, which marked every sample
    as unlabeled. Parsing numerically accepts '0', '1', '0.0' and '1.0' alike.
    """
    try:
        value = float(raw)
    except (TypeError, ValueError):
        return -1
    # NaN and any value other than exactly 0 or 1 count as "no label".
    return int(value) if value in (0.0, 1.0) else -1


# Raw (string) Label_percent of every regulation, keyed by integer ID.
id2label = {}
for i in range(len(combined_dataset['ID'])):
    id2label[int(combined_dataset['ID'][i])] = combined_dataset['Label_percent'][i]

# One label per non-null comment, aligned with not_null_ids; -1 marks a missing
# (NaN) label. A comprehension keeps the loop variable local, so the builtin
# `id` is not shadowed at notebook scope.
labels = [_parse_binary_label(id2label[record_id]) for record_id in not_null_ids]
        # The label is NAN!

In [72]:
# Summary of the prepared data
#   ids:      not_null_ids
#   features: message_embeddings, categories, measures, log_types
#   labels:   labels

def _show_dims(title, rows, nested=False):
    """Print `title` followed by len(rows) and, if nested, len(rows[0])."""
    dims = [len(rows)]
    if nested:
        dims.append(len(rows[0]))
    print(title, *dims)


_show_dims('Not null ids dimension: ', not_null_ids)
_show_dims('Log type dimension: ', log_types)
_show_dims('Category dimension: ', categories, nested=True)
_show_dims('Measure dimension: ', measures, nested=True)
_show_dims('Message Embedding dimension: ', message_embeddings, nested=True)
_show_dims('Label dimension: ', labels)

Not null ids dimension:  23799
Log type dimension:  23799
Category dimension:  23799 6
Measure dimension:  23799 35
Message Embedding dimension:  23799 768
Label dimension:  23799
