In [1]:
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import Dataset
import torch
import os
import json
import re
import emoji
from tqdm import tqdm
tqdm.pandas()
from transformers import Trainer, TrainingArguments
import numpy as np
from datasets import load_metric
from sklearn.model_selection import train_test_split
import ast

In [None]:
# The tokenizer and the classifier are both loaded from the same
# "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# num_labels=10 matches the ten category names listed in encode_label.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=10,
)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

In [None]:
def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` tag span deleted.

    The match is non-greedy, so each tag is removed individually and the text
    between tags is preserved. No DOTALL flag is used, so a ``<`` and ``>``
    separated by a newline are not treated as a tag.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)

def removePattern(text, pattern):
    """Delete every match of the regular expression ``pattern`` from ``text``.

    Args:
        text: The string to clean.
        pattern: A regular expression (string or compiled pattern).

    Returns:
        ``text`` with all non-overlapping matches of ``pattern`` removed.

    The substitution is done in a single ``re.sub`` pass over the original
    pattern. Feeding each matched string back into ``re.sub`` as a new
    pattern is avoided on purpose: it leaves residue when one match is a
    prefix of another (``@ab`` / ``@abc``), misbehaves when a match contains
    regex metacharacters, and removes only the captured group when the
    pattern contains groups.
    """
    return re.sub(pattern, '', text)

def remove_urls(vTEXT):
    """Return ``vTEXT`` with URL-like substrings removed.

    A URL here is an optional ``http``/``https`` scheme followed by ``://``
    and a run of word characters or any of ``. / ? = & %``, ending on a word
    boundary. Characters outside that set (for example ``-``) end the match.
    """
    url_pattern = re.compile(
        r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',
        flags=re.MULTILINE,
    )
    return url_pattern.sub('', vTEXT)

def clean_text(text):
    """Normalise a raw text value before tokenization.

    Non-string values (for example a missing value read from a CSV) are
    returned unchanged. Strings go through these steps in order:

    1. lower-casing,
    2. URL removal (``remove_urls``),
    3. HTML tag removal (``cleanhtml``),
    4. removal of ``@handle`` and ``&entity`` tokens (``removePattern``),
    5. removal of a fixed set of punctuation characters,
    6. removal of emoji.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string, or ``text`` itself when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text

    text = text.lower()  # lower case
    text = remove_urls(text)
    text = cleanhtml(text)
    # Raw strings: "\w" in a plain literal is an invalid escape sequence.
    text = removePattern(text, r"@[\w]*")  # remove handles
    text = removePattern(text, r"&[\w]*")  # remove &amp
    # remove special characters, punctuations
    text = re.sub('[!@$:);/#,.*$?।&"]', '', text)

    # remove emoji: newer releases of the emoji package provide replace_emoji
    # and no longer ship get_emoji_regexp, so prefer the former and fall back
    # to the latter on older installs.
    if hasattr(emoji, "replace_emoji"):
        text = emoji.replace_emoji(text, replace='')
    else:
        text = emoji.get_emoji_regexp().sub(u'', text)
    return text

In [None]:
def encode_label(x):
    """Map a category name to its integer class id.

    The id is the position of ``x`` in the fixed, ordered list of ten
    category names below. A name that is not in the list raises
    ``ValueError``.
    """
    ordered_categories = [
        'First Party Collection/Use',
        'Third Party Sharing/Collection',
        'Other',
        'International and Specific Audiences',
        'Data Security',
        'User Choice/Control',
        'User Access, Edit and Deletion',
        'Data Retention',
        'Policy Change',
        'Do Not Track',
    ]
    class_id = ordered_categories.index(x)
    return class_id

In [None]:
class PrivacyDataset(Dataset):
    """Torch dataset that serves tokenized texts and integer labels.

    On construction the frame's ``text`` column is passed through
    ``clean_text`` and a ``label`` column is derived from ``data_practice``
    via ``encode_label``. The frame is re-indexed from 0 first, so the
    caller's DataFrame object is not modified.
    """

    def __init__(self, df, tokenizer):
        """Prepare the frame and remember the tokenizer.

        Args:
            df: DataFrame with ``text`` and ``data_practice`` columns.
            tokenizer: Object exposing ``encode_plus``.
        """
        frame = df.reset_index(drop=True)
        frame["text"] = frame["text"].progress_apply(lambda raw: clean_text(raw))
        frame["label"] = frame["data_practice"].progress_apply(lambda name: encode_label(name))
        self.df = frame
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of rows in the prepared frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors.

        Returns:
            A dict with ``input_ids``, ``attention_mask`` and ``label``
            tensors. The text is padded/truncated to 512 tokens.
        """
        # Samplers may hand over a tensor index; convert it to plain Python.
        if torch.is_tensor(idx):
            idx = idx.tolist()

        encoding = self.tokenizer.encode_plus(
            text=self.df['text'][idx],
            truncation=True,
            max_length=512,
            padding='max_length',
            add_special_tokens=True,
            return_attention_mask=True,
        )

        return {
            'input_ids': torch.tensor(encoding.get('input_ids')),
            'attention_mask': torch.tensor(encoding.get('attention_mask')),
            'label': torch.tensor(self.df['label'][idx]),
        }

In [None]:
# Pick the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
# Training split.
train_dataset = PrivacyDataset(
    df=pd.read_csv("../datasets/OPP/train.csv"),
    tokenizer=tokenizer,
)

# Note: despite its name, test_dataset is built from the validation CSV.
test_dataset = PrivacyDataset(
    df=pd.read_csv("../datasets/OPP/val.csv"),
    tokenizer=tokenizer,
)

In [None]:
# metric1 -> accuracy, metric2 -> F1; both are used by compute_metrics.
metric1 = load_metric(path="accuracy")
metric2 = load_metric(path="f1")

In [None]:
def compute_metrics(eval_pred):
    """Turn model outputs into the accuracy and F1 values reported at evaluation.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class is the
            arg-max of ``logits`` along the last axis.

    Returns:
        A dict with keys ``'accuracy'`` and ``'f1-score'``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = metric1.compute(predictions=predictions, references=labels)
    # The classifier has ten classes, so the F1 metric's default binary
    # averaging cannot be used; it is rejected for multiclass targets.
    # Macro averaging weights every class equally. Switch to "weighted" or
    # "micro" here if a different aggregate is wanted.
    f1 = metric2.compute(
        predictions=predictions,
        references=labels,
        average="macro",
    )
    return {'accuracy': accuracy["accuracy"], 'f1-score': f1["f1"]}

In [None]:
training_args = TrainingArguments(
    # --- where results go -------------------------------------------------
    # NOTE(review): both paths are absolute and user-specific.
    output_dir='/scratch/arjunth2001/opp_results',   # checkpoints
    logging_dir='/scratch/arjunth2001/opp_logs',     # logs

    # --- optimisation -----------------------------------------------------
    num_train_epochs=15,
    learning_rate=1e-5,
    warmup_steps=500,                # learning-rate scheduler warmup steps
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

    # --- logging, evaluation and checkpointing ----------------------------
    logging_steps=250,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    save_total_limit=1,

    # --- model selection --------------------------------------------------
    # "eval_f1-score" is the 'f1-score' key returned by compute_metrics,
    # with the "eval_" prefix used for evaluation metrics.
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1-score",
)

In [None]:
trainer = Trainer(
    args=training_args,               # hyper-parameters and output locations
    model=model,                      # the sequence classifier to fine-tune
    compute_metrics=compute_metrics,  # accuracy / F1 reported at evaluation
    train_dataset=train_dataset,      # built from train.csv
    eval_dataset=test_dataset,        # built from val.csv
)

In [None]:
# Run fine-tuning as configured in training_args: 15 epochs, with evaluation
# and checkpointing once per epoch.
trainer.train()

In [None]:
# Save the trainer's model under ./model_opp115, relative to the current
# working directory. With load_best_model_at_end=True this is presumably the
# best checkpoint by eval_f1-score. TODO confirm.
trainer.save_model("./model_opp115")