## Setup

In [1]:
!pip install -q hazm
!pip install -q clean-text[gpl]
!pip install -q persian_wordcloud
!pip install -q transformers

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from cleantext import clean
import hazm
import os
import re
import json
from persian_wordcloud.wordcloud import PersianWordCloud, add_stop_words
from IPython.display import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from transformers import BertConfig, BertTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

# Seed the NumPy and PyTorch random number generators with one shared value.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

### Loading data

In [4]:
# Training split: tab-separated UTF-8 file, reduced to the text and label columns.
df = pd.read_csv(
    '../input/sanppfood-sentiment-analysis/train.csv',
    encoding='utf-8',
    sep='\t',
)[['comment', 'label', 'label_id']]

In [7]:
# Validation split: tab-separated UTF-8 file, reduced to the text and label columns.
df_val = pd.read_csv(
    '../input/sanppfood-sentiment-analysis/dev.csv',
    encoding='utf-8',
    sep='\t',
)[['comment', 'label', 'label_id']]

In [8]:
# Test split: tab-separated UTF-8 file, reduced to the text and label columns.
df_test = pd.read_csv(
    '../input/sanppfood-sentiment-analysis/test.csv',
    encoding='utf-8',
    sep='\t',
)[['comment', 'label', 'label_id']]

In [9]:
# Preview the first rows of the training frame.
df.head()

In [10]:
# (rows, columns) of the training frame.
df.shape

## Preprocessing

In [11]:
# Column dtypes and non-null counts of the training frame (a quick check for missing values).
df.info()

## Data Cleaning

In [13]:
def clean_html(s):
    """Return `s` with every `<...>` span (matched non-greedily) removed."""
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', s)
    
# Emoji, pictographs and invisible directional/joiner marks that are stripped from comments.
# Compiled once here instead of on every call.
# NOTE(review): the U+24C2-U+1F251 range is very wide (it also spans CJK and the Arabic
# presentation-form blocks) - confirm that dropping those characters is intended.
_EXTRA_CHARACTERS = re.compile("["
    u"\U0001F600-\U0001F64F"
    u"\U0001F300-\U0001F5FF"
    u"\U0001F680-\U0001F6FF"
    u"\U0001F1E0-\U0001F1FF"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u'\U00010000-\U0010ffff'
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    u"\u2069"
    u"\u2066"
    u"\u2068"
    u"\u2067"
    "]+", flags=re.UNICODE)


def _get_normalizer():
  """Return a shared hazm.Normalizer, created on first use and reused afterwards."""
  if not hasattr(_get_normalizer, '_instance'):
    _get_normalizer._instance = hazm.Normalizer()
  return _get_normalizer._instance


def text_preprocessing(s):
  """Clean one raw comment string and return the cleaned text.

  Steps, in order: strip surrounding whitespace; run `cleantext.clean`
  (unicode fixes, lower-casing, line breaks stripped, URLs / e-mails / phone
  numbers replaced by the empty string); remove `<...>` tags; apply the hazm
  normalizer; drop emoji and invisible marks; collapse whitespace runs to a
  single space; remove @mentions; remove '#' characters.

  Args:
    s: the raw comment (a `str`).

  Returns:
    The cleaned comment as a `str`.
  """
  s = s.strip()
  s = clean(
    s,
    fix_unicode=True,               # fix various unicode errors
    to_ascii=False,                 # keep non-ASCII text (no transliteration)
    lower=True,                     # lowercase text
    no_line_breaks=True,            # fully strip line breaks as opposed to only normalizing them
    no_urls=True,                   # replace all URLs with a special token
    no_emails=True,                 # replace all email addresses with a special token
    no_phone_numbers=True,          # replace all phone numbers with a special token
    no_numbers=False,               # keep numbers
    no_digits=False,                # keep digits
    no_currency_symbols=False,      # keep currency symbols
    no_punct=False,                 # keep punctuation
    replace_with_punct="",          # instead of removing punctuations you may replace them
    replace_with_url="",
    replace_with_email="",
    replace_with_phone_number="",
    replace_with_number="",
    replace_with_digit="0",
    replace_with_currency_symbol="",
  )

  s = clean_html(s)

  # Building a Normalizer per comment is wasted work; reuse one instance.
  s = _get_normalizer().normalize(s)

  s = _EXTRA_CHARACTERS.sub(r'', s)
  s = re.sub(r"\s+", " ", s)              # collapse every whitespace run to one space
  # @mentions: also match a mention that ends the text (no trailing whitespace),
  # which the previous pattern left in place.
  s = re.sub(r'@\S*(?:\s|$)', ' ', s)
  s = re.sub("#", "", s)                  # hashtags: drop the '#' sign, keep the word

  return s


In [14]:
# Clean every training comment; the result is stored next to the raw text.
df['cleaned_comment'] = df['comment'].map(text_preprocessing)

In [15]:
# Clean every validation comment with the same preprocessing function.
df_val['cleaned_comment'] = df_val['comment'].map(text_preprocessing)

In [16]:
# Clean every test comment with the same preprocessing function.
df_test['cleaned_comment'] = df_test['comment'].map(text_preprocessing)

In [None]:
# Display the image file 'result.png' at 500x500.
# NOTE(review): no cell in the visible part of the notebook writes 'result.png' -
# confirm the file exists before running this cell.
Image('result.png', width=500, height=500)

### Wordcloud for sad comments (label_id == 1)

In [19]:
# Add a column holding each cleaned comment's length, counted in hazm word tokens.
df['comment_length'] = df['cleaned_comment'].map(lambda text: len(hazm.word_tokenize(text)))

In [21]:
# Build label <-> integer-id lookup tables from the sorted distinct training labels.
# NOTE(review): these ids follow the sort order of the label names, while the model
# targets come from the dataset's own `label_id` column - confirm both numberings agree.
all_labels = sorted(df['label'].unique())
label2id = dict(zip(all_labels, range(len(all_labels))))
id2label = dict(enumerate(all_labels))

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

Continue with the model: LogisticRegression

In [28]:
# The label names, listed in id order.
id2label.values()

# ParsBERT

In [33]:
# Pick the first CUDA device when one is available, otherwise fall back to the CPU.
train_on_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if train_on_gpu else "cpu")
print(f'device: {device}')

if train_on_gpu:
    print('CUDA is available!  Training on GPU ...')
else:
    print('CUDA is not available.  Training on CPU ...')

## Config

In [34]:
# general config
# NOTE(review): the three *_BATCH_SIZE constants are not referenced by the visible cells;
# the data-loader cell defines and uses its own BATCH_SIZE.
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16

EPOCHS = 3  # NOTE(review): reassigned in the training-phase cell before the scheduler is built
EEVERY_EPOCH = 1000  # NOTE(review): name looks like a typo of EVERY_EPOCH; unused in the visible cells
LEARNING_RATE = 2e-5
CLIP = 0.0  # NOTE(review): unused in the visible cells; train_epoch clips with a literal max_norm

MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
OUTPUT_PATH = '/content/drive/MyDrive/Assignments'

# os.path.dirname drops the last path component, so this creates the *parent* of
# OUTPUT_PATH, not OUTPUT_PATH itself.
# NOTE(review): if OUTPUT_PATH is meant to be a directory, confirm this is intended.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [35]:
# Load the pretrained tokenizer and the model configuration, attaching the
# label maps to the configuration.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH,
    label2id=label2id,
    id2label=id2label,
)

print(config.to_json_string())

### Choosing Max Sequence Length

In [36]:
# Number of token ids the tokenizer produces for each cleaned training comment
# (encodings are truncated at 512 ids).
token_lens = [
  len(tokenizer.encode(text, max_length=512, truncation=True))
  for text in df.cleaned_comment
]

In [37]:
# Distribution of token counts, with the x-axis limited to the 0-256 range.
token_grid = sns.displot(token_lens, kde=True, height=8, aspect=2)
token_grid.set(xlim=(0, 256), xlabel='Token count');

Most of the comments appear to be less than 100 tokens long, but we'll be careful and set a maximum length of 128.

In [38]:
# Maximum sequence length handed to the tokenizer when the data loaders encode comments.
MAX_LEN = 128

## Dataset preparation

In [39]:
class SanppfoodDataset(Dataset):
  """Map-style dataset that tokenizes one comment per requested item.

  Args:
    comments: indexable collection of comment strings.
    targets: indexable collection of integer class ids, aligned with `comments`.
    tokenizer: object exposing `encode_plus` (here a BERT tokenizer).
    max_len: length every encoded comment is truncated / padded to.
  """

  def __init__(self, comments, targets, tokenizer, max_len):
    self.comments, self.targets = comments, targets
    self.tokenizer, self.max_len = tokenizer, max_len

  def __len__(self):
    """Number of comments in the dataset."""
    return len(self.comments)

  def __getitem__(self, item):
    """Return the raw comment, its flattened encoding tensors and its target id."""
    text = self.comments[item]
    label = self.targets[item]

    encoded = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=True,
      truncation=True,
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',
    )

    sample = {'comment': text}
    # The tokenizer output is flattened to 1-D so that the DataLoader can stack
    # items into batch tensors.
    for field in ('input_ids', 'attention_mask', 'token_type_ids'):
      sample[field] = encoded[field].flatten()
    sample['targets'] = torch.tensor(label, dtype=torch.long)
    return sample

In [40]:
def create_data_loader(df, tokenizer, max_len, batch_size):
  """Wrap a frame's cleaned comments and label ids in a batched DataLoader.

  Reads the `cleaned_comment` and `label_id` columns of `df` as NumPy arrays,
  builds a `SanppfoodDataset` from them and returns a `DataLoader` over it.
  Only `batch_size` is passed on; every other DataLoader option keeps its default.
  """
  texts = df.cleaned_comment.to_numpy()
  labels = df.label_id.to_numpy()
  dataset = SanppfoodDataset(texts, labels, tokenizer, max_len)
  return DataLoader(dataset, batch_size=batch_size)

In [41]:
# (rows, columns) of the train, validation and test frames.
df.shape, df_val.shape, df_test.shape

In [42]:
# Creating data loaders
BATCH_SIZE = 8

# create_data_loader only forwards batch_size, so its loaders iterate in the
# DataLoader's default fixed order. For training, the same dataset is re-wrapped
# with shuffle=True so every epoch visits the examples in a new random order;
# the validation and test loaders keep their fixed order.
train_dataset = create_data_loader(df, tokenizer, MAX_LEN, BATCH_SIZE).dataset
train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [43]:
# Pull one batch from the training loader and list the fields it carries.
data_sample = next(iter(train_data_loader))
data_sample.keys()

In [44]:
# Shapes of the batched tensors in the sample batch.
for field in ('input_ids', 'attention_mask', 'targets'):
  print(data_sample[field].shape)

## Building model

In [45]:
class SentimentModel(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    The encoder weights are loaded from the module-level `MODEL_NAME_OR_PATH`;
    `config` supplies the dropout probability, the hidden size and the number
    of output labels.
    """

    def __init__(self, config):
        super().__init__()
        # return_dict=False makes the encoder return a plain tuple, which
        # forward() unpacks.
        self.bert = BertModel.from_pretrained(MODEL_NAME_OR_PATH, return_dict=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return unnormalised class scores (logits) for a batch of encoded comments."""
        _sequence_output, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return self.classifier(self.dropout(pooled))

In [46]:
import gc
import torch

# Drop any model left over from an earlier run *before* collecting garbage and
# emptying the CUDA cache: while the name still refers to the model, its memory
# stays in use and neither call can release it.
parsbert_model = None
gc.collect()
torch.cuda.empty_cache()

In [47]:
# Build the classifier and move it to the selected device in one step.
parsbert_model = SentimentModel(config=config).to(device)

In [48]:
# Move the sample batch's model inputs to the selected device and show their shapes.
input_ids, attention_mask, token_type_ids = (
  data_sample[field].to(device)
  for field in ('input_ids', 'attention_mask', 'token_type_ids')
)

for tensor in (input_ids, attention_mask, token_type_ids):
  print(tensor.shape) # batch size x seq length

In [49]:
# Sanity check of the forward pass: class probabilities for the sample batch.
# The model has not been fine-tuned yet at this point (training happens in the
# cells below), so these values are not meaningful predictions.
F.softmax(parsbert_model(input_ids, attention_mask, token_type_ids), dim=1)

## Training phase

In [50]:
# Optimisation set-up: AdamW, a linear schedule without warm-up steps that
# spans every optimisation step of all epochs, and a cross-entropy loss.
EPOCHS = 4
total_steps = EPOCHS * len(train_data_loader)

loss_fn = nn.CrossEntropyLoss().to(device)

optimizer = AdamW(parsbert_model.parameters(), lr=LEARNING_RATE, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(
  optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

In [51]:
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Run one optimisation pass over `data_loader`.

  For every batch: forward pass, loss, backward pass, gradient-norm clipping
  at 1.0, optimizer step, scheduler step, gradient reset.

  Returns:
    (accuracy, mean_loss): the number of correct predictions divided by
    `n_examples` (a double tensor) and the mean of the per-batch losses.
  """
  model.train()

  batch_losses = []
  n_correct = 0

  progress = tqdm(data_loader, total=len(data_loader), desc="Training... ")
  for batch in progress:
    # Move everything the step needs onto the target device.
    tensors = {
      key: batch[key].to(device)
      for key in ("input_ids", "attention_mask", "token_type_ids", "targets")
    }
    targets = tensors.pop("targets")

    logits = model(**tensors)

    preds = torch.max(logits, dim=1).indices
    loss = loss_fn(logits, targets)

    n_correct += (preds == targets).sum()
    batch_losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return n_correct.double() / n_examples, np.mean(batch_losses)

In [52]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate `model` on `data_loader` without tracking gradients.

  The model is switched to eval mode and left in it.

  Returns:
    (accuracy, mean_loss): the number of correct predictions divided by
    `n_examples` (a double tensor) and the mean of the per-batch losses.
  """
  model.eval()

  batch_losses = []
  n_correct = 0

  with torch.no_grad():
    for batch in data_loader:
      # Move everything the step needs onto the target device.
      tensors = {
        key: batch[key].to(device)
        for key in ("input_ids", "attention_mask", "token_type_ids", "targets")
      }
      targets = tensors.pop("targets")

      logits = model(**tensors)
      preds = torch.max(logits, dim=1).indices

      n_correct += (preds == targets).sum()
      batch_losses.append(loss_fn(logits, targets).item())

  return n_correct.double() / n_examples, np.mean(batch_losses)

In [53]:
from collections import defaultdict
from tqdm.notebook import tqdm

In [54]:
%%time

history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(
    parsbert_model,
    train_data_loader,    
    loss_fn, 
    optimizer, 
    device, 
    scheduler, 
    len(df)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    parsbert_model,
    val_data_loader,
    loss_fn, 
    device, 
    len(df_val)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc > best_accuracy:
    torch.save(parsbert_model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

In [55]:
# Accuracy per epoch for the train and validation splits.
# The recorded accuracies may be 0-dim tensors (possibly on the GPU), which
# matplotlib cannot convert to NumPy arrays; turn them into plain floats first.
train_acc_curve = [float(acc) for acc in history['train_acc']]
val_acc_curve = [float(acc) for acc in history['val_acc']]

plt.plot(train_acc_curve, label='train accuracy')
plt.plot(val_acc_curve, label='validation accuracy')

plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

## Evaluation

In [56]:
# Accuracy on the test split, using the model as it stands after the last epoch.
# NOTE(review): the checkpoint saved as 'best_model_state.bin' is not reloaded
# here - confirm that scoring the final-epoch weights is intended.
test_acc, _ = eval_model(
  model=parsbert_model,
  data_loader=test_data_loader,
  loss_fn=loss_fn,
  device=device,
  n_examples=len(df_test),
)

test_acc.item()