In [2]:
# Data visualization
import matplotlib.pyplot as plt 

# Data manipulation
import pandas as pd
import numpy as np
import csv
from zipfile import ZipFile

In [3]:
# Webscraping
import glob
import requests
from bs4 import BeautifulSoup
import time
import datetime
from pandas.core.common import flatten
import os
from itertools import chain
from tqdm import tqdm
import json
import urllib.request

In [4]:
# Parsing and pre-processing
from glob import glob
import os 
import re

from pdfminer.high_level import extract_text
import pdfplumber
from langdetect import detect, DetectorFactory

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [5]:
# Vector representations and embeddings
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim

In [9]:
# Logistic and XGboost
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score
from xgboost import XGBClassifier
import pickle

In [10]:
# LSTM 
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# BERT models
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler
import transformers
from transformers import AutoModel, BertTokenizerFast

In [6]:
# !pip install -U imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.10.1


In [6]:
from imblearn.over_sampling import RandomOverSampler

In [77]:
# specify GPU
device = "cuda" if torch.cuda.is_available() else "cpu"

In [12]:
df = pd.read_json(r"../../../data/processed/pre-processed_0_2023_02_16.json")

In [17]:
df['article'].value_counts()

article6(1)(b)    4859
article6(2)        339
article8(2)         95
article8(1)         37
referral             1
Name: article, dtype: int64

In [18]:
df=df[~df['article_txt'].isin(["referral","article8(3)"])]

In [20]:
df['article_txt'].value_counts()

article6(1)(b)    4858
article6(2)        340
article8(2)         96
article8(1)         37
Name: article_txt, dtype: int64

In [21]:
label_map = {
    'article6(1)(b)': 0,
    'article8(1)': 0,
    'article6(2)': 1,
    'article8(2)': 1
}

In [23]:
df['label'] = df['article_txt'].map(label_map)

In [24]:
df['label'].value_counts()

0    4895
1     436
Name: label, dtype: int64

### tfidf

In [25]:
vectorizer = TfidfVectorizer()
dfm = vectorizer.fit_transform(df['text_clean'])
dfm.shape

#x documents with y features or words.

(5331, 147350)

### word embeddings

In [26]:
dims = 100

z = ZipFile(r"../../../data/glove.6B.zip") # glove zip file saved in data folder
f = z.open(f'glove.6B.{dims}d.txt')

embed_matrix = pd.read_table(
    f, sep = " ", index_col = 0, 
    header = None, quoting = csv.QUOTE_NONE
)

In [27]:
embed_matrix.shape

(400000, 100)

In [28]:
embed_matrix

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
the,-0.038194,-0.244870,0.728120,-0.399610,0.083172,0.043953,-0.391410,0.334400,-0.57545,0.087459,...,0.016215,-0.017099,-0.389840,0.87424,-0.725690,-0.510580,-0.520280,-0.145900,0.82780,0.270620
",",-0.107670,0.110530,0.598120,-0.543610,0.673960,0.106630,0.038867,0.354810,0.06351,-0.094189,...,0.349510,-0.722600,0.375490,0.44410,-0.990590,0.612140,-0.351110,-0.831550,0.45293,0.082577
.,-0.339790,0.209410,0.463480,-0.647920,-0.383770,0.038034,0.171270,0.159780,0.46619,-0.019169,...,-0.063351,-0.674120,-0.068895,0.53604,-0.877730,0.318020,-0.392420,-0.233940,0.47298,-0.028803
of,-0.152900,-0.242790,0.898370,0.169960,0.535160,0.487840,-0.588260,-0.179820,-1.35810,0.425410,...,0.187120,-0.018488,-0.267570,0.72700,-0.593630,-0.348390,-0.560940,-0.591000,1.00390,0.206640
to,-0.189700,0.050024,0.190840,-0.049184,-0.089737,0.210060,-0.549520,0.098377,-0.20135,0.342410,...,-0.131340,0.058617,-0.318690,-0.61419,-0.623930,-0.415480,-0.038175,-0.398040,0.47647,-0.159830
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chanty,-0.155770,-0.049188,-0.064377,0.223600,-0.201460,-0.038963,0.129710,-0.294510,0.00359,-0.098377,...,0.093324,0.094486,-0.023469,-0.48099,0.623320,0.024318,-0.275870,0.075044,-0.56380,0.145010
kronik,-0.094426,0.147250,-0.157390,0.071966,-0.298450,0.039432,0.021870,0.008041,-0.18682,-0.311010,...,-0.305450,-0.011082,0.118550,-0.11312,0.339510,-0.224490,0.257430,0.631430,-0.20090,-0.105420
rolonda,0.360880,-0.169190,-0.327040,0.098332,-0.429700,-0.188740,0.455560,0.285290,0.30340,-0.366830,...,-0.044082,0.140030,0.300070,-0.12731,-0.143040,-0.069396,0.281600,0.271390,-0.29188,0.161090
zsombor,-0.104610,-0.504700,-0.493310,0.135160,-0.363710,-0.447500,0.184290,-0.056510,0.40474,-0.725830,...,0.151530,-0.108420,0.340640,-0.40916,-0.081263,0.095315,0.150180,0.425270,-0.51250,-0.170540


In [29]:
#find the words in our corpus that are also present in the GloVe embedding matrix
common_features = set(embed_matrix.index) & set(vectorizer.get_feature_names_out())
len(common_features)

40985

In [30]:
vocab_ids = [vectorizer.vocabulary_[x] for x in common_features]
vocab_ids[1:10]

[104850, 126837, 10114, 26777, 36999, 91334, 132336, 117559, 33535]

In [31]:
# [5331,40985] * [40895, 100]

doc_matrix = dfm[:,vocab_ids].dot(embed_matrix.loc[common_features,])
doc_matrix.shape

  doc_matrix = dfm[:,vocab_ids].dot(embed_matrix.loc[common_features,])


(5331, 100)

## Model training

### 1. Logistic Regression

In [32]:
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

In [33]:
# split data into train and test sets
l_train_text, l_test_text, l_train_labels, l_test_labels = train_test_split(doc_matrix, df['label'], 
                                                                            random_state=2018, 
                                                                            test_size=0.3, 
                                                                            stratify=df['label'])

In [53]:
clf = LogisticRegression(random_state=0).fit(l_train_text, l_train_labels)
#with open(r"../../../models/logistic.pkl", 'wb') as f: pickle.dump(clf, f)

y_pred = clf.predict(l_test_text)

accuracy = accuracy_score(l_test_labels, y_pred) *100.0
precision = precision_score(l_test_labels, y_pred, average='binary')
recall = recall_score(l_test_labels, y_pred, average='binary')
f_score = 2 * (precision * recall) / (precision + recall)

print(f' Accuracy: {accuracy:.2f} \n Precision: {precision:.3f} \n Recall: {recall:.3f} \n F1: {f_score:.3f}')

 Accuracy: 98.00 
 Precision: 0.890 
 Recall: 0.863 
 F1: 0.876


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [57]:
# get the feature importance values
importance = clf.coef_[0]

# sort the feature importance values in descending order
sorted_importance = sorted(zip(importance, l_train_text), reverse=True)

# print the feature importance values
for i, (val, name) in enumerate(sorted_importance):
    print("{}. Feature: {}, Importance: {}".format(i+1, name, val))

1. Feature: [ 0.56346     1.02700534  0.83073123 -0.43615468  1.03870261 -2.91677455
 -0.37769782  0.32397277 -0.29826284  0.45858054  0.8984505  -0.33538713
  0.33802868  0.28359225 -0.40778614 -1.36924804  0.42953683  0.06588722
  0.41039499 -0.256457    0.71513023 -0.46184979  0.09622329  0.92254516
 -0.00774859 -0.18073903 -0.24868027  0.77539153  0.18556828  0.59372564
 -1.45522996  1.49469986 -0.3063475  -0.7116052   0.07451022  0.53963193
  1.06139956  0.22822581 -0.37230214 -0.72818526  0.33187413 -0.80733239
  0.20907764  0.01959909  1.00730813  0.48575712 -0.37938205 -0.46896516
  0.08596457 -0.12848248  0.05654974  1.13421558 -0.08489916  1.35509026
 -1.07889998 -2.76411647 -0.21881112 -0.78419872  3.16830591  0.61934656
 -0.30336589 -0.98525749 -0.32605984 -0.01019804  0.60837663  0.59214552
  0.16609606  0.83363108  0.48374856 -1.095747   -0.17910924 -0.58443734
  0.41637816 -0.48048967 -0.04661575 -0.14194779  0.41595167 -0.9275774
 -2.19619173  0.41112021  1.09722172 -0.

### 2. Gradient Boosting

In [35]:
bst = XGBClassifier(n_estimators=1000, max_depth=1000, learning_rate=0.1, objective='binary:logistic')

bst.fit(l_train_text, l_train_labels)

print(bst)

y_pred = bst.predict(l_test_text)

accuracy = accuracy_score(l_test_labels, y_pred) * 100.0
precision = precision_score(l_test_labels, y_pred, average='binary')
recall = recall_score(l_test_labels, y_pred, average='binary')
f_score = 2 * (precision * recall) / (precision + recall)

print(f' Accuracy: {accuracy:.2f} \n Precision: {precision:.3f} \n Recall: {recall:.3f} \n F1: {f_score:.3f}')

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=1000, max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0, ...)
 Accuracy: 97.62 
 Precision: 0.919 
 Recall: 0.779 
 F1: 0.843


## Imbalanced Learn

In [38]:
from collections import Counter

In [39]:
Counter(l_train_labels)

Counter({0: 3426, 1: 305})

In [40]:
ROS = RandomOverSampler(sampling_strategy=1)

In [None]:
l_train_text, l_test_text, l_train_labels, l_test_labels

In [44]:
# l_train_text_tf = vectorizer.transform(l_train_text)
# l_train_text_tf = l_train_text_tf.toarray()

# l_test_text_tf = vectorizer.transform(l_test_text)
# l_test_text_tf = l_test_text_tf.toarray()

In [43]:
l_train_text

array([[ 0.21391065,  0.96788652,  0.92497855, ..., -0.26025425,
         1.57922422, -0.17917874],
       [-0.24159557,  1.24156423,  1.49015532, ..., -1.27987056,
         3.49364323,  0.03209406],
       [ 0.13618607,  0.50704045,  1.34990105, ..., -0.60898837,
         1.22487523, -0.45934322],
       ...,
       [ 0.09957862, -0.28805078,  0.42906287, ..., -1.41800151,
         2.83993924, -0.13620405],
       [ 0.72916986, -0.33387755,  0.852013  , ..., -0.22509016,
         1.9554965 , -0.09345603],
       [-0.09941655, -0.07453666, -0.10440855, ..., -0.71752017,
         0.84283035, -0.6060511 ]])

In [45]:
l_train_text_ros, l_train_labels_ros = ROS.fit_resample(l_train_text, l_train_labels)

In [46]:
Counter(l_train_labels_ros)

Counter({0: 3426, 1: 3426})

In [80]:
clf = LogisticRegression(random_state=0).fit(l_train_text_ros, l_train_labels_ros)
with open(r"../../../models/logistic_imb.pkl", 'wb') as f: pickle.dump(clf, f)

y_pred = clf.predict(l_test_text)

accuracy = accuracy_score(l_test_labels, y_pred) *100.0
precision = precision_score(l_test_labels, y_pred, average='binary')
recall = recall_score(l_test_labels, y_pred, average='binary')
f_score = 2 * (precision * recall) / (precision + recall)

print(f' Accuracy: {accuracy:.2f} \n Precision: {precision:.3f} \n Recall: {recall:.3f} \n F1: {f_score:.3f}')

 Accuracy: 97.06 
 Precision: 0.762 
 Recall: 0.931 
 F1: 0.838


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Long Short-Term Memory Network

## BERT

In [63]:

# split data into train and test sets
train_text, test_text, train_labels, test_labels = train_test_split(df['text_clean'], df['label'], 
                                                                    random_state=2018, 
                                                                    test_size=0.3, 
                                                                    stratify=df['label'])

In [64]:
# create train and test dataset 
train_dataset = list(zip(train_labels, train_text))
test_dataset = list(zip(test_labels, test_text))

# convert pd series to list
train_text = train_text.tolist()
test_text = test_text.tolist()

In [60]:
# bert-base-uncased
bert = AutoModel.from_pretrained('bert-base-uncased', return_dict=False)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', return_dict=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [95]:
# legal-bert-base-uncased
bert = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased", return_dict=False)
tokenizer = BertTokenizerFast.from_pretrained("nlpaueb/legal-bert-base-uncased", return_dict=False)

Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [96]:
# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    train_text,
    max_length = 25,
    pad_to_max_length=True,
    truncation=True
)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(
    test_text,
    max_length = 25,
    pad_to_max_length=True,
    truncation=True
)

In [97]:
# convert lists to tensors
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist())

In [98]:
batch_size = 2
num_workers = 2

# dataLoader for train set
train_data = TensorDataset(train_seq, train_mask, train_y)
train_dataloader = DataLoader(train_data, num_workers=num_workers, shuffle=True, batch_size=batch_size)

# dataLoader for test set
test_data = TensorDataset(test_seq, test_mask, test_y)
test_dataloader = DataLoader(test_data, num_workers=num_workers, shuffle=True, batch_size=batch_size)

In [99]:
class BERT_Arch(nn.Module):

    def __init__(self, bert):
      
      super(BERT_Arch, self).__init__()

      self.bert = bert 
      self.dropout = nn.Dropout(0.1)
      self.relu =  nn.ReLU()
      self.fc1 = nn.Linear(768,512)
      self.fc2 = nn.Linear(512,2)
      self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):

      #pass the inputs to the model  
      _, cls_hs = self.bert(sent_id, attention_mask=mask)
      x = self.fc1(cls_hs)
      x = self.relu(x)
      x = self.dropout(x)
      x = self.fc2(x)
      x = self.softmax(x)

      return x

In [100]:
# method to freeze all the parameters if freeze = T
def set_parameter_requires_grad(model, freeze):
    if freeze:
        for param in model.parameters():
            param.requires_grad = False

In [101]:
set_parameter_requires_grad(model=bert, freeze=True)
bert_classifier = BERT_Arch(bert)
bert_classifier = bert_classifier.to(device)

In [102]:
def train(model, dataloader, criterion, optimizer):
  model.train()
  total_loss = 0
  total_preds = []
  total_labels = []
  
  for inputs in tqdm(dataloader):
    
    # push to gpu
    inputs = [r.to(device) for r in inputs]
    sent_id, mask, labels = inputs

    # zero the parameter gradients
    model.zero_grad()        

    # forward + backward + optimize 
    preds = model(sent_id, mask)
    loss = criterion(preds, labels)
    total_loss += loss.item()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) #prevent exploding gradient problem
    optimizer.step()

    total_labels.append(labels)
    total_preds.append(preds.argmax(dim=-1))

  total_labels = torch.cat(total_labels)
  total_preds = torch.cat(total_preds)
  
  # epoch loss and accuracy
  epoch_loss = total_loss / len(dataloader)
  epoch_acc = accuracy_score(total_labels.detach().numpy(), total_preds.detach().numpy())

  return epoch_loss, epoch_acc

In [103]:
def evaluate(model, dataloader, criterion):
  model.eval()
  total_loss = 0
  total_preds = []
  total_labels = []

  for inputs in tqdm(dataloader):
    
    # push to gpu
    inputs = [t.to(device) for t in inputs]
    sent_id, mask, labels = inputs

    with torch.no_grad():
      preds = model(sent_id, mask)
      loss = criterion(preds,labels)
      total_loss += loss.item()

      total_labels.append(labels)
      total_preds.append(preds.argmax(dim=-1))
  
  total_labels = torch.cat(total_labels)
  total_preds = torch.cat(total_preds)

  # epoch loss and model predictions
  epoch_loss = total_loss / len(dataloader)
  epoch_acc = accuracy_score(total_labels.detach().numpy(), total_preds.detach().numpy())

  return epoch_loss, epoch_acc

In [104]:
def fit(model, criterion, train_loader, val_loader, epochs):
    best_valid_loss = float('inf')

    train_losses=[]
    valid_losses=[]

    for epoch in range(epochs):
        
        print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
        train_loss, train_acc = train(model, train_loader, criterion, optimizer)
        valid_loss, valid_acc = evaluate(model, val_loader, criterion)
        
        # save best model
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), r"../../../models//bert_saved_weights.pt")
        
        # append training and validation loss
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        
        print(f"Train Loss: {train_loss:.2f}")
        print(f"Validation Loss: {valid_loss:.2f}")
        print(f"Train Accuracy: {train_acc:.3f}")
        print(f"Validation Accuracy: {valid_acc:.3f}")

In [107]:
# # Data imbalance
# from sklearn.utils.class_weight import compute_class_weight

# #compute the class weights
# class_weights = compute_class_weight('balanced', np.unique(df.label), train_labels)

# print("Class Weights:",class_weights)

TypeError: compute_class_weight() takes 1 positional argument but 3 were given

In [None]:
# # converting list of class weights to a tensor
# weights= torch.tensor(class_weights,dtype=torch.float)

# # push to GPU
# weights = weights.to(device)

In [108]:
from transformers import AdamW

epochs = 20
learning_rate = 1e-5

optimizer = AdamW(bert_classifier.parameters(), lr = learning_rate)
criterion  = nn.NLLLoss() #weight=weights



In [109]:
fit(bert_classifier, criterion, train_dataloader, test_dataloader, epochs)


 Epoch 1 / 20


100%|██████████| 1866/1866 [28:00<00:00,  1.11it/s]
100%|██████████| 800/800 [09:36<00:00,  1.39it/s]


Train Loss: 0.38
Validation Loss: 0.38
Train Accuracy: 0.916
Validation Accuracy: 0.918

 Epoch 2 / 20


 20%|██        | 378/1866 [05:26<21:24,  1.16it/s]


KeyboardInterrupt: 