# Importing Dependencies

In [None]:
import pandas as pd
import nltk
nltk.download('reuters')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk import PorterStemmer, WordNetLemmatizer
import re
import string

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import numpy as np


import torch
from torch import optim
from torch import nn
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import hamming_loss, f1_score
from sklearn.preprocessing import MultiLabelBinarizer

from zipfile import ZipFile
import pickle
import time
import copy

from models import MAGNET

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Unpack every member of the GloVe archive into the notebook's working
# directory so the plain-text vector files can be opened directly later on.
with ZipFile('./glove.6B.zip', mode='r') as glove_archive:
   glove_archive.extractall('./')

# Functions

In [None]:
#Build adjacency matrix based on label co-occurrences
def buildAdjacencyCOOC(data_label):
  """Build a row-normalised label co-occurrence matrix.

  Args:
    data_label: 2-D numpy array of shape (n_samples, n_labels) holding the
      multi-hot label indicators.

  Returns:
    A float32 torch tensor of shape (n_labels, n_labels). Entry [i, j] is the
    number of samples carrying both label i and label j divided by the number
    of samples carrying label i, so the diagonal is 1 for every label that
    occurs at least once. Rows of labels that never occur are left as all
    zeros instead of becoming NaN through a 0/0 division.
  """
  labels = np.asarray(data_label, dtype='float64')
  cooccurrence = labels.T.dot(labels)

  # The diagonal holds the per-label occurrence counts used as row divisors.
  label_counts = np.diag(cooccurrence).copy()
  # Swap zero counts for 1 so the (all-zero) row of an unseen label stays zero.
  safe_counts = np.where(label_counts == 0, 1.0, label_counts)

  adj = cooccurrence / safe_counts[:, None]
  return torch.from_numpy(adj.astype('float32'))

stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

#Text cleaning function
def preprocessingText(text, stop=stop):
  """Normalise a raw document into a space-separated string of lemmas.

  Steps, in order: lowercase, drop '&lt;' entities and '<...>' tags, drop
  digit runs, drop stop words, strip punctuation and non-ASCII characters,
  collapse whitespace, lemmatize each remaining token.

  Stop words are removed before punctuation is stripped, so a token that still
  has punctuation attached (e.g. "the,") is not treated as a stop word.

  Args:
    text: raw document string.
    stop: iterable of stop words to remove; defaults to the NLTK English list.

  Returns:
    The cleaned text with tokens joined by single spaces.
  """
  # Set membership is O(1); scanning the stop-word list for every token is not.
  stop_words = stop if isinstance(stop, (set, frozenset)) else set(stop)

  text = text.lower() #text to lowercase
  text = re.sub(r'&lt;', '', text) #remove '&lt;' tag
  text = re.sub(r'<.*?>', '', text) #remove html
  text = re.sub(r'[0-9]+', '', text) #remove number
  text = " ".join(word for word in text.split() if word not in stop_words) #remove stopwords
  text = re.sub(r'[^\w\s]', '', text) #remove punctuation
  text = re.sub(r'[^\x00-\x7f]', '', text) #remove non ASCII strings
  # \s already matches \r, \n and \t, so one pass replaces every whitespace run
  # (newlines and tabs included) with a single space.
  text = re.sub(r'\s+', ' ', text)
  return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

#Load Word Representation Vector
def loadWRVModel(File):
    """Load whitespace-separated word vectors (GloVe text format) into a dict.

    Each non-empty line is expected to look like ``word v1 v2 ... vN``.

    Args:
        File: path to the text file holding the vectors.

    Returns:
        dict mapping each word to a 1-D numpy float array with its vector.

    Blank lines are ignored. A line whose values cannot be parsed as floats is
    skipped and counted, so one malformed line no longer truncates the
    vocabulary silently.
    """
    print("Loading Word Representation Vector Model")
    WRVModel = {}
    skipped = 0
    # Explicit UTF-8 keeps decoding independent of the platform default, and the
    # context manager guarantees the handle is closed.
    with open(File, 'r', encoding='utf-8') as f:
        for line in f:
            splitLines = line.split()
            if not splitLines:
                continue
            word = splitLines[0]
            try:
                wordEmbedding = np.array([float(value) for value in splitLines[1:]])
            except ValueError:
                skipped += 1
                continue
            WRVModel[word] = wordEmbedding
    if skipped:
        print(skipped, " malformed lines skipped!")
    print(len(WRVModel)," words loaded!")
    return WRVModel


def check_accuracy(model, label_embedding, X, y):
  """Score `model` on (X, y) and return (hamming loss, micro-averaged F1).

  The model is switched to eval mode and run once on the whole of X without
  gradient tracking. Its outputs go through a sigmoid and are rounded to 0/1
  before being compared with y on the CPU.
  """
  model.eval()

  with torch.no_grad():
    logits = model(X, label_embedding)

  probabilities = torch.sigmoid(logits.detach())
  y_pred = probabilities.round().cpu()

  hammingloss = hamming_loss(y, y_pred)
  f1score = f1_score(y, y_pred, average='micro')
  return hammingloss, f1score

def train(model,
          X_train,
          X_test,
          label_embedding,
          y_train,
          y_test,
          total_epoch=250,
          batch_size=250,
          learning_rate=0.001,
          save_path='./model.pt',
          state=None):
  """Train `model` with Adam and BCE-with-logits, checkpointing every epoch.

  Args:
    model: module called as ``model(X, label_embedding)`` that returns logits.
    X_train, X_test: input tensors for training and validation.
    label_embedding: tensor passed to the model on every forward call.
    y_train, y_test: multi-hot target tensors.
    total_epoch: number of the last epoch to run. When resuming, epochs that
      are already recorded in the checkpoint history count towards it.
    batch_size: mini-batch size for the training loader.
    learning_rate: Adam learning rate. When resuming, the optimizer state
      restored from the checkpoint (including its learning rate) wins.
    save_path: where the checkpoint dict is written after every epoch.
    state: optional path to a checkpoint written by a previous call; when
      given, the model weights, optimizer state, metric history and best
      scores are restored before training continues.

  The checkpoint dict holds the metric histories ('microf1', 'hammingloss',
  'val_microf1', 'val_hammingloss', 'epoch_time'), the latest weights
  ('last_model'), the optimizer state ('optimizer') and the best train /
  validation snapshots with their scores.
  """
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  # No shuffling: the per-epoch training metrics compare the concatenated batch
  # predictions with y_train row by row, so batches must keep dataset order.
  train_data = DataLoader(dataset(X_train, y_train), batch_size=batch_size)
  X_test = X_test.to(device)
  label_embedding = label_embedding.to(device)

  criterion = nn.BCEWithLogitsLoss()

  if state:
    # map_location lets a checkpoint written on GPU be resumed on CPU.
    state = torch.load(state, map_location=device)
    # load_state_dict works in place and does not return the module, so its
    # result must not be assigned back to `model`.
    model.load_state_dict(state['last_model'])
    resumed = True
  else:
    state = dict()
    state['microf1'] = []
    state['hammingloss'] = []
    state['val_hammingloss'] = []
    state['val_microf1'] = []
    state['epoch_time'] = []
    resumed = False

  model = model.to(device)
  optimizer = optim.Adam(model.parameters(), lr=learning_rate)
  if resumed:
    # In place as well; the optimizer is built on the already-moved parameters.
    optimizer.load_state_dict(state['optimizer'])

  # Carry best scores over so a resumed run cannot overwrite a better snapshot.
  best_train = state.get('best_train', 0)
  best_val = state.get('best_val', 0)

  # Continue numbering after the epochs already present in the history.
  first_epoch = len(state['epoch_time']) + 1

  for epoch in range(first_epoch, total_epoch + 1):
    running_loss = 0
    y_pred = []
    epoch_time = 0
    model.train()
    for X, y in train_data:

      t = time.time()

      #forward
      out = model(X.to(device), label_embedding)
      loss = criterion(out, y.to(device))

      #backward
      optimizer.zero_grad()
      loss.backward()
      clip_grad_norm_(model.parameters(), max_norm=10)

      #update
      optimizer.step()

      epoch_time += time.time() - t
      y_pred.append(torch.sigmoid(out.detach()).round().cpu())
      running_loss += loss.item()

    y_pred = torch.vstack(y_pred)
    # Plain Python floats keep the checkpoint free of pickled numpy scalars.
    f1score = float(f1_score(y_train, y_pred, average='micro'))
    hammingloss = float(hamming_loss(y_train, y_pred))
    val_hamming, val_f1score = check_accuracy(model, label_embedding, X_test, y_test)
    val_hamming = float(val_hamming)
    val_f1score = float(val_f1score)

    state['microf1'].append(f1score)
    state['hammingloss'].append(hammingloss)
    state['val_microf1'].append(val_f1score)
    state['epoch_time'].append(epoch_time)
    state['val_hammingloss'].append(val_hamming)

    state['optimizer'] = optimizer.state_dict()
    state['last_model'] = model.state_dict()

    # Deep copies: state_dict() returns references to the live parameters,
    # which keep changing in later epochs.
    if best_train < f1score:
      state['model_best_train'] = copy.deepcopy(model.state_dict())
      best_train = f1score
      state['best_train'] = best_train

    if best_val < val_f1score:
      state['model_best_val'] = copy.deepcopy(model.state_dict())
      best_val = val_f1score
      state['best_val'] = best_val

    torch.save(state, save_path)
    print('epoch:{} loss:{:.5f} hamming_loss:{:.5f} micro_f1score:{:.5f} val_hamming_loss:{:.5f} val_micro_f1score:{:.5f}'.
          format(epoch, running_loss, hammingloss, f1score, val_hamming, val_f1score))

# Load Data

## Load Raw Dataset (Reuters-21578)

In [None]:
# Read the pre-split train/test frames. NOTE: read_pickle unpickles arbitrary
# Python objects, so only point it at files from a trusted source.
data_train = pd.read_pickle('./train.pickle')
data_test = pd.read_pickle('./test.pickle')

# Raw document strings, one per row.
text_train = data_train['text'].values
text_test = data_test['text'].values

# Each row of the 'onehot_label' column holds one label vector; stack them into
# a 2-D array and convert it to a float tensor.
onehot_train = np.vstack(data_train['onehot_label'].values)
onehot_test = np.vstack(data_test['onehot_label'].values)
y_train = torch.from_numpy(onehot_train).float()
y_test = torch.from_numpy(onehot_test).float()

print('Train label shape {}'.format(y_train.shape))
print('Test label shape {}'.format(y_test.shape))

Train label shape torch.Size([7769, 90])
Test label shape torch.Size([3019, 90])


# Load MultiLabelBinarizer

In [None]:
# Restore the label binarizer object that was pickled earlier; its classes_ are
# used further down to build the label embeddings.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# a pickle that comes from a trusted source.
with open('./multilabelbinarizer.pickle', 'rb') as file:
  mlb = pickle.load(file)

# Build Word Representation Vector Dictionary

In [None]:
# Parse the extracted GloVe text file into a {word: vector} dictionary; the
# embedding-matrix and label-embedding cells look words up in it.
WRVModel = loadWRVModel('./glove.6B.300d.txt')

Loading Word Representation Vector Model
400000  words loaded!


# Text Preprocessing

## Text Cleaning

In [None]:
# Run the cleaning function over every training and test document.
preprocessed_text_train = list(map(preprocessingText, text_train))
preprocessed_text_test = list(map(preprocessingText, text_test))

# Show the first training document before and after cleaning as a sanity check.
print('BEFORE CLEANING: {}'.format(text_train[0]))
print('AFTER CLEANING: {}'.format(preprocessed_text_train[0]))

BEFORE CLEANING: BAHIA COCOA REVIEW
  Showers continued throughout the week in
  the Bahia cocoa zone, alleviating the drought since early
  January and improving prospects for the coming temporao,
  although normal humidity levels have not been restored,
  Comissaria Smith said in its weekly review.
      The dry period means the temporao will be late this year.
      Arrivals for the week ended February 22 were 155,221 bags
  of 60 kilos making a cumulative total for the season of 5.93
  mln against 5.81 at the same stage last year. Again it seems
  that cocoa delivered earlier on consignment was included in the
  arrivals figures.
      Comissaria Smith said there is still some doubt as to how
  much old crop cocoa is still available as harvesting has
  practically come to an end. With total Bahia crop estimates
  around 6.4 mln bags and sales standing at almost 6.2 mln there
  are a few hundred thousand bags still in the hands of farmers,
  middlemen, exporters and processors.
    

## Text to Sequences

In [None]:
# The vocabulary is fitted on the training texts only; the test texts are
# encoded with that same vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_text_train)

sequences_text_train = tokenizer.texts_to_sequences(preprocessed_text_train)
sequences_text_test = tokenizer.texts_to_sequences(preprocessed_text_test)

# Bring every sequence to a fixed length of 128 token ids, then wrap the
# resulting numpy arrays as tensors.
padded_train = pad_sequences(sequences_text_train, maxlen=128)
padded_test = pad_sequences(sequences_text_test, maxlen=128)
X_train = torch.from_numpy(padded_train)
X_test = torch.from_numpy(padded_test)

## Build Embedding Matrix

In [None]:
# Row 0 is never assigned and stays all-zero, hence the +1 on the vocabulary size.
VOCAB_SIZE = len(tokenizer.word_index) + 1
embedding_matrix = torch.zeros(VOCAB_SIZE, 300)

# Copy the pretrained vector of every vocabulary word into its row; words
# without a pretrained vector keep a zero row and are counted as unknown.
unk = 0
for word, i in tokenizer.word_index.items():
  vector = WRVModel.get(word)
  if vector is None:
    unk += 1
    continue
  embedding_matrix[i] = torch.from_numpy(vector).float()

print('VOCAB_SIZE : {}'.format(VOCAB_SIZE))
print('TOTAL OF UNKNOWN WORD : {}'.format(unk))

VOCAB_SIZE : 25306
TOTAL OF UNKNOWN WORD : 6568


# Preparing Graph Attention Networks Input

## Label Embedding

In [None]:
# One 300-dimensional row per label (90 labels).
label_embedding = torch.zeros(90,300)

for row, label in enumerate(mlb.classes_):
  # Hyphenated label names are split into their component words.
  parts = label.split('-')

  for part in parts:
    vector = WRVModel.get(part)
    if vector is not None:
      label_embedding[row] += torch.from_numpy(vector)

  # The divisor is the number of component words, including any that had no
  # pretrained vector and therefore contributed nothing to the sum.
  label_embedding[row] = label_embedding[row] / len(parts)

print(label_embedding)

tensor([[ 0.1796, -0.1051, -0.5564,  ..., -0.0633,  0.3732, -0.2873],
        [ 0.1101,  0.4061,  0.2036,  ..., -0.1957, -0.4627,  0.6931],
        [-0.3568, -0.1348,  0.0790,  ..., -0.0384,  0.2948,  0.1996],
        ...,
        [-0.1446,  0.0594, -0.1450,  ..., -0.0334,  0.1966,  0.4136],
        [-0.5990, -0.3234, -0.2749,  ...,  0.6343,  0.5300,  0.0299],
        [-0.4541, -0.1300, -0.5178,  ..., -1.1637, -0.2056, -0.3177]])


## Adjacency Matrix

In [None]:
# Build the label co-occurrence adjacency from the training labels only
# (converted back to a numpy array for buildAdjacencyCOOC).
adjacency = buildAdjacencyCOOC(y_train.numpy())
print(adjacency)

tensor([[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0012],
        [0.0000, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0286],
        [0.0000, 0.0000, 1.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.0952, 0.0476, 0.0000,  ..., 0.0000, 0.0000, 1.0000]])


# Preparing DataLoader and Model

## Dataset Class

In [None]:
class dataset(Dataset):
  """Pairs an indexable collection of inputs with an indexable collection of targets."""

  def __init__(self, x, y):
    # Both collections are stored as given; no copy or validation is done.
    self.x, self.y = x, y

  def __len__(self):
    """Number of samples, taken from the inputs."""
    n_samples = len(self.x)
    return n_samples

  def __getitem__(self, idx):
    """Return the (input, target) pair stored at position `idx`."""
    sample = self.x[idx]
    target = self.y[idx]
    return sample, target

## Initialize Model

In [None]:
# Instantiate MAGNET (imported from the local `models` module) with the label
# adjacency matrix and the pretrained embedding matrix built above.
# NOTE(review): 300 and 250 are presumably the embedding size and a hidden
# size -- confirm against the MAGNET constructor signature.
model = MAGNET(300, 250, adjacency, embedding_matrix)

# Model Training

## Configure Save Path

In [None]:
# Checkpoint file that train() rewrites after every epoch.
save_path = './train_result.pt'

## Train Model

In [None]:
# Start training with the default hyper-parameters of train() (250 epochs,
# batch size 250, learning rate 0.001); the test split serves as validation.
train(model, X_train, X_test, label_embedding, y_train, y_test, save_path=save_path)