In [28]:
import argparse
from pathlib import Path

import torch

from tuwnlpie import logger
from tuwnlpie.milestone1.model import NBClassifier
from tuwnlpie.milestone1.utils import read_food_disease_csv, split_data

# from tuwnlpie.milestone2.model import TorchModel
from tuwnlpie.milestone2.utils import TorchTrainer, read_and_prepare_data, TorchDataset, split_data_set, \
                                        length_longest_sentence, encodeX

In [29]:
import os
print(os.getcwd())
os.chdir('/Users/holu/Documents/project-1div7/')

train_data = './data/crowd_truth_combined.csv'

feature_col = 'tokens_lemma' #['term1', 'term2', 'sentence', 'tokens', 'tokens_stem', 'tokens_lemma']
label_cols = ['is_cause', 'is_treat']

# readIn
print("## Reading in Data ##")
data_frame = read_and_prepare_data(train_data, shall_sdp=False)
data_frame = data_frame[['tokens_lemma', 'tokens', 'is_cause', 'is_treat']]



/Users/holu/Documents/project-1div7
## Reading in Data ##
['term1' 'term2' 'sentence' 'is_cause' 'is_treat' 'tokens' 'tokens_stem'
 'tokens_lemma']
## Finished reading and preparing data ##


In [37]:
import pandas as pd
from pathlib import Path
import nltk 
from nltk import RegexpTokenizer
import spacy
import networkx as nx
import os
print(os.getcwd())
os.chdir('/Users/holu/Documents/project-1div7/')

path = './data/crowd_truth_combined.csv'

usedcols = ['sentence', 'term1', 'term2', 'is_cause', 'is_treat']
df = pd.read_csv(
    path,
    sep=',', quotechar='"',
    skipinitialspace=True,
    encoding='utf-8',
    on_bad_lines='skip',
    usecols=usedcols)

# Make case insensitive (no loss because emphasis on words does not play a role)
df['sentence'] = df['sentence'].map(lambda x: x.lower())
# Replace entities in sentence with placeholder tokens (may be useful for generalization when using n-grams)
df['sentence'] = df.apply(lambda x: x['sentence'].replace(x['term1'].lower(), 'TERMONE').replace('TERMONEs', 'TERMONE'), axis=1)
df['sentence'] = df.apply(lambda x: x['sentence'].replace(x['term2'].lower(), 'TERMTWO').replace('TERMTWOs', 'TERMTWO'), axis=1)

df = df[df['sentence'].apply(lambda x: 'TERMONE' in x and 'TERMTWO' in x)]

# Convert labels to right dtype
df['is_cause'] = df['is_cause'].astype(float).astype(int)
df['is_treat'] = df['is_treat'].astype(float).astype(int)

# Tokenize the sentences
tokenizer = RegexpTokenizer(r'\w+')
df['tokens'] = df['sentence'].apply(lambda x: tokenizer.tokenize(x))
# Remove stop words and tokens with length smaller than 2 (i.e. punctuations)
df['tokens'] = df['tokens'].apply(lambda x: [token for token in x if token not in nltk.corpus.stopwords.words('english') and len(token) > 1])
# Perform stemming
porter = nltk.PorterStemmer()
df['tokens_stem'] = df['tokens'].apply(lambda x: [porter.stem(token) for token in x])

# Perform lemmatization
lemmatizer = nltk.stem.WordNetLemmatizer()
df['tokens_lemma'] = df['tokens_stem'].apply(lambda x: [lemmatizer.lemmatize(token) for token in x])


nlp = spacy.load("en_core_web_sm")
doc = nlp(df['sentence'][0])
def shortest_dep_path(sentence):
    doc = nlp(sentence)
    edges = []
    for token in doc:
        for child in token.children:
            edges.append((
                '{0}'.format(token.lemma_),
                '{0}'.format(child.lemma_)))
    graph = nx.Graph(edges)
    entity1 = 'TERMONE'
    entity2 = 'TERMTWO'
    try:
        return nx.shortest_path(graph, source=entity1, target=entity2)
    except:
        return []

def remove_stop_words(tokens):
    return [x for x in tokens if x not in nltk.corpus.stopwords.words('english') and len(x) > 1]


df['sdp_tokens_lemma'] = df['sentence'].apply(lambda x: remove_stop_words(shortest_dep_path(x)))


/Users/holu/Documents/project-1div7


In [40]:
# DEBUG
X_loaded = data_frame[['tokens_lemma', 'tokens']].iloc[0:5]
y_loaded = data_frame[['is_cause', 'is_treat']]
# print(X_loaded)
for sentence in X_loaded['tokens_lemma'].iloc[0:5]:
    print(sentence)

['limit', 'data', 'suggest', 'child', 'mental', 'retard', 'termon', 'associ', 'aggress', 'destruct', 'properti', 'termtwo']
['termon', 'associ', 'difficult', 'behavior', 'termtwo', 'often', 'focu', 'clinic', 'attent', 'primari', 'asd', 'diagnosi']
['term', 'termon', 'employ', 'indic', 'ataxia', 'due', 'termtwo']
['non', 'hereditari', 'caus', 'termon', 'includ', 'termtwo', 'paraneoplast', 'termon', 'high', 'altitud', 'cerebr', 'oedema', 'coeliac', 'diseas', 'normal', 'pressur', 'hydrocephalu', 'cerebel']
['disord', 'present', 'migratori', 'ture', 'termtwo', 'mani', 'featur', 'like', 'termon', 'skin', 'rash', 'gait', 'abnorm', 'skin', 'nodul']


In [48]:
# DEBUG
from keras.preprocessing.text import one_hot
from keras_preprocessing.sequence import pad_sequences

# X_loaded = df[['tokens_lemma', 'tokens']].iloc[0:1]

# for sentence in X_loaded['tokens_lemma']:
#     print(sentence)
# for sentence in X_loaded['tokens']:
#     print(sentence)

lemmas = ['limit', 'data', 'suggest', 'child', 'mental', 'retard', 'termon', 'associ', 'aggress', 'destruct', 'properti', 'termtwo']
tokens = ['limited', 'data', 'suggest', 'children', 'mental', 'retardation', 'TERMONE', 'associated', 'aggression', 'destruction', 'property', 'TERMTWO']

unique_words = set()
longest_sentence = 0
for sentence in tokens:
    current_sentence = 0
    for word in sentence:
        current_sentence += 1
        if word not in unique_words:
            unique_words.add(word)
        if current_sentence > longest_sentence:
            longest_sentence = current_sentence

X_tmp = []
for sentence in lemmas:
    sen_tmp = []
    for token in sentence:
        sen_tmp.append(one_hot(token, len(unique_words)))
    X_tmp.append(sen_tmp)

X_tmp = pad_sequences(X_tmp, longest_sentence, padding='post') 
# makes all sentences the same length by padding with preset value at the end
X_tmp

array([[[ 8],
        [ 4],
        [10],
        [ 4],
        [10],
        [ 0],
        [ 0],
        [ 0],
        [ 0],
        [ 0],
        [ 0]],

       [[18],
        [17],
        [10],
        [17],
        [ 0],
        [ 0],
        [ 0],
        [ 0],
        [ 0],
        [ 0],
        [ 0]],

       [[17],
        [ 7],
        [ 1],
        [ 1],
        [21],
        [17],
        [10],
        [ 0],
        [ 0],
        [ 0],
        [ 0]],

       [[17],
        [ 1],
        [ 4],
        [ 8],
        [18],
        [ 0],
        [ 0],
        [ 0],
        [ 0],
        [ 0],
        [ 0]],

       [[10],
        [21],
        [16],
        [10],
        [17],
        [ 8],
        [ 0],
        [ 0],
        [ 0],
        [ 0],
        [ 0]],

       [[ 8],
        [21],
        [10],
        [17],
        [ 8],
        [18],
        [ 0],
        [ 0],
        [ 0],
        [ 0],
        [ 0]],

       [[10],
        [21],
        [ 8],
        [10],
        

In [None]:
print(data_frame.columns.values)
X_loaded = data_frame[['tokens_lemma', 'tokens']]
y_loaded = data_frame[['is_cause', 'is_treat']]

print("## Encoding ##")

In [24]:
X_loaded

Unnamed: 0,tokens_lemma,tokens
0,"[limit, data, suggest, child, mental, retard, ...","[limited, data, suggest, children, mental, ret..."
1,"[termon, associ, difficult, behavior, termtwo,...","[TERMONEs, associated, difficult, behaviors, T..."
2,"[term, termon, employ, indic, ataxia, due, ter...","[term, TERMONE, employed, indicate, ataxia, du..."
3,"[non, hereditari, caus, termon, includ, termtw...","[non, hereditary, causes, TERMONE, include, TE..."
4,"[disord, present, migratori, ture, termtwo, ma...","[disorder, present, migratory, ture, TERMTWO, ..."
...,...,...
7963,"[61, year, old, man, termon, pd, develop, sudd...","[61, year, old, man, TERMONE, pd, developed, s..."
7964,"[success, treatment, patient, termon, termtwo,...","[successful, treatment, patient, TERMONE, TERM..."
7965,"[five, 15, patient, receiv, termtwo, experi, t...","[five, 15, patients, receiving, TERMTWO, exper..."
7966,"[develop, antibodi, termtwo, seriou, complic, ...","[development, antibodies, TERMTWO, serious, co..."


In [15]:
X = X_loaded
y = y_loaded

In [None]:
['limit', 'data', 'suggest', 'child', 'mental', 'retard', 'termon', 'associ', 'aggress', 'destruct', 'properti', 'termtwo']
['limit', 'data', 'suggest', 'child', 'mental', 'retard', 'term_on', 'associ', 'aggress', 'destruct', 'properti', 'termtwo']
['termon', 'associ', 'difficult', 'behavior', 'termtwo', 'often', 'focu', 'clinic', 'attent', 'primari', 'asd', 'diagnosi']
['termon', 'associ', 'difficult', 'behavior', 'termtwo', 'often', 'focu', 'clinic', 'attent', 'primari', 'asd', 'diagnosi']
['term', 'termon', 'employ', 'indic', 'ataxia', 'due', 'termtwo']
['term', 'termon', 'employ', 'indic', 'ataxia', 'due', 'termtwo']

In [22]:
X = encodeX(X_loaded)


In [23]:
X_temp

array([[[10112],
        [ 8492],
        [ 6924],
        ...,
        [    0],
        [    0],
        [    0]],

       [[ 5046],
        [10226],
        [ 5531],
        ...,
        [    0],
        [    0],
        [    0]],

       [[ 7782],
        [ 5046],
        [ 1165],
        ...,
        [    0],
        [    0],
        [    0]],

       ...,

       [[ 9782],
        [ 5624],
        [  652],
        ...,
        [    0],
        [    0],
        [    0]],

       [[ 1678],
        [10165],
        [ 4983],
        ...,
        [    0],
        [    0],
        [    0]],

       [[ 6059],
        [ 4983],
        [ 5046],
        ...,
        [    0],
        [    0],
        [    0]]], dtype=int32)

In [17]:
X

array([[[10112],
        [ 8492],
        [ 6924],
        ...,
        [    0],
        [    0],
        [    0]],

       [[ 5046],
        [10226],
        [ 5531],
        ...,
        [    0],
        [    0],
        [    0]],

       [[ 7782],
        [ 5046],
        [ 1165],
        ...,
        [    0],
        [    0],
        [    0]],

       ...,

       [[ 9782],
        [ 5624],
        [  652],
        ...,
        [    0],
        [    0],
        [    0]],

       [[ 1678],
        [10165],
        [ 4983],
        ...,
        [    0],
        [    0],
        [    0]],

       [[ 6059],
        [ 4983],
        [ 5046],
        ...,
        [    0],
        [    0],
        [    0]]], dtype=int32)

# Encoding -> WORKING!!!

In [8]:
# X = X['tokens_lemma']

print(X.shape)
print("## Split ##")
# split
X_train, X_test, y_train, y_test = split_data_set(X, y,  size=0.8, random_state=1)
X_test, X_val, y_test, y_val = split_data_set(X_test, y_test, size=0.5, random_state=1)


print("## Creating Data-Loaders ##")
# Data Loaders
train_loader = TorchDataset(X_train,y_train)
val_loader = TorchDataset(X_val,y_val)

dataloaders = { 
    'train': train_loader.get_dataloader(batch_size=256, shuffle=True), 
    'val': val_loader.get_dataloader(batch_size=128, shuffle=False)
}


(7821, 110, 1)
## Split ##
## Creating Data-Loaders ##


In [9]:
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn, matmul
from torch.nn.functional import softmax
from torchmetrics import Accuracy

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch_position_embedding import PositionEmbedding


from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger


class TorchModel(LightningModule):
    def __init__(self, learning_rate=1e-2) -> None:
        super().__init__()
        self.save_hyperparameters('learning_rate')

        self.wordEmbeddings = nn.Embedding(11212,110)
        self.positionEmbeddings = nn.Embedding(110,40)
        # self.positionEmbeddings = PositionEmbedding(num_embeddings=11212, embedding_dim=110, mode=PositionEmbedding.MODE_ADD)
        self.transformerLayer = nn.TransformerEncoderLayer(150,15) #this transofrmer contains muti head attention
        self.linear1 = nn.Linear(150, 64)
        self.linear2 = nn.Linear(64, 1)
        self.linear3 = nn.Linear(110,  16)
        self.linear4 = nn.Linear(16, 2)
           
    def forward(self, x):
        positions = (torch.arange(0,110).reshape(1,110) + torch.zeros(x.shape[0],110)).to(device)
        sentence = torch.cat((self.wordEmbeddings(x.long()).squeeze(2),self.positionEmbeddings(positions.long())),axis=2)
        attended = self.transformerLayer(sentence)
        linear1 = F.relu(self.linear1(attended))
        linear2 = torch.sigmoid(self.linear2(linear1))
        linear2 = linear2.view(-1,110) # reshaping the layer as the transformer outputs a 2d tensor (or 3d considering the batch size)
        linear3 = F.relu(self.linear3(linear2))
        out = torch.sigmoid(self.linear4(linear3))
        return out
    
    def _loss_fn(self, out, y):
        loss = F.binary_cross_entropy(out, y) # Multiclass classification
        return loss
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        out = self(x)
        out = out.squeeze()
        loss = self._loss_fn(out, y.float())
        self.log('loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)      
        return loss

    def test_step(self, batch, batch_idx):
        print("TEST DATA")
        with torch.no_grad():
            x, y = batch
            out = self(x)
            out = out.squeeze()
            loss = self._loss_fn(out, y.float())
            report = classification_report(np.argmax(y, axis=1),np.argmax(out, axis=1),target_names=['is_cause', 'is_treat'])
            print(report)
            
    
    def validation_step(self, batch, batch_idx):
        with torch.no_grad():
            x, y = batch
            out = self(x)
            out = out.squeeze()
            loss = self._loss_fn(out, y.float())
            self.log('val_loss', loss, on_epoch=True, prog_bar=True, logger=True)
    
    def configure_optimizers(self):
        return torch.optim.Adagrad(
            self.parameters(), lr=self.hparams.learning_rate)

In [10]:
class TorchTrainer():
    def __init__(self, model, name, dirpath, dataloaders, max_epochs=50) -> None:
        self.model = model
        self.name = name
        self.dirpath = dirpath
        self.max_epochs = max_epochs
        self.dataloaders = dataloaders

    def run(self):
        logger = TensorBoardLogger(f"{self.dirpath}/tensorboard", name=self.name)
        callbacks = [
            ModelCheckpoint(dirpath=Path(self.dirpath, self.name), monitor="val_loss"),
            EarlyStopping(monitor='loss')
            ]
        trainer = Trainer(deterministic=True, logger=logger, callbacks=callbacks, max_epochs=self.max_epochs)
        trainer.fit(self.model, self.dataloaders['train'], self.dataloaders['val'])
        return trainer

In [11]:
model = TorchModel()

trainer = TorchTrainer(model, 'test', "../tuwnlpie/milestone2/lightning_logs/version_0/checkpoints/" , dataloaders, max_epochs=10)

  rank_zero_deprecation(


In [12]:
the_trainer = trainer.run()

2023-01-12 13:34:52,169 : setup (162) - INFO - GPU available: True (mps), used: False
2023-01-12 13:34:52,214 : setup (165) - INFO - TPU available: False, using: 0 TPU cores
2023-01-12 13:34:52,215 : setup (168) - INFO - IPU available: False, using: 0 IPUs
2023-01-12 13:34:52,215 : setup (171) - INFO - HPU available: False, using: 0 HPUs
  rank_zero_warn(
2023-01-12 13:34:52,220 : model_summary (83) - INFO - 
  | Name               | Type                    | Params
---------------------------------------------------------------
0 | wordEmbeddings     | Embedding               | 1.2 M 
1 | positionEmbeddings | Embedding               | 4.4 K 
2 | transformerLayer   | TransformerEncoderLayer | 707 K 
3 | linear1            | Linear                  | 9.7 K 
4 | linear2            | Linear                  | 65    
5 | linear3            | Linear                  | 1.8 K 
6 | linear4            | Linear                  | 34    
-----------------------------------------------------------

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


KeyError: 0