In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
df = pd.read_csv('coarse-and-fine-grained-ner-dataset.csv')

In [4]:
df.head()

Unnamed: 0,Text,Organ Entities,Descriptor Entities,Coarse-grained Annotation,Fine-grained Annotation
0,"grandes feuilles opposées, oblongues-elliptiq...","['bouton', 'pédicelle', 'corolle', 'tube', 'fe...","['fermée', 'pubes-cents', 'cunéiformes', 'vent...","[(650, 661, 'DESCRIPTEUR'), (968, 977, 'DESCRI...","[(650, 661, 'DESCRIPTEUR'), (395, 407, 'DISPOS..."
1,"feuilles opposées, groupées à l'extrémité des...","['limbe', 'style', 'filets', 'rameaux', 'sépal...","['elliptiques', '1 cm de longueur', 'extrorses...","[(609, 618, 'DESCRIPTEUR'), (129, 144, 'DESCRI...","[(609, 618, 'DESCRIPTEUR'), (73, 80, 'FORME'),..."
2,"feuilles opposées, obovées oblongues, arrondi...","['corolle', 'limbe', 'ovaire', 'lobes', 'base'...","['cunéiforme', '10,5 mm de longueur', 'long', ...","[(60, 64, 'ORGANE'), (196, 205, 'DESCRIPTEUR')...","[(60, 64, 'ORGANE'), (180, 187, 'ORGANE'), (11..."
3,"arbustes petites feuilles opposées, groupées...","['anthères', 'pétales', 'tube', 'feuilles', 's...","['secondaires', 'accusé', 'saillantes', 'apicu...","[(949, 959, 'DESCRIPTEUR'), (88, 105, 'DESCRIP...","[(949, 959, 'DESCRIPTEUR'), (88, 105, 'DESCRIP..."
4,"arbustes feuilles opposées ou alternes, obla...","['base', 'nervure', 'feuilles', 'arbustes', 'l...","['proéminente', 'décurrente', 'alternes', 'obl...","[(140, 150, 'DESCRIPTEUR'), (32, 40, 'DESCRIPT...","[(42, 54, 'FORME'), (119, 129, 'FORME'), (1, 9..."


## First step :

We are going to create tagged words. 
In "Coarse-grained Annotation" we have tuples each one has : (start position, end position, TAG ).
In text we are going to initialize all words to "O" after processing using the start and end position of the tags, we will split the text into words and map the tags to words. If a word contains any tagged characters, assign the first non-"O" tag to the word.

In [5]:
import ast

def parse_annotations(annotation_str):
    # Convert string to list of tuples
    return ast.literal_eval(annotation_str)

def generate_ner_tags(text, annotations):
    tags = ["O"] * len(text)
    
    for start, end, label in annotations:
        tags[start] = f"B-{label}"
        for i in range(start + 1, end):
            tags[i] = f"I-{label}"
    
    words = text.split()
    word_tags = []
    current_idx = 0

    for word in words:
        word_start = current_idx
        word_end = current_idx + len(word)
        word_tag = tags[word_start:word_end]
        
        if all(tag == "O" for tag in word_tag):
            word_tags.append((word, "O"))
        else:
            non_o_tags = [tag for tag in word_tag if tag != "O"]
            word_tags.append((word, non_o_tags[0]))
        
        current_idx = word_end + 1  # Move index to the next word

    return word_tags

tagged_sentences = []

for index, row in df.iterrows():
    text = row["Text"]
    annotations = parse_annotations(row["Fine-grained Annotation"])  # Parse annotation string
    word_tags = generate_ner_tags(text, annotations)
    tagged_sentences.append((word_tags))


In [6]:
tagged_sentences[0]

[('grandes', 'O'),
 ('feuilles', 'B-ORGANE'),
 ('opposées,', 'B-DISPOSITION'),
 ('oblongues-elliptiques', 'B-DESCRIPTEUR'),
 ('ou', 'O'),
 ('obovées-elliptiques,', 'B-DESCRIPTEUR'),
 ('arrondies', 'O'),
 ('au', 'O'),
 ('sommet,', 'O'),
 ('obtuses', 'B-FORME'),
 ('ou', 'O'),
 ('cunéiformes', 'B-FORME'),
 ('à', 'O'),
 ('la', 'O'),
 ('base', 'B-ORGANE'),
 ('limbe', 'B-ORGANE'),
 ('glabre,', 'I-ORGANE'),
 ('mesurant', 'O'),
 ("jusqu'à", 'O'),
 ('20', 'O'),
 ('cm', 'I-MESURE'),
 ('de', 'I-MESURE'),
 ('longueur', 'I-MESURE'),
 ('sur', 'I-MESURE'),
 ('12', 'O'),
 ('cm', 'I-MESURE'),
 ('de', 'I-MESURE'),
 ('largeur', 'I-MESURE'),
 ('nervure', 'I-MESURE'),
 ('médiane', 'I-ORGANE'),
 ('proéminente', 'B-DESCRIPTEUR'),
 ('dessous,', 'I-DESCRIPTEUR'),
 ('un', 'O'),
 ('peu', 'O'),
 ('saillante', 'B-SURFACE'),
 ('dessus', 'I-SURFACE'),
 ('nervures', 'O'),
 ('secondaires,', 'B-POSITION'),
 ('5', 'I-POSITION'),
 ('à', 'O'),
 ('10', 'O'),
 ('paires,', 'O'),
 ('incurvées,', 'B-DESCRIPTEUR'),
 ('réunies',

In [7]:
X = []
Y = []

for sentence in tagged_sentences:
    X_sentence = []
    Y_sentence = []
    for entity in sentence:
        X_sentence.append(entity[0].lower())
        Y_sentence.append(entity[1])

    X.append(X_sentence)
    Y.append(Y_sentence)

In [8]:
print("Total number of sentences: {}".format(len(X)))

Total number of sentences: 838


In [9]:
vocab = set([word.lower() for sentence in X for word in sentence])
num_words = len(set([word.lower() for sentence in X for word in sentence]))

print("Vocabulary size: {}".format(num_words))

Vocabulary size: 12601


In [10]:
tags = set([word for sentence in Y for word in sentence])
num_tags = len(set([word for sentence in Y for word in sentence]))

print("Total number of tags: {}".format(num_tags))
print("Tags: ", tags)

Total number of tags: 21
Tags:  {'I-COULEUR', 'B-SURFACE', 'I-POSITION', 'I-SURFACE', 'B-DISPOSITION', 'B-POSITION', 'I-STRUCTURE', 'I-MESURE', 'I-DESCRIPTEUR', 'B-COULEUR', 'I-DISPOSITION', 'I-FORME', 'B-FORME', 'O', 'B-MESURE', 'I-ORGANE', 'B-DESCRIPTEUR', 'B-DEVELOPPEMENT', 'B-ORGANE', 'B-STRUCTURE', 'I-DEVELOPPEMENT'}


In [11]:
print('sample X: ', X[0], '\n')
print('sample Y: ', Y[0], '\n')

sample X:  ['grandes', 'feuilles', 'opposées,', 'oblongues-elliptiques', 'ou', 'obovées-elliptiques,', 'arrondies', 'au', 'sommet,', 'obtuses', 'ou', 'cunéiformes', 'à', 'la', 'base', 'limbe', 'glabre,', 'mesurant', "jusqu'à", '20', 'cm', 'de', 'longueur', 'sur', '12', 'cm', 'de', 'largeur', 'nervure', 'médiane', 'proéminente', 'dessous,', 'un', 'peu', 'saillante', 'dessus', 'nervures', 'secondaires,', '5', 'à', '10', 'paires,', 'incurvées,', 'réunies', 'en', 'arceaux', 'assez', 'loin', 'de', 'la', 'marge,', 'saillantes', 'dessous,', 'bien', 'marquées', 'dessus,', 'anastomosées', 'à', 'un', 'réseau', 'de', 'nervilles', 'à', 'grosses', 'mailles', 'irrégulières,', 'finement', 'saillant', 'dessus', 'pétiole', '5-20', 'mm', 'fleurs', 'blanches', 'fasciculées', 'sur', 'le', 'vieux', 'bois', 'pédicelle', '4-6', 'mm,', 'glabre', 'ou', 'légèrement', 'pubescent', 'galice', ':', '4', 'sépales', '(2', '+', '2)', 'de', '2,5', 'mm,', 'un', 'peu', 'pubes-cents', 'extérieurement', 'corolle', 'à', '8'

In [12]:
print("Length of first input sequence  : {}".format(len(X[0])))
print("Length of first output sequence : {}".format(len(Y[0])))

Length of first input sequence  : 173
Length of first output sequence : 173


In [13]:
word_to_ix = {}

for word in vocab:
  if word not in word_to_ix:
    word_to_ix[word] = len(word_to_ix)

print(word_to_ix)

{'py': 0, 'nouées': 1, 'imitant': 2, '(7-11,': 3, 'vaccinioides': 4, '37836': 5, 'grise,': 6, 'nombreuses,': 7, 'floral,': 8, 'équitantes': 9, '10079': 10, 'ses': 11, 'leurs': 12, 'spatulata,': 13, '1,9-2,5': 14, 'article': 15, 'amplexicaules,': 16, 'staminale;': 17, 'glaucescence': 18, 'pubérulence': 19, '11,5-16': 20, 'verticillées': 21, 'réseau': 22, "d'où": 23, '4e': 24, 'obovoïde': 25, '(-38)': 26, 'détail': 27, '8-10-ovulés;': 28, '(nectaire': 29, 'grimpant,': 30, 'infrutescence': 31, 'deltoïde-lancéolé,': 32, 'triangulaires-acuminés,': 33, 'cymeux,': 34, 'saillies': 35, 'constituant': 36, 'u,': 37, 'ensemble,': 38, '12-22': 39, 'crevassée': 40, 'pombellule': 41, 'éventuellement': 42, 'kew': 43, 'marcottage': 44, 'hirsutum,': 45, 'follicules': 46, 'feuilles),': 47, 'florale': 48, 'néoténique,': 49, '3-3,9': 50, 'sulcalum': 51, 'formations': 52, 'arbustes,': 53, 'cachées': 54, 'franchement': 55, '491)': 56, '0,60': 57, '27,': 58, 'falques': 59, 'subaigu': 60, 'surnuméraires': 61, 

In [14]:
tag_to_ix = {}

for tag in tags:
  tag_to_ix[tag] = len(tag_to_ix)

print(tag_to_ix)

{'I-COULEUR': 0, 'B-SURFACE': 1, 'I-POSITION': 2, 'I-SURFACE': 3, 'B-DISPOSITION': 4, 'B-POSITION': 5, 'I-STRUCTURE': 6, 'I-MESURE': 7, 'I-DESCRIPTEUR': 8, 'B-COULEUR': 9, 'I-DISPOSITION': 10, 'I-FORME': 11, 'B-FORME': 12, 'O': 13, 'B-MESURE': 14, 'I-ORGANE': 15, 'B-DESCRIPTEUR': 16, 'B-DEVELOPPEMENT': 17, 'B-ORGANE': 18, 'B-STRUCTURE': 19, 'I-DEVELOPPEMENT': 20}


In [15]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long, device = 'cuda')

In [16]:
from sklearn.model_selection import train_test_split

SPLIT_SIZE = 0.2

# split entire data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=SPLIT_SIZE, random_state=4)

# split training data into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=SPLIT_SIZE, random_state=4)

# print number of samples in each set
print("TRAINING DATA")
print('Number of sequences: {}'.format(len(X_train)))
print("-"*50)
print("TESTING DATA")
print('Number of sequences: {}'.format(len(X_test)))
print("-"*50)
print("VALIDATION DATA")
print('Number of sequences: {}'.format(len(X_val)))

TRAINING DATA
Number of sequences: 536
--------------------------------------------------
TESTING DATA
Number of sequences: 168
--------------------------------------------------
VALIDATION DATA
Number of sequences: 134


In [17]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)

    def forward(self, sentence):
        output = []
        hidden = self.initHidden()
        for input in sentence:
          hidden = F.tanh(self.i2h(input) + self.h2h(hidden))
          out = self.h2o(hidden)
          output += out
        output = torch.stack(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size, device = 'cuda')

In [None]:
class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRU, self).__init__()

        self.hidden_size = hidden_size


        self.x2z = nn.Linear(input_size, hidden_size)
        self.h2z = nn.Linear(hidden_size, hidden_size)
        self.h2r = nn.Linear(hidden_size, hidden_size)
        self.x2r = nn.Linear(input_size, hidden_size)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.x2h = nn.Linear(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)


    def forward(self, sentence):
        output = []
        hidden = self.initHidden()
        for input in sentence:
          z = F.sigmoid(self.x2z(input) + self.h2z(hidden))
          r= F.sigmoid(self.x2r(input) + self.h2r(hidden))
          hidden1= F.tanh(r*(self.h2h(hidden))+self.x2h(input))
          hidden= (1-z)*(hidden)+ z*(hidden1)
          out = self.h2o(hidden)
          output += out
        output = torch.stack(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size, device = 'cuda')

In [28]:
class POSTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(POSTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.GRU = GRU(embedding_dim, hidden_dim, tagset_size)


    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        tag_scores, _ = self.GRU(embeds)
        return tag_scores

In [29]:
EMBEDDING_DIM = 64
HIDDEN_DIM = 32
EPOCHS = 50

In [None]:
model = POSTagger(EMBEDDING_DIM, HIDDEN_DIM, num_words, num_tags)
model.cuda()

POSTagger(
  (word_embeddings): Embedding(12601, 64)
  (GRU): GRU(
    (x2z): Linear(in_features=64, out_features=32, bias=True)
    (h2z): Linear(in_features=32, out_features=32, bias=True)
    (h2r): Linear(in_features=32, out_features=32, bias=True)
    (x2r): Linear(in_features=64, out_features=32, bias=True)
    (h2h): Linear(in_features=32, out_features=32, bias=True)
    (x2h): Linear(in_features=64, out_features=32, bias=True)
    (h2o): Linear(in_features=32, out_features=21, bias=True)
  )
)

In [31]:
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [32]:
# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# torch.no_grad() disables gradient calculation when the model is used for inference

with torch.no_grad():
    inputs = prepare_sequence(X_train[0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

AssertionError: Torch not compiled with CUDA enabled