https://towardsdatascience.com/simple-transformers-named-entity-recognition-with-transformer-models-c04b9242a2a0
https://towardsdatascience.com/hyperparameter-optimization-for-optimum-transformer-models-b95a32b70949
https://huggingface.co/transformers/pretrained_models.html

# Installing Libraries

In [1]:
# Install the third-party packages via shell. No versions are pinned, so the
# versions that get resolved depend on when this cell is run.
!pip install transformers
!pip install simpletransformers
!pip install tensorboardx
!pip install seqeval

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.2MB 5.3MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 17.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 41.1MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=6114bf9d894



# Importing Libraries

In [2]:
import re
import json
import pandas as pd
import numpy as np
import string
from scipy.special import softmax
from simpletransformers.ner import NERModel

In [3]:
import nltk
# Download the "punkt" tokenizer data.
# NOTE(review): no cell visible in this notebook uses nltk afterwards — confirm
# this download is still needed.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
import logging

# Emit INFO-and-above records as "timestamp : LEVEL : message".
logging.basicConfig(
  level=logging.INFO,
  format="%(asctime)s : %(levelname)s : %(message)s",
)

# Loading Data

In [5]:
def cleanText(text):
  """Lower-case ``text`` and normalise its whitespace.

  Leading/trailing whitespace is removed and every run of whitespace
  (spaces, tabs, newlines, ...) is collapsed to a single space.

  The previous implementation collapsed spaces *before* turning tabs into
  spaces, so mixed runs such as ``"a \\t b"`` came out with several
  consecutive spaces. A single ``\\s+`` pass avoids that ordering problem.

  Args:
    text: the raw string to normalise.

  Returns:
    The normalised, lower-cased string.
  """
  text = text.lower().strip()
  # One pass over all whitespace so the result never contains double spaces.
  text = re.sub(r"\s+", " ", text)
  # TODO: a character filter was disabled here because it raised an error:
  #text = re.sub("[^A-Za-z0-9\'\ ]", "", text)  # giving error -> try fixing
  return text

In [6]:
# Read the raw dataset from Google Drive.
# NOTE(review): no cell visible in this notebook mounts Drive, so
# /content/drive is presumably mounted beforehand — confirm.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open("/content/drive/MyDrive/Colab Notebooks/dataset.json", encoding="utf-8") as d:
  dfd_json = json.load(d)

In [7]:
# Parallel lists: one entry per accepted (poem, haiku, word-index list) triple.
poems = []
haikus = []
indices = []

MIN_THRESHOLD_HAIKU = 5    # minimum number of word indices a haiku must have
MAX_THRESHOLD_POEM = 120   # maximum number of whitespace-separated words in a poem

for dataset in dfd_json:
  for poem in dataset:
    for grammar_index in poem:
      # The "poem" key holds the poem text itself; every other key maps to a
      # {haiku: index list} dictionary.
      if grammar_index == "poem":
        continue
      haiku_data = poem[grammar_index]
      for haiku, index in haiku_data.items():
        # Skip haikus with too few indices or poems that are too long.
        if len(index) < MIN_THRESHOLD_HAIKU or len(poem["poem"].split()) > MAX_THRESHOLD_POEM:
          continue
        poems.append(poem["poem"])
        haikus.append(haiku)
        indices.append(index)

In [8]:
# The three lists are appended to together, so their lengths should be equal.
len(poems), len(haikus), len(indices)

(54629, 54629, 54629)

In [9]:
# Normalise case and whitespace of every collected poem.
cleaned_poems = [cleanText(raw_poem) for raw_poem in poems]

In [10]:
# NOTE(review): the column names look swapped relative to their contents —
# "poem" receives the cleaned text and "cleaned_poem" the raw text. Later cells
# read df["poem"], so the names are kept as they are here.
# Rows are de-duplicated on the "poem" column and the index is renumbered.
df = (
  pd.DataFrame({
    "poem": cleaned_poems,
    "cleaned_poem": poems,
    "haiku": haikus,
    "indices": indices,
  })
  .drop_duplicates(subset=["poem"])
  .reset_index(drop=True)
)

In [11]:
# Preview the first rows of the de-duplicated frame.
df.head()

Unnamed: 0,poem,cleaned_poem,haiku,indices
0,did the cia tell the fbi that it knows the wor...,Did the CIA tell the FBI that it knows the wor...,cia fbi the biggest weapon,"[2, 5, 9, 24, 25]"
1,"dark clouds gathered overhead, expelling bulle...","Dark clouds gathered overhead,\nExpelling bull...",clouds overhead bullets of the valley,"[1, 3, 5, 6, 10, 11]"
2,a vigilante lacking of heroic qualities that b...,A vigilante lacking of heroic qualities that\n...,lacking qualities that damn criminals,"[2, 5, 6, 11, 12]"
3,"(a diamante poem) brain heavenly, hellish floa...","(A Diamante Poem)\nBrain\nHeavenly, hellish\nF...",diamante poem the sybaritic pathetic,"[1, 2, 10, 18, 19]"
4,can i break tradition? will it be okay? i'll r...,Can I break tradition?\nWill it be okay?\nI'll...,ill speaks a worst condition,"[8, 31, 37, 47, 48]"


# Converting into Word-Tags Format

In [12]:
def clean(s):
  """Return ``s`` without ASCII punctuation, trimmed and lower-cased.

  Every character listed in ``string.punctuation`` is removed first; the
  remainder is then stripped of surrounding whitespace and lower-cased.
  """
  without_punctuation = "".join(ch for ch in s if ch not in string.punctuation)
  return without_punctuation.strip().lower()

In [13]:
def createWordTagDataFrame(poems, tags):
  """Flatten poems into a one-row-per-word frame with binary labels.

  Each poem is split on whitespace. The word at every position listed in the
  matching ``tags`` entry is labelled '1'; all other words are labelled '0'.
  Words are then normalised with ``clean`` and rows whose word becomes empty
  are dropped together with their label.

  Args:
    poems: sequence of poem strings.
    tags: sequence parallel to ``poems``; each item is an iterable of word
      positions used to index the poem's word list (so negative values count
      from the end, as with ordinary list indexing).

  Returns:
    pandas.DataFrame with columns ``sentence_id`` (1-based poem number),
    ``words`` and ``labels`` ('0' / '1' strings).
  """
  poem_no = []
  word = []
  tag = []
  for i, text in enumerate(poems):
    tokens = text.split()
    poem_no.extend([i + 1] * len(tokens))
    word.extend(tokens)
    token_tags = ['0'] * len(tokens)
    for j in tags[i]:
      try:
        token_tags[j] = '1'
      except (IndexError, TypeError):
        # Best effort: report positions that cannot be applied to this poem
        # and carry on. Only indexing failures are caught, so a kernel
        # interrupt is no longer swallowed by this handler.
        print(f"{tokens}\n{tags[i]}\n\n")
    tag.extend(token_tags)
  word = [clean(w) for w in word]
  dataset = {"sentence_id" : poem_no, "words" : word, "labels" : tag}
  df = pd.DataFrame(dataset)
  # Drop tokens that were nothing but punctuation/whitespace before cleaning.
  df = df[df["words"] != ""]
  return df

In [14]:
# Seeded random split: one uniform draw per row; rows whose draw is below 0.8
# go to the training frame, the rest to the test frame.
np.random.seed(0)
mask = np.random.rand(len(df)) < 0.8
train_df, test_df = df[mask], df[~mask]

In [15]:
# Pull the text and index columns out as arrays for each split and for the
# full frame.
train_poems, train_indices = train_df["poem"].values, train_df["indices"].values
test_poems, test_indices = test_df["poem"].values, test_df["indices"].values
all_poems, all_indices = df["poem"].values, df["indices"].values

In [16]:
# Build the word/label frames. NOTE: df, train_df and test_df are rebound
# here from poem-level frames to word-level frames, so the earlier cells that
# read df["poem"] cannot be re-run after this one without rebuilding df.
df = createWordTagDataFrame(all_poems, all_indices)
train_df = createWordTagDataFrame(train_poems, train_indices)
test_df = createWordTagDataFrame(test_poems, test_indices)

# Creating Model

In [17]:
# Token-classification model built from the "roberta-base" checkpoint with the
# two labels "0" and "1"; the args enable overwrite_output_dir and
# reprocess_input_data.
model = NERModel("roberta", "roberta-base", args={"overwrite_output_dir": True, "reprocess_input_data": True}, labels = ["0", "1"])

2021-04-11 07:46:02,913 : INFO : Lock 140420728496400 acquired on /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b.lock


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

2021-04-11 07:46:03,525 : INFO : Lock 140420728496400 released on /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b.lock
2021-04-11 07:46:04,170 : INFO : Lock 140420447307920 acquired on /root/.cache/huggingface/transformers/51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7.lock


Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

2021-04-11 07:46:18,477 : INFO : Lock 140420447307920 released on /root/.cache/huggingface/transformers/51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7.lock
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some we

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

2021-04-11 07:46:25,621 : INFO : Lock 140420441050512 released on /root/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab.lock
2021-04-11 07:46:26,180 : INFO : Lock 140420441184016 acquired on /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock


Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

2021-04-11 07:46:27,638 : INFO : Lock 140420441184016 released on /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
2021-04-11 07:46:28,207 : INFO : Lock 140420441244560 acquired on /root/.cache/huggingface/transformers/d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock


Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

2021-04-11 07:46:30,124 : INFO : Lock 140420441244560 released on /root/.cache/huggingface/transformers/d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730.lock


In [18]:
# Train on the word-level training frame (columns: sentence_id, words, labels).
model.train_model(train_df)

2021-04-11 07:46:43,081 : INFO :  Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1626 [00:00<?, ?it/s]

2021-04-11 08:10:02,516 : INFO :  Training of roberta model complete. Saved to outputs/.


(1626, 0.20052305548045174)

# Inference

In [19]:
def displayPredictions(predictions, fname, min_words=4):
  """Print and save the poems whose predicted haiku is long enough.

  For every prediction the words are collected in order; the words tagged
  '1' form the haiku. Poems with at least ``min_words`` haiku words are
  printed and written to ``fname`` as a JSON list of
  ``{"poem": ..., "haiku": ...}`` objects.

  Args:
    predictions: iterable with one item per poem; each item is an iterable of
      dicts mapping a word to its predicted tag.
    fname: path of the JSON file to write (opened in "w" mode, so an existing
      file is overwritten).
    min_words: minimum number of words tagged '1' for a poem to be kept.
      Defaults to 4, the threshold that used to be hard-coded.

  Returns:
    None.
  """
  data = []
  for p in predictions:
    poem_words = []
    haiku_words = []
    for d in p:
      for word, tag in d.items():
        poem_words.append(word)
        if tag == '1':
          haiku_words.append(word)
    if len(haiku_words) >= min_words:
      poem = " ".join(poem_words)
      haiku = " ".join(haiku_words)
      print(f"Poem:  {poem}")
      print(f"Haiku:  {haiku}\n")
      data.append({"poem": poem, "haiku": haiku})

  with open(fname, "w") as f:
    json.dump(data, f)

## Test Set

In [20]:
# result, model_outputs, predictions = model.eval_model(test_df)

In [21]:
# Predict tags for the held-out poem strings.
# NOTE(review): training words had punctuation removed by clean(), while these
# poem strings only went through cleanText() and still contain punctuation —
# confirm this train/inference mismatch is intended.
predictions, raw_outputs = model.predict(test_poems)

2021-04-11 08:10:02,670 : INFO :  Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/397 [00:00<?, ?it/s]

In [22]:
# Print the qualifying test-set predictions and save them as JSON.
displayPredictions(predictions, "stf-roberta-test.json")

Poem:  you sit left to me, you don't wanna see, what's really inside me. do you want to hurt me? you could tell it everyone! would it make you feel better? you don't know anything about me. my new name for you is 'abc' i don't need lipstick for attention, i also never owned extentions. the only problem i ever had was you and what you used to do. and the last thing i tell you about me, is why you're so much weaker than me: you only blance on a tightrope that's lying on the floor,
Haiku:  name the only problem

Poem:  alfons schuhbeck top chef. bavaria, germany is one of germany's leading chefs. chef with a wonderful cookbook. dreaming at the lake. eternity of pepper or garlic. fried or baked. german cook to those that can afford food. honesty, a life of a chef.
Haiku:  alfons a wonderful cookbook.

Poem:  a woman's poet with poems about women. beautiful thoughts of a woman. catherine m. wilsonis a poet. dreaming of journeys of the heart. ever a hero's tale. finding a warrior's path. gir

## Train Set

In [23]:
# result, model_outputs, predictions = model.eval_model(train_df)

In [24]:
# Predict tags for the poems the model was trained on; this rebinds
# `predictions` and `raw_outputs` from the test-set run.
predictions, raw_outputs = model.predict(train_poems)

2021-04-11 08:12:27,783 : INFO :  Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1626 [00:00<?, ?it/s]

In [25]:
# Print the qualifying train-set predictions and save them as JSON.
displayPredictions(predictions, "stf-roberta-train.json")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Haiku:  thanks thanks thanks fears.

Poem:  i am free from all the things i have done to me i am free to fly to spread my wings and soar the skies i feel my heart has come to life i feel as if i must be high for i have never felt this way what is this thing that liberates? success...... is the answer. i thanks allah for all he has done to me, thanks for giving me my moon....
Haiku:  things the this way

Poem:  - * thanks for the memories * - you used to send me little things that brightened up my day. you'd send a joke or poem that you'd found along the way. or you'd send a little message that you'd written just for me. how i miss that little kindness, that i no longer see. but you, i'll ere remember until my days are done because in my mind you'll always be my friend-you're number one. thanks for the memories...... author: carolyn ford witt ms. caroline ©55117 10-29-05
Haiku:  thanks memories little things that up

Poem:

## Training and Predicting on the Full Dataset

In [26]:
# Start again from the "roberta-base" checkpoint and train on the word-level
# frame built from all poems; this rebinds `model`, replacing the one trained
# on the training split.
model = NERModel("roberta", "roberta-base", args={"overwrite_output_dir": True, "reprocess_input_data": True}, labels = ["0", "1"])
model.train_model(df)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/2022 [00:00<?, ?it/s]

2021-04-11 09:26:42,122 : INFO :  Training of roberta model complete. Saved to outputs/.


(2022, 0.1976333907985015)

In [27]:
# result, model_outputs, predictions = model.eval_model(df)

In [28]:
# Predict tags for every poem string (the same poems the model was just
# trained on).
predictions, raw_outputs = model.predict(all_poems)

2021-04-11 09:26:42,632 : INFO :  Converting to features started.


  0%|          | 0/2 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/2022 [00:00<?, ?it/s]

In [29]:
# Print the qualifying full-dataset predictions and save them as JSON.
displayPredictions(predictions, "stf-roberta-all.json")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Haiku:  clouds stormy sun as

Poem:  ‘the myrtle bush grew shady down by the ford.’ ‘is it even so?’ said my lady. ‘even so!’ said my lord. ‘the leaves are set too thick together for the point of a sword. ‘the arras in your room hangs close, no light between! you wedded one of those that see unseen.’ ‘is it even so?’ said the king’s majesty. ‘even so!’ said the queen.
Haiku:  bush leaves the arras no

Poem:  why is my verse so barren of new pride, so far from variation or quick change? why with the time do i not glance aside to new-found methods, and to compounds strange? why write i still all one, ever the same, and keep invention in a noted weed, that every word doth almost tell my name, showing their birth, and where they did proceed? o know, sweet love, i always write of you, and you and love are still my argument, so all my best is dressing old words new, spending again what
Haiku:  verse all same, invention

Poem:  

In [30]:
import os
from google.colab import files

# Trigger a download for every entry in the current working directory whose
# name ends in ".json" (not only the files written by this notebook).
for f in os.listdir():
  if not f.endswith(".json"):
    continue
  files.download(f)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>