In [1]:
import pandas as pd

In [2]:
# Read the train and test splits; both CSVs live under ./data.
df_train, df_test = map(pd.read_csv, ('./data/train.csv', './data/test.csv'))

In [3]:
# Show both raw frames (as a tuple) to check the columns read from the CSVs.
df_train, df_test

(        idx         token   type
 0         0           via      O
 1         0       willing      O
 2         0        remove   B-RT
 3         0          todo   I-RT
 4         0          with      O
 ...     ...           ...    ...
 62147  1999           add   B-AT
 62148  1999          todo   I-AT
 62149  1999          with      O
 62150  1999       content      O
 62151  1999  accidentally  B-CNT
 
 [62152 rows x 3 columns],
        idx        token   type
 0        0         with      O
 1        0        under      O
 2        0      destroy   B-RT
 3        0         todo   I-RT
 4        0   containing      O
 ...    ...          ...    ...
 31547  999     provided  I-CNT
 31548  999          via  I-CNT
 31549  999  furthermore  I-CNT
 31550  999    misguided  I-CNT
 31551  999         well  I-CNT
 
 [31552 rows x 3 columns])

In [4]:
# Rename the raw columns to sentence_id / words / labels in both splits
# (presumably the column names the NER model expects -- confirm).
# One shared mapping keeps the two splits in sync, and reassigning instead of
# using inplace=True means the cell builds new frames rather than mutating
# objects that an earlier cell already displayed.
COLUMN_RENAMES = {"idx": "sentence_id", "token": "words", "type": "labels"}
df_train = df_train.rename(columns=COLUMN_RENAMES)
df_test = df_test.rename(columns=COLUMN_RENAMES)

In [5]:
# Show both frames again to confirm the new column names.
df_train, df_test

(       sentence_id         words labels
 0                0           via      O
 1                0       willing      O
 2                0        remove   B-RT
 3                0          todo   I-RT
 4                0          with      O
 ...            ...           ...    ...
 62147         1999           add   B-AT
 62148         1999          todo   I-AT
 62149         1999          with      O
 62150         1999       content      O
 62151         1999  accidentally  B-CNT
 
 [62152 rows x 3 columns],
        sentence_id        words labels
 0                0         with      O
 1                0        under      O
 2                0      destroy   B-RT
 3                0         todo   I-RT
 4                0   containing      O
 ...            ...          ...    ...
 31547          999     provided  I-CNT
 31548          999          via  I-CNT
 31549          999  furthermore  I-CNT
 31550          999    misguided  I-CNT
 31551          999         well  I-CNT

In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [7]:
# Re-encode sentence_id in each split with its own freshly fitted LabelEncoder,
# so the ids of one split never depend on the other.
for split_df in (df_train, df_test):
  split_df["sentence_id"] = LabelEncoder().fit_transform(split_df["sentence_id"])

In [8]:
# Normalise the tag column to upper case in both splits.
for split_df in (df_train, df_test):
  split_df["labels"] = split_df["labels"].str.upper()

In [9]:
# Inspect the training frame after the rename, sentence_id re-encoding and label upper-casing.
df_train

Unnamed: 0,sentence_id,words,labels
0,0,via,O
1,0,willing,O
2,0,remove,B-RT
3,0,todo,I-RT
4,0,with,O
...,...,...,...
62147,1999,add,B-AT
62148,1999,todo,I-AT
62149,1999,with,O
62150,1999,content,O


In [10]:
# Inspect the test frame after the same preprocessing steps.
df_test

Unnamed: 0,sentence_id,words,labels
0,0,with,O
1,0,under,O
2,0,destroy,B-RT
3,0,todo,I-RT
4,0,containing,O
...,...,...,...
31547,999,provided,I-CNT
31548,999,via,I-CNT
31549,999,furthermore,I-CNT
31550,999,misguided,I-CNT


In [11]:
from simpletransformers.ner import NERModel,NERArgs

In [12]:
# Distinct tags of the training split, kept in order of first appearance;
# this list is what gets passed to the model as its label set.
label = df_train["labels"].drop_duplicates().tolist()
label

['O', 'B-RT', 'I-RT', 'B-CNT', 'I-CNT', 'B-CT', 'I-CT', 'B-AT', 'I-AT']

In [13]:
# Training configuration for the NER model.
args = NERArgs()
args.num_train_epochs = 3
args.learning_rate = 1e-4
args.overwrite_output_dir = True  # reuse the output directory on re-runs
args.train_batch_size = 32
args.eval_batch_size = 32
# Fix the random seed so that Restart & Run All reproduces the same
# fine-tuning run instead of a different one each time.
args.manual_seed = 42

In [14]:
# Candidate pretrained checkpoints:
#   bert-base-cased, bert-base-uncased, roberta-base, roberta-large
# CPU-only (use_cuda=False); the classification head is sized by `label`.
model = NERModel(
    model_type='roberta',
    model_name='roberta-base',
    labels=label,
    args=args,
    use_cuda=False,
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Fine-tune on the training split.
# NOTE(review): eval_data and acc look like they are only consulted when
# evaluate_during_training is enabled, which the args cell does not set -- confirm.
# NOTE(review): sklearn's accuracy_score may not accept per-sentence lists of
# tags; verify before turning on in-training evaluation.
model.train_model(df_train,eval_data=df_test,acc=accuracy_score)

  0%|          | 0/4 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/63 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/63 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/63 [00:00<?, ?it/s]

(189, 0.1884697288340672)

In [16]:
# Evaluate the fine-tuned model on the test split; the call's three return
# values are unpacked into result, model_outputs and preds_list.
result, model_outputs, preds_list = model.eval_model(df_test)

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/32 [00:00<?, ?it/s]

In [17]:
# Show the evaluation metrics (loss, precision, recall, F1).
result

{'eval_loss': 0.001330210311607516,
 'precision': 0.99860529986053,
 'recall': 0.9994416527079844,
 'f1_score': 0.9990233012418028}

In [18]:
def modifyPrompt(p = ""):
  """Return ``p`` with every quote character padded by spaces.

  Both the single quote and the double quote are replaced by the three
  characters space, single quote, space, so a quote never stays glued to
  the word next to it. Text without quotes is returned unchanged.
  """
  quote_padding = str.maketrans({"'": " ' ", "\"": " ' "})
  return p.translate(quote_padding)

In [19]:
prompt = "I want you to add a todo 'go to your home'"
# predict() is given a list of prompts, so the single cleaned prompt is wrapped in a list.
prediction, model_output = model.predict([modifyPrompt(prompt)])

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [20]:
# Token-level tags for the only prompt in the batch: a list of {word: label} dicts.
prediction[0]

[{'I': 'O'},
 {'want': 'O'},
 {'you': 'O'},
 {'to': 'O'},
 {'add': 'B-AT'},
 {'a': 'I-AT'},
 {'todo': 'I-AT'},
 {"'": 'O'},
 {'go': 'B-CNT'},
 {'to': 'I-CNT'},
 {'your': 'I-CNT'},
 {'home': 'I-CNT'},
 {"'": 'O'}]

In [21]:
out = []
for p in prediction[0]:
  x = list(p.values())[0]
  y = list(p.keys())[0]

  if (x != "O"):
    a = x.split("-")
    match a[1]:
      case "AT" | "RT":
        if (a[0] == "B"):
            dict = {}
            dict["TYPE"] = a[1]
            dict["CNT"] = ""
            out.append(dict)
      case "CNT":
        if (len(out) == 0):
          print("type is missing!")
        else:
          if (out[-1]["CNT"] == ""):
            out[-1]["CNT"] = y
          else:
            out[-1]["CNT"] += " " + y
      case _:
        pass

out

[{'TYPE': 'AT', 'CNT': 'go to your home'}]

# Production model

In [22]:
from simpletransformers.ner import NERModel,NERArgs
import pandas as pd

# NOTE(review): the training cells read ./data/train.csv while this cell reads
# ./data-generator/train.csv -- confirm both files yield the same label list,
# since the label list passed here must match the one used for fine-tuning.
df_t = pd.read_csv('./data-generator/train.csv')

# Upper-case before de-duplicating, mirroring the preprocessing applied to the
# labels before training, so the list handed to the checkpoint has the same
# casing as the one it was trained with.
label = df_t["type"].str.upper().unique().tolist()
args = NERArgs()
args.num_train_epochs = 3
args.learning_rate = 1e-4
args.overwrite_output_dir = True
args.train_batch_size = 32
args.eval_batch_size = 32

# Load the checkpoint written at the end of training (step 189, epoch 3); CPU only.
model = NERModel('roberta', './outputs/checkpoint-189-epoch-3', labels=label, args=args, use_cuda=False)
def modifyPrompt(p = ""):
  """Return ``p`` with every quote character padded by spaces.

  Both the single quote and the double quote are replaced by the three
  characters space, single quote, space, so a quote never stays glued to
  the word next to it. Text without quotes is returned unchanged.
  """
  quote_padding = str.maketrans({"'": " ' ", "\"": " ' "})
  return p.translate(quote_padding)

def makePrediction(prompt):
    prediction, model_output = model.predict([modifyPrompt(prompt)])

    pred = prediction[0]
    idx = 0
    n = len(pred)
    for c in pred:
        x = list(c.values())[0]
        y = list(c.keys())[0]
    
        if idx > 0 and idx < n-1:
            l = list(pred[idx-1].values())[0]
            r = list(pred[idx+1].values())[0]
            if  x == "O" and l != "O" and r != "O" and l.split("-")[1] == "CNT" and r.split("-")[1] == "CNT":
                pred[idx][y] = "I-CNT"
    
        idx += 1
    
    out = []
    for p in pred:
      x = list(p.values())[0]
      y = list(p.keys())[0]
    
      if (x != "O"):
        a = x.split("-")
        match a[1]:
          case "AT" | "RT":
            if (a[0] == "B"):
                dict = {}
                dict["TYPE"] = a[1]
                dict["CNT"] = ""
                out.append(dict)
          case "CNT":
            if (len(out) == 0):
              print("type is missing!")
            else:
              if (out[-1]["CNT"] == ""):
                out[-1]["CNT"] = y
              else:
                out[-1]["CNT"] += " " + y
          case _:
            pass

    return out

In [23]:
# Smoke-test the production helper on a "remove" prompt with quoted content.
makePrediction("remove todo 'to your home'")

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'TYPE': 'RT', 'CNT': 'to your home'}]