# Setup

Clone the Git repo containing the Named Entity Recognition model and code, which gives access to the custom libraries.

In [None]:
!git clone https://github.com/annielarkins/reddit_ner
!pip install -r ./reddit_ner/requirements.txt -q
!pip install datasets -q
import numpy as np
import torch
import transformers
import pandas as pd
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from reddit_ner import reddit_ner_tokens as get_tokens
from os.path import exists

from sklearn.metrics import roc_curve,confusion_matrix,auc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

from itertools import groupby
from operator import itemgetter
from transformers import pipeline

Cloning into 'reddit_ner'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 27 (delta 6), reused 23 (delta 5), pack-reused 0[K
Unpacking objects: 100% (27/27), done.
[K     |████████████████████████████████| 311 kB 5.7 MB/s 
[K     |████████████████████████████████| 3.5 MB 32.3 MB/s 
[K     |████████████████████████████████| 43 kB 178 kB/s 
[K     |████████████████████████████████| 1.1 MB 33.0 MB/s 
[K     |████████████████████████████████| 133 kB 33.3 MB/s 
[K     |████████████████████████████████| 67 kB 4.0 MB/s 
[K     |████████████████████████████████| 243 kB 31.4 MB/s 
[K     |████████████████████████████████| 596 kB 24.4 MB/s 
[K     |████████████████████████████████| 6.8 MB 38.4 MB/s 
[K     |████████████████████████████████| 895 kB 32.5 MB/s 
[K     |████████████████████████████████| 94 kB 1.6 MB/s 
[K     |████████████████████████████████| 271 kB 32.7 MB/s

Install all requirements (quiet)

Import all libraries 

# Train


Check whether a file from a trained model already exists; if it doesn't, train a new model.

From train_reddit_ner.py

In [None]:
class CreateModel:
    """Fine-tune a pretrained checkpoint for 4-class token classification.

    The classes are ``NA``, ``thing``, ``description`` and ``action``
    (see ``label_list``).  ``createModel`` runs training, evaluates after
    every epoch, performs one final evaluation that fills the confusion
    matrix, saves the model, and returns the collected metrics.
    """

    def __init__(self, model_checkpoint="bert-base-uncased", verbose=False):
        """Store the configuration; nothing is downloaded or trained here.

        Args:
            model_checkpoint: name/path passed to ``from_pretrained``.
            verbose: when True, log at 'info' level and print CUDA status.
        """
        self.verbose = verbose
        # False while training (per-epoch scores are recorded); True for the
        # final evaluation (the confusion matrix is recorded instead).
        self.final_eval = False
        self.model_checkpoint = model_checkpoint

        # Metrics to return
        self.precision_scores = []
        self.recall_scores = []
        self.f1_scores = []
        self.accuracy_scores = []
        self.cm = []

        self.label_list = [
            'NA',           # not highlighted by labels
            'thing',        # noun
            'description',  # adjective
            'action',       # verb
        ]
        self.label_encoding_dict = {'NA': 0, 'thing': 1, 'description': 2, 'action': 3}
        self.task = "ner"  # named entity recognition
        # TODO: Make the training path an arg

    def createModel(self):
        """Train, evaluate and save the model.

        Returns:
            A tuple ``(precision_scores, recall_scores, f1_scores,
            accuracy_scores, cm)``: four lists with one entry per evaluation
            made during training, plus the confusion matrix computed by the
            final evaluation.
        """
        # Start every run from a clean slate.  Without this, a second call on
        # the same instance would keep final_eval == True from the previous
        # run and never record any per-epoch scores.
        self.final_eval = False
        self.precision_scores = []
        self.recall_scores = []
        self.f1_scores = []
        self.accuracy_scores = []
        self.cm = []

        log_level = 'warning'
        if self.verbose:
            log_level = 'info'
            print("Torch Cuda Available: %s" % torch.cuda.is_available())
        batch_size = 16
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_checkpoint)
        # TODO: Change get_un_token_dataset to get dataset
        train_dataset, test_dataset = get_tokens.get_un_token_dataset(
            './reddit_ner/data/train/', './reddit_ner/data/test/'
        )

        train_tokenized_datasets = train_dataset.map(self.tokenize_and_align_labels, batched=True)
        test_tokenized_datasets = test_dataset.map(self.tokenize_and_align_labels, batched=True)

        model = AutoModelForTokenClassification.from_pretrained(
            self.model_checkpoint, num_labels=len(self.label_list)
        )

        args = TrainingArguments(
            f"test-{self.task}",
            evaluation_strategy="epoch",
            learning_rate=1e-4,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=5,
            weight_decay=0.00001,
            log_level=log_level,
        )

        data_collator = DataCollatorForTokenClassification(self.tokenizer)
        self.metric = load_metric("seqeval")
        trainer = Trainer(
            model,
            args,
            train_dataset=train_tokenized_datasets,
            eval_dataset=test_tokenized_datasets,
            data_collator=data_collator,
            tokenizer=self.tokenizer,
            compute_metrics=self.compute_metrics,
        )
        print("About to train")
        trainer.train()
        print("Trained")

        print("about to eval")
        # From here on compute_metrics stores the confusion matrix instead of
        # appending to the per-epoch score lists.
        self.final_eval = True
        trainer.evaluate()

        # NOTE(review): a checkpoint name containing '/' (e.g. an
        # "org/model" id) ends up as a nested directory here - confirm that
        # is intended.
        trainer.save_model('./reddit_ner/reddit-ner' + self.model_checkpoint + '.model')
        print("about to return")
        return self.precision_scores, self.recall_scores, self.f1_scores, self.accuracy_scores, self.cm

    def tokenize_and_align_labels(self, examples):
        """Tokenize pre-split words and expand word-level tags to token level.

        Args:
            examples: a batch with a ``"tokens"`` column (lists of words) and
                a ``"<task>_tags"`` column (one tag per word).

        Returns:
            The tokenizer output with an added ``"labels"`` entry holding one
            label id per token.
        """
        label_all_tokens = True
        tokenized_inputs = self.tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

        labels = []
        for i, label in enumerate(examples[f"{self.task}_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are
                # automatically ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # A tag given as the string '0' is mapped straight to class 0.
                elif label[word_idx] == '0':
                    label_ids.append(0)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(self.label_encoding_dict[label[word_idx]])
                # For the other tokens in a word, we set the label to either the current label or
                # -100, depending on the label_all_tokens flag.
                else:
                    label_ids.append(self.label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
                previous_word_idx = word_idx

            labels.append(label_ids)

        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    def compute_metrics(self, p):
        """Score one evaluation pass; used as the Trainer's metric callback.

        Args:
            p: a ``(predictions, labels)`` pair; predictions carry per-class
                scores on axis 2, labels use -100 for positions to ignore.

        Returns:
            Dict with the overall ``precision``, ``recall``, ``f1`` and
            ``accuracy`` reported by ``self.metric``.

        Side effects:
            While ``final_eval`` is False the four scores are appended to the
            per-epoch lists; once it is True the confusion matrix is stored
            in ``self.cm`` instead.
        """
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [self.label_list[pred_id] for (pred_id, label_id) in zip(pred_row, label_row) if label_id != -100]
            for pred_row, label_row in zip(predictions, labels)
        ]
        true_labels = [
            [self.label_list[label_id] for (pred_id, label_id) in zip(pred_row, label_row) if label_id != -100]
            for pred_row, label_row in zip(predictions, labels)
        ]

        results = self.metric.compute(predictions=true_predictions, references=true_labels)

        if not self.final_eval:
            self.precision_scores.append(results["overall_precision"])
            self.recall_scores.append(results["overall_recall"])
            self.f1_scores.append(results["overall_f1"])
            self.accuracy_scores.append(results["overall_accuracy"])
        else:
            flat_preds = [item for sublist in true_predictions for item in sublist]
            flat_labels = [item for sublist in true_labels for item in sublist]
            # Actual on left, prediction on top: sklearn expects
            # (y_true, y_pred), so the reference labels must come first.
            self.cm = confusion_matrix(flat_labels, flat_preds, labels=self.label_list)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }
    


# Determining the Best Model Checkpoint

In [None]:
# Checkpoints to compare, plus one slot per checkpoint for its results.
models = ['microsoft/layoutlm-base-uncased', 'distilbert-base-uncased', 'bert-large-cased', 'xlnet-base-cased']
accuracies = []
recalls = []
cms = []


for model in models:
    c_model = CreateModel(model_checkpoint=model, verbose=False)
    # Train exactly once per checkpoint and reuse the result for both the
    # printout and the unpacking (calling createModel() twice would run the
    # whole fine-tuning a second time).
    run_metrics = c_model.createModel()
    print(run_metrics)
    precision_scores, recall_scores, f1_scores, accuracy_scores, cm = run_metrics

    # Metrics Graph
    num_points = len(f1_scores)
    epochs = range(1, num_points + 1)
    plt.figure()  # one fresh figure per checkpoint so curves never overlap
    plt.plot(epochs, precision_scores, label="Precision")
    plt.plot(epochs, recall_scores, label="Recall")
    plt.plot(epochs, accuracy_scores, label="Accuracy")
    plt.plot(epochs, f1_scores, label="F1 Score")
    plt.title("Performance Metrics")
    plt.xlabel("Epoch")
    plt.legend()
    # '/' in a checkpoint id would be read as a directory separator.
    plt.savefig(model.replace('/', '-') + '-metrics.png', bbox_inches='tight')
    plt.show()
    print(cm)

    accuracies.append(accuracy_scores)
    recalls.append(recall_scores)
    cms.append(cm)

Downloading:   0%|          | 0.00/170 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/606 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/432M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/layoutlm-base-uncased were not used when initializing LayoutLMForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft

Downloading:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

About to train




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

In [None]:
# Overview Plots
#
# Compares the checkpoints using the per-model results gathered in
# `accuracies`, `recalls` and `cms`.  Only models that actually have results
# are plotted, so an interrupted training loop still yields usable charts.

trained_models = models[:len(cms)]
conf_mats = [np.asarray(cm) for cm in cms]

# Accuracy
plt.figure()
for model_name, acc_history in zip(models, accuracies):
    # Derive the x-axis from each history rather than from a variable left
    # over from the training cell.
    plt.plot(range(1, len(acc_history) + 1), acc_history, label=model_name)
plt.legend()
plt.title("Model Accuracies")
plt.xlabel("Epoch")
plt.savefig('acc-model-comparison.png', bbox_inches='tight')
plt.show()

# Recall
plt.figure()
for model_name, recall_history in zip(models, recalls):
    plt.plot(range(1, len(recall_history) + 1), recall_history, label=model_name)
plt.legend()
plt.title("Model Recalls")
plt.xlabel("Epoch")
plt.savefig('recall-model-comparison.png', bbox_inches='tight')
plt.show()

# True Positives: the diagonal of the 4x4 confusion matrix.
tp_vals = [mat[0][0] + mat[1][1] + mat[2][2] + mat[3][3] for mat in conf_mats]
plt.figure()
plt.bar(trained_models, tp_vals)
plt.title("Model True Positives")
plt.xlabel("Model")
plt.savefig('tp-model-comparison.png', bbox_inches='tight')
plt.show()

# Non-NA True Positives: the diagonal without the NA class (index 0).
nna_tps = [mat[1][1] + mat[2][2] + mat[3][3] for mat in conf_mats]
plt.figure()
plt.bar(trained_models, nna_tps)
plt.title("Model True Positives Excluding NA")
plt.xlabel("Model")
plt.savefig('nnatp-model-comparison.png', bbox_inches='tight')
plt.show()

# Non-NA Accuracies: correct non-NA cells over all cells of the 3x3 block
# that excludes the NA row and column.
nna_accs = []
for mat, nna_tp in zip(conf_mats, nna_tps):
    total_preds = mat[1:4, 1:4].sum()
    # Guard against an empty non-NA block instead of dividing by zero.
    nna_accs.append(nna_tp / total_preds if total_preds else 0.0)
plt.figure()
plt.bar(trained_models, nna_accs)
plt.title("Model Accuracies Excluding NA")
plt.xlabel("Model")
plt.savefig('nnacc-model-comparison.png', bbox_inches='tight')
plt.show()

# Predictions


In [None]:
class PredictionModel:
    """Label every token of a sentence with one of the four keyword classes.

    A tokenizer/model pair must be attached before ``predict`` is called,
    either by loading it with ``createTokenizerAndModel`` or by handing over
    already-loaded objects with ``setTokenizerAndModel``.
    """

    def __init__(self, verbose=False):
        """Create an empty predictor; no model is loaded yet.

        Args:
            verbose: when True, print extra progress/diagnostic output.
        """
        self.label_list = [
            'NA',           # not highlighted by labels
            'thing',        # noun
            'description',  # adjective
            'action',       # verb
        ]
        self.verbose = verbose
        # Initialised to None so predict() can report a missing model
        # instead of raising AttributeError.
        self.tokenizer = None
        self.model = None

    def createTokenizerAndModel(self, model_folder_path):
        """Load tokenizer and model from ``model_folder_path``.

        Loading is best-effort: on failure a message is printed and the
        previously attached tokenizer/model (if any) are left untouched.
        """
        if self.verbose:
            print("Torch Cuda Available: %s" % torch.cuda.is_available())
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_folder_path)
            model = AutoModelForTokenClassification.from_pretrained(
                model_folder_path, num_labels=len(self.label_list)
            )
        except Exception as err:  # deliberate best-effort boundary
            print("Model Not Found: %s\n" % model_folder_path)
            if self.verbose:
                print("Reason: %r" % (err,))
            return
        # Attach both only after both loaded, so the pair is never mismatched.
        self.tokenizer = tokenizer
        self.model = model
        if self.verbose:
            print("Model Trained")

    def setTokenizerAndModel(self, tokenizer, model):
        """Attach an already-loaded tokenizer and model to this predictor."""
        self.tokenizer = tokenizer
        self.model = model

    def predict(self, sentence, output_csv=None):
        """Predict a class for every token of ``sentence``.

        Args:
            sentence: the text to tag.
            output_csv: optional path; when given, the result is also
                written there as CSV.

        Returns:
            A DataFrame with columns ``ner`` (predicted class name) and
            ``words`` (decoded token), or None when no model is attached.
        """
        if self.model is None or self.tokenizer is None:
            print("ERROR: MODEL NOT FOUND")
            return None
        tokens = self.tokenizer(sentence)
        print("Tokens Created")
        # Add the batch dimension the model expects.
        input_ids = torch.tensor(tokens['input_ids']).unsqueeze(0)
        attention_mask = torch.tensor(tokens['attention_mask']).unsqueeze(0)
        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Drop only the batch dimension; an un-dimensioned squeeze() would
        # also collapse a one-token sequence and break the argmax.
        preds = torch.argmax(outputs.logits.squeeze(0), dim=-1)
        print("Prediction made")
        # Decoding each id on its own yields one string per token.
        words = self.tokenizer.batch_decode(tokens['input_ids'])
        print("Words done" )
        value_preds = [self.label_list[class_id] for class_id in preds.tolist()]
        result = pd.DataFrame({'ner': value_preds, 'words': words})
        if output_csv:
            result.to_csv(output_csv)
            print("Values Printed to %s" % output_csv)
        if self.verbose:
            print(result)
            print("\nDone")
        return result



In [None]:
# Fine-tune the XLNet checkpoint and keep every metric createModel() returns.
c_model = CreateModel(verbose=False, model_checkpoint='xlnet-base-cased')
(
    precision_scores,
    recall_scores,
    f1_scores,
    accuracy_scores,
    cm,
) = c_model.createModel()

Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/1 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForTokenClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForTokenClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

About to train




Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.16177,0.543925,0.572647,0.557917,0.939706
2,No log,0.170766,0.577397,0.552968,0.564919,0.94113
3,No log,0.203781,0.587434,0.579534,0.583457,0.943266
4,No log,0.226032,0.560868,0.584782,0.572575,0.940493
5,No log,0.268374,0.571294,0.595277,0.583039,0.942067


  _warn_prf(average, modifier, msg_start, len(result))


Trained
about to eval


about to return


In [None]:
# Build a predictor and try to load the locally saved XLNet model into it.
model = PredictionModel(verbose=False)
model_folder_path = "./reddit_ner/reddit-nerxlnet-base-cased.model/"
model.createTokenizerAndModel(model_folder_path)

404 Client Error: Not Found for url: https://huggingface.co/./reddit_ner/reddit-nerxlnet-base-cased.model//resolve/main/config.json


Model Not Found: ./reddit_ner/reddit-nerxlnet-base-cased.model/



In [None]:
# Sample input; it must be defined here because predict() below uses it.
sentence = "IMO, I would love to see a working CSV of this program"

# Second predictor that shares the tokenizer and weights already loaded on
# `model`, so nothing is loaded again.  The attributes are assigned directly
# because that works with the PredictionModel class as defined in this file.
model1 = PredictionModel(verbose=False)
model1.tokenizer = model.tokenizer
model1.model = model.model
model1.predict(sentence=sentence, output_csv='./reddit_ner/test.csv')

Tokens Created
Prediction made
Words done
Values Printed to ./reddit_ner/test.csv


Unnamed: 0,ner,words
0,,I
1,,MO
2,,","
3,,I
4,,would
5,,love
6,,to
7,,see
8,,a
9,,working


In [None]:
# Evaluate on abstracts: three paper abstracts are defined first, then each is
# tagged by `model` and the per-token result written to its own CSV file.
abstract1 = """Many algorithms have been recently developed for reducing dimensionality by projecting data onto an intrinsic non-linear manifold. Unfortunately, existing algorithms often lose significant precision in this transformation. Manifold Sculpting is a new algorithm that iteratively reduces dimensionality by simulating surface tension in local neighborhoods. We present several experiments that show Manifold Sculpting yields more accurate results than existing algorithms with both generated and natural data-sets. Manifold Sculpting is also able to benefit from both prior dimensionality reduction efforts."""
abstract2 = """Surface diffusion of tungsten adatoms on several smooth, low‐index planes of the tungsten lattice has for the first time been followed by direct observation of individual atoms in the field‐ion microscope. Contrary to expectation, the mobility at room temperature is found to increase. Migrating atoms are reflected at the boundaries of the planes; motion along atomic rows is favored over diffusion across lattice steps. From quantitative determinations of the rate of change of the mean‐square displacement, diffusion coefficients are obtained"""
abstract3 = """We present three systems for surface natural language generation that are trainable from annotated corpora. The first two systems, called NLG1 and NLG2, require a corpus marked only with domain-specific semantic attributes, while the last system, called NLG3, requires a corpus marked with both semantic attributes and syntactic dependency information. All systems attempt to produce a grammatical natural language phrase from a domain-specific semantic representation. NLG1 serves a baseline system and uses phrase frequencies to generate a whole phrase in one step, while NLG2 and NLG3 use maximum entropy probability models to individually generate each word in the phrase. The systems NLG2 and NLG3 learn to determine both the word choice and the word order of the phrase. We present experiments in which we generate phrases to describe flights in the air travel domain."""

ab1_kw = model.predict(abstract1, output_csv='./reddit_ner/abstract_test1.csv')
ab2_kw = model.predict(abstract2, output_csv='./reddit_ner/abstract_test2.csv')
ab3_kw = model.predict(abstract3, output_csv='./reddit_ner/abstract_test3.csv')

Values Printed to ./reddit_ner/abstract_test1.csv
Values Printed to ./reddit_ner/abstract_test2.csv
Values Printed to ./reddit_ner/abstract_test3.csv


## Check to see if the word is in the dictionary

In [None]:
# Install NLTK with pip through a notebook shell escape.
# Disabled alternative, kept for reference: !setup.py install
!pip install nltk

# Disabled follow-ups, kept for reference:
#   %pip install nltk
#   nltk.download()



[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloadin

True

## Model Saving


In [None]:

# I think you only have to do this once....
#https://huggingface.co/docs/transformers/model_sharing

!pip install huggingface_hub 
# Or use transformers-cli if you have transformers
# Log in using the same credentials as huggingface.co/join

# !huggingface-cli login
# !git config --global credential.helper store

# Create a model repo from the CLI if needed
# !huggingface-cli repo create KeywordIdentifier
!sudo apt-get install git-lfs
##CHOOSE WHICH ONE TO SAVE 

# c_model = CreateModel(model_checkpoint='xlnet-base-cased', verbose = False)
# precision_scores, recall_scores, f1_scores, accuracy_scores, cm = c_model.createModel()

#Is this a terrible idea? Probably. 
token = "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"
# model_folder_path = "./reddit_ner/reddit-nerxlnet-base-cased.model/"
# model = AutoModel.from_pretrained(model_folder_path) #, num_labels=len(self.label_list))
model.model.push_to_hub("jasminejwebb/KeywordIdentifier", use_auth_token = token)
model.tokenizer.push_to_hub("jasminejwebb/KeywordIdentifier", use_auth_token = token)
print("Pushed to hub")


Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following packages were automatically installed and are no longer required:
  cuda-command-line-tools-10-0 cuda-command-line-tools-10-1
  cuda-command-line-tools-11-0 cuda-compiler-10-0 cuda-compiler-10-1
  cuda-compiler-11-0 cuda-cuobjdump-10-0 cuda-cuobjdump-10-1
  cuda-cuobjdump-11-0 cuda-cupti-10-0 cuda-cupti-10-1 cuda-cupti-11-0
  cuda-cupti-dev-11-0 cuda-documentation-10-0 cuda-documentation-10-1
  cuda-documentation-11-0 cuda-documentation-11-1 cuda-gdb-10-0 cuda-gdb-10-1
  cuda-gdb-11-0 cuda-gpu-library-advisor-10-0 cuda-gpu-library-advisor-10-1
  cuda-libraries-10-0 cuda-libraries-10-1 cuda-libraries-11-0
  cuda-memcheck-10-0 cuda-memcheck-10-1 cuda-memcheck-11-0 cuda-nsight-10-0
  cuda-nsight-10-1 cuda-nsight-11-0 cuda-nsight-11-1 cuda-nsight-compute-10-0
  cuda-nsight-compute-10-1 cuda-nsight-compute-11-0 cuda-nsight-compute-11-

# Model Loading


In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Hub repository the fine-tuned keyword model is loaded from.
model_folder_path = "jasminejwebb/KeywordIdentifier"

# Stand-alone handles on the model (4 output classes) and its tokenizer.
load_model = AutoModelForTokenClassification.from_pretrained(
    model_folder_path,
    num_labels=4,
)
load_tokenizer = AutoTokenizer.from_pretrained(model_folder_path)

# Predictor used for the abstract evaluation; it loads the same repository.
test_model = PredictionModel(verbose=False)
test_model.createTokenizerAndModel(model_folder_path)


Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/519 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/291 [00:00<?, ?B/s]

In [None]:

# Same three abstracts as before, this time tagged by `test_model`; each
# per-token result is written to its own CSV file.
abstract1 = """Many algorithms have been recently developed for reducing dimensionality by projecting data onto an intrinsic non-linear manifold. Unfortunately, existing algorithms often lose significant precision in this transformation. Manifold Sculpting is a new algorithm that iteratively reduces dimensionality by simulating surface tension in local neighborhoods. We present several experiments that show Manifold Sculpting yields more accurate results than existing algorithms with both generated and natural data-sets. Manifold Sculpting is also able to benefit from both prior dimensionality reduction efforts."""
abstract2 = """Surface diffusion of tungsten adatoms on several smooth, low‐index planes of the tungsten lattice has for the first time been followed by direct observation of individual atoms in the field‐ion microscope. Contrary to expectation, the mobility at room temperature is found to increase. Migrating atoms are reflected at the boundaries of the planes; motion along atomic rows is favored over diffusion across lattice steps. From quantitative determinations of the rate of change of the mean‐square displacement, diffusion coefficients are obtained"""
abstract3 = """We present three systems for surface natural language generation that are trainable from annotated corpora. The first two systems, called NLG1 and NLG2, require a corpus marked only with domain-specific semantic attributes, while the last system, called NLG3, requires a corpus marked with both semantic attributes and syntactic dependency information. All systems attempt to produce a grammatical natural language phrase from a domain-specific semantic representation. NLG1 serves a baseline system and uses phrase frequencies to generate a whole phrase in one step, while NLG2 and NLG3 use maximum entropy probability models to individually generate each word in the phrase. The systems NLG2 and NLG3 learn to determine both the word choice and the word order of the phrase. We present experiments in which we generate phrases to describe flights in the air travel domain."""

ab1_kw = test_model.predict(abstract1, output_csv='./reddit_ner/abstract_test1.csv')
ab2_kw = test_model.predict(abstract2, output_csv='./reddit_ner/abstract_test2.csv')
ab3_kw = test_model.predict(abstract3, output_csv='./reddit_ner/abstract_test3.csv')

Tokens Created
Prediction made
Words done
Values Printed to ./reddit_ner/abstract_test1.csv
Tokens Created
Prediction made
Words done
Values Printed to ./reddit_ner/abstract_test2.csv
Tokens Created
Prediction made
Words done
Values Printed to ./reddit_ner/abstract_test3.csv
