<a href="https://colab.research.google.com/github/aakashagarwal6898/Emotion-Detection-using-Keras/blob/master/Ocular_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install spacy==2.1.4 #TO AVOID CASCADED ENTITY  ERROR
#!pip install -U spacy[cuda100]==2.1.1 #GPU enabled spacy

In [0]:
import json
import random
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from sklearn.metrics import accuracy_score
import re
import spacy


In [0]:
def convert_dataturks_to_spacyy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export into spaCy-style training tuples.

    The input file is expected to hold one JSON object per line, each with a
    ``content`` string and an ``annotation`` list (or ``None``). Every
    annotation contributes its first ``points`` entry; the entry's inclusive
    ``start``/``end`` offsets are tightened to exclude leading/trailing
    whitespace found in the annotated ``text`` and converted to an exclusive
    end offset.

    Args:
        dataturks_JSON_FilePath: Path of the Dataturks export file.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        or ``None`` if the file could not be read or parsed (the error is
        logged).
    """
    try:
        training_data = []
        # JSON text is UTF-8; be explicit instead of relying on the locale.
        with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
            for line in f:
                # A blank (e.g. trailing) line is not a record; without this
                # guard json.loads() would raise and the whole file would be
                # discarded.
                if not line.strip():
                    continue

                data = json.loads(line)
                text = data['content']
                entities = []
                data_annotations = data['annotation']
                if data_annotations is not None:
                    for annotation in data_annotations:
                        # only a single point in text annotation.
                        point = annotation['points'][0]
                        labels = annotation['label']
                        # handle both list of labels or a single label.
                        if not isinstance(labels, list):
                            labels = [labels]

                        point_text = point['text']
                        # A whitespace-only annotation has no usable span;
                        # adjusting its offsets would yield start > end.
                        if not point_text.strip():
                            continue

                        # The offsets do not depend on the label, so compute
                        # them once per annotation.
                        lstrip_diff = len(point_text) - len(point_text.lstrip())
                        rstrip_diff = len(point_text) - len(point_text.rstrip())
                        point_start = point['start'] + lstrip_diff
                        point_end = point['end'] - rstrip_diff

                        for label in labels:
                            # Dataturks 'end' is inclusive; spaCy wants exclusive.
                            entities.append((point_start, point_end + 1, label))
                training_data.append((text, {"entities": entities}))
        return training_data
    except Exception as e:
        # Lazy %-formatting also works when the path is not a plain str.
        logging.exception("Unable to process %s\nerror = %s", dataturks_JSON_FilePath, e)
        return None
        

In [0]:
import re


def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Each span is shrunk from both sides until it starts and ends on a
    non-whitespace character. Scanning never leaves the span itself: an end
    offset past the end of the text is clamped to ``len(text)``, and spans
    that contain nothing but whitespace are dropped, since they cannot be
    aligned to any token.

    Args:
        data (list): The data to be cleaned in spaCy JSON format, i.e. a
            sequence of ``(text, {'entities': [(start, end, label), ...]})``.

    Returns:
        list: The cleaned data as ``[text, {'entities': [[start, end, label],
            ...]}]`` entries.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            # Clamp so text[valid_end - 1] cannot raise IndexError.
            valid_end = min(end, len(text))
            # Bound both scans by the opposite edge so they can never cross.
            while valid_start < valid_end and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > valid_start and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            if valid_start >= valid_end:
                # Nothing but whitespace (or an inverted span): skip it.
                continue
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

In [0]:
import spacy
import random

def train_spacy(data_path="/content/ADIL_dataset_800.json",
                output_dir="/content/drive/My Drive/ocular_final_training/ner_100",
                n_iter=20,
                dropout=0.3):
    """Train a blank English spaCy pipeline containing only an NER component.

    The Dataturks export at ``data_path`` is converted and whitespace-trimmed,
    every label found in it is registered with the NER component, and the
    model is updated one example at a time for ``n_iter`` passes over the
    shuffled data. The pipeline is written to ``output_dir`` after every pass.

    Args:
        data_path: Path of the Dataturks JSON-lines training file.
        output_dir: Directory the pipeline is saved to after each iteration.
        n_iter: Number of passes over the training data.
        dropout: Dropout rate passed to ``nlp.update``.

    Returns:
        The trained spaCy ``Language`` object.

    Raises:
        ValueError: If the training data could not be loaded.
    """
    raw_data = convert_dataturks_to_spacyy(data_path)
    if raw_data is None:
        # The converter logs the underlying error and returns None; fail here
        # with a clear message instead of a TypeError further down.
        raise ValueError("Could not load training data from " + str(data_path))
    TRAIN_DATA = trim_entity_spans(raw_data)

    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        # Keep `ner` bound even if the pipeline already provides the component.
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=dropout,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
            # Saved after every iteration (overwriting the previous save), so
            # the latest weights are on disk even if a later pass is interrupted.
            nlp.to_disk(output_dir)
    return nlp
 

In [10]:
# Train the NER pipeline and keep the returned model; the prediction cell
# further down calls `nlpModel` on the text to analyse.
nlpModel = train_spacy()

Starting iteration 0
{'ner': 30550.171418521782}
Starting iteration 1
{'ner': 20927.656434691595}
Starting iteration 2
{'ner': 17521.76404734125}
Starting iteration 3
{'ner': 14847.324780167994}
Starting iteration 4
{'ner': 12927.412013169069}
Starting iteration 5
{'ner': 11870.408442600332}
Starting iteration 6
{'ner': 11260.5792119189}
Starting iteration 7
{'ner': 11004.502749313011}
Starting iteration 8
{'ner': 9732.409349565587}
Starting iteration 9
{'ner': 9114.690577490934}
Starting iteration 10
{'ner': 8427.844530666303}
Starting iteration 11
{'ner': 8907.852352691636}
Starting iteration 12
{'ner': 8593.68406971053}
Starting iteration 13
{'ner': 7857.249988236488}
Starting iteration 14
{'ner': 7998.509769759586}
Starting iteration 15
{'ner': 7160.963743739495}
Starting iteration 16
{'ner': 7469.112571175714}
Starting iteration 17
{'ner': 6696.6775124185615}
Starting iteration 18
{'ner': 6325.37412433198}
Starting iteration 19
{'ner': 6526.7565269039}


In [13]:
# Path of the plain-text file to run the trained model on.
test_file_path = "/content/apred-test.txt"
# Read the whole file; the context manager closes the handle even on error.
with open(test_file_path, "r") as f:
    textToPredict = f.read()
# Send the text through the trained spaCy model for NER.
doc = nlpModel(textToPredict)
max_amt = 0  # largest "Total bill amount" seen so far
i = 1  # suffix counter for labels that occur more than once
data = {}
items_list = []
# Iterate over every entity to build the result dictionary.
for ent in doc.ents:
    # Keep a single "Total bill amount": the largest positive value detected.
    if ent.label_ == "Total bill amount":
        try:
            amt = float(ent.text)
        except ValueError:
            # The entity text is not a plain number; ignore it.
            continue
        if amt > max_amt:
            # Track the running maximum so a later, smaller amount cannot
            # overwrite the stored total.
            max_amt = amt
            data["Total bill amount"] = amt
    # Collect every detected item.
    elif ent.label_ == "Items":
        items_list.append(ent.text)
    # If the label is already present as a key, store the value under a new
    # suffixed key instead of overwriting the previous one.
    else:
        if ent.label_ in data:
            data[ent.label_ + "-" + str(i)] = ent.text
            i += 1
        else:
            data[ent.label_] = ent.text
# Store the list of items under the "Items" key.
data["Items"] = items_list
# Sort the dictionary by key.
data = dict(sorted(data.items()))
# Print the final result.
print(json.dumps(data, indent=2))


{
  "Date": "04/13/18",
  "Invoice number": "415",
  "Items": [
    "DINNER RODIZIO",
    "ROTI",
    "Paneer",
    "Kheer",
    "coca cola"
  ],
  "Store address": "2023 South Pine Avenue\nOcala, Florida 34471",
  "Store name": "Brazilian Steak House",
  "Time": "09:16 pm"
}
