In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
ROOT_DIR="/content/drive/MyDrive/TwitterSupport/"

In [4]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [26]:
TRAIN_DATA = [
              ("Walmart is a leading e-commerce company", {"entities": [(0, 7, "ORG")]}),
              ("I reached Chennai yesterday.", {"entities": [(19, 28, "GPE")]}),
              ("I recently ordered a book from Amazon", {"entities": [(24,32, "ORG")]}),
              ("I was driving a BMW", {"entities": [(16,19, "PRODUCT")]}),
              ("I ordered this from ShopClues", {"entities": [(20,29, "ORG")]}),
              ("Fridge can be ordered in Amazon ", {"entities": [(0,6, "PRODUCT")]}),
              ("I bought a new Washer", {"entities": [(16,22, "PRODUCT")]}),
              ("I bought a old table", {"entities": [(16,21, "PRODUCT")]}),
              ("I bought a fancy dress", {"entities": [(18,23, "PRODUCT")]}),
              ("I rented a camera", {"entities": [(12,18, "PRODUCT")]}),
              ("I rented a tent for our trip", {"entities": [(12,16, "PRODUCT")]}),
              ("I rented a screwdriver from our neighbour", {"entities": [(12,22, "PRODUCT")]}),
              ("I repaired my computer", {"entities": [(15,23, "PRODUCT")]}),
              ("I got my clock fixed", {"entities": [(16,21, "PRODUCT")]}),
              ("I got my truck fixed", {"entities": [(16,21, "PRODUCT")]}),
              ("Flipkart started it's journey from zero", {"entities": [(0,8, "ORG")]}),
              ("I recently ordered from Max", {"entities": [(24,27, "ORG")]}),
              ("Flipkart is recognized as leader in market",{"entities": [(0,8, "ORG")]}),
              ("Virgin America is recognized as leader in market",{"entities": [(0,14, "ORG")]}),
              ("Virgin America is the best airline ever",{"entities": [(0,14, "ORG")]}),
              ("I recently ordered from Swiggy", {"entities": [(24,29, "ORG")]}),
              ("Project_test is a great airline.", {"entities" : [(0,12, "ORG")]}),
              ("Project_test is a great airline.", {"entities" : [(0,12, "ORG")]}),
              ("Project_test is a great airline.", {"entities" : [(0,12, "ORG")]}),
              ("Project_test is a great airline.", {"entities" : [(0,12, "ORG")]})
              ]

In [27]:
# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [28]:
import random
from spacy.util import minibatch, compounding
from pathlib import Path

# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

  # Training for 30 iterations
  for iteration in range(30):

    # shuufling examples  before every iteration
    random.shuffle(TRAIN_DATA)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
                    texts, 
                    annotations,
                    drop=0.5,
                    losses=losses,
                )
        print("Losses", losses)

Losses {'ner': 1.2409774997753848e-06}
Losses {'ner': 7.564576868577404}
Losses {'ner': 7.965053327129136}
Losses {'ner': 11.79635537799411}
Losses {'ner': 11.991754978541255}
Losses {'ner': 11.991759675346035}
Losses {'ner': 13.991911738356393}
Losses {'ner': 4.6712818382330134e-05}
Losses {'ner': 3.9994641600473613}
Losses {'ner': 4.016545987364194}
Losses {'ner': 4.021727853316343}
Losses {'ner': 7.864001582609107}
Losses {'ner': 15.464015749421778}
Losses {'ner': 15.464015749522721}
Losses {'ner': 2.2436237021406678}
Losses {'ner': 2.5148078247114567}
Losses {'ner': 2.5579658120422533}
Losses {'ner': 6.058731504402453}
Losses {'ner': 6.0587441875069015}
Losses {'ner': 6.067545992568892}
Losses {'ner': 6.067545992573585}
Losses {'ner': 5.048134452723666e-07}
Losses {'ner': 5.23104289191792e-07}
Losses {'ner': 4.120752408997419}
Losses {'ner': 8.001748785174872}
Losses {'ner': 8.002194161693797}
Losses {'ner': 8.002194576341603}
Losses {'ner': 8.002194786837602}
Losses {'ner': 1.5698

In [33]:
doc = nlp("is a project for twitter bot auto_replying Project_test")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('Project_test', 'ORG')]


In [34]:
# Save the  model to directory
output_dir = Path(ROOT_DIR+'model_NER/')
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

Saved model to /content/drive/MyDrive/TwitterSupport/model_NER


In [36]:
def camel_case_split(str):
    words = [[str[0]]]
  
    for c in str[1:]:
        if words[-1][-1].islower() and c.isupper():
            words.append(list(c))
        else:
            words[-1].append(c)
  
    return " ".join([''.join(word) for word in words])

In [39]:
camel_case_split("@AirlineMorocco")

'@Airline Morocco'

In [38]:
import re
import string

def clean_text( txt):

      txt = " ".join([camel_case_split(t) for t in txt.split(" ")])
      txt = re.sub(r"(?<=\w)nt", "not",txt)
      txt = re.sub(r'\W', ' ', str(txt))
      txt = txt.translate(str.maketrans('', '', string.punctuation))
      txt = re.sub(r'\s+[a-zA-Z]\s+', ' ', txt)
      txt = re.sub(r'\s+', ' ', txt, flags=re.I)
      txt = re.sub(r"(http\S+|http)", "", txt)
      return txt

In [40]:
clean_text("@AirlineMorocco why are your first fares in May over three times more than other carriers when all seats are available to select???")

' Airline Morocco why are your first fares in May over three times more than other carriers when all seats are available to select '

In [42]:
doc = nlp('Airline Morocco why are your first fares in May over three times more than other carriers when all seats are available to select ')
print([(X.text, X.label_) for X in doc.ents])

[('Airline Morocco', 'ORG'), ('May', 'ORG')]


In [43]:
nlp = en_core_web_sm.load()

In [45]:
doc = nlp(' Airline Morocco why are your first moroccan airline fares in May over three times more than other carriers when all seats are available to select ')
print([(X.text, X.label_) for X in doc.ents])

[('Airline Morocco', 'PERSON'), ('first', 'ORDINAL'), ('moroccan', 'NORP'), ('May', 'DATE'), ('three', 'CARDINAL')]


In [46]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
import re
import string
from pathlib import Path

class NameEntities:

  def __init__(self):

      self.nlp = en_core_web_sm.load()
      output_dir = Path(ROOT_DIR+'model_NER/')
      print("Loading from", output_dir)
      self.nlp_updated = spacy.load(output_dir)

  def clean_text(self, txt):

      txt = " ".join([self.camel_case_split(t) for t in txt.split(" ")])
      txt = re.sub(r"(?<=\w)nt", "not",txt) #change don't to do not cna't to cannot 
      txt = re.sub(r'\W', ' ', str(txt)) # remove all special characters including apastrophie 
      txt = txt.translate(str.maketrans('', '', string.punctuation)) # remove punctuations 
      txt = re.sub(r'\s+[a-zA-Z]\s+', ' ', txt)   # remove all single characters (it's -> it s then we need to remove s)
      txt = re.sub(r'\s+', ' ', txt, flags=re.I) # Substituting multiple spaces with single space
      txt = re.sub(r"(http\S+|http)", "", txt) # remove links 
      return txt


  def camel_case_split(self, str):
    words = [[str[0]]]
  
    for c in str[1:]:
        if words[-1][-1].islower() and c.isupper():
            words.append(list(c))
        else:
            words[-1].append(c)
  
    return " ".join([''.join(word) for word in words])

  
  def get_Entities(self, text):
      text = self.clean_text(text)
      doc = self.nlp_updated(text)
      labels = [(X.text, X.label_) for X in doc.ents]

      doc = self.nlp(text)
      labels_norm = [(X.text, X.label_) for X in doc.ents]
      labels.extend(labels_norm)

      return labels



In [47]:
ner = NameEntities()

Loading from /content/drive/MyDrive/TwitterSupport/model_NER


In [50]:
ner.get_Entities("Project_test is a good airline")

[('Projecttest', 'ORG')]