<a href="https://colab.research.google.com/github/YanjunLin-Andrie/NLP_SpaCy_eBay/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
# Import all libraries and dependencies

import pandas as pd
import spacy
import json
import random
import numpy as np
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tokens import Span
from collections import Counter
from string import punctuation
from random import shuffle
from spacy.scorer import Scorer
from spacy.tokens import Doc
from spacy.training.example import Example

In [51]:
# Import training dataset
from google.colab import files
# uploaded = files.upload()
#read the file
ttt = pd.read_csv('Train_Tagged_Titles.tsv', on_bad_lines = 'skip', sep = '\t')
ttt = ttt.replace(np.nan, 'Brand', regex=True)
#read the file
lt_sm = pd.read_csv('Listing_Titles_sm.tsv')

In [52]:
# File i/o
def load_data(file):
  with open(file, 'r', encoding = 'utf-8') as f:
    data = json.load(f)
  return (data)

def save_data(file, data):
  with open(file, 'w', encoding = 'utf-8') as f:
    json.dump(data, f, indent = 4)

## Use entity ruler to create custom NER model and save to file

In [53]:
# Get all of the unique tags from ttt dataframe and change them to a list format
all_tags = ttt["Tag"].unique().tolist()

# Create an empty list to collect all the patterns of the entity
patterns = []
# Get all of the tag names
for tag in all_tags:
  # Save list of Tokens under the tag name
  items = ttt["Token"].loc[ttt["Tag"] == f"{tag}"].tolist()
  # Loops through created list of Tokens
  for item in items:
    # Adds the new pattern to pattens list
    patterns.append({'label': f'{tag}', 'pattern': item})

In [54]:
# Create model
def generate_rules(patterns):

  # Create a blank English model
  nlp = spacy.blank('en')
  # Create the entity ruler and add to entity pipeline
  ruler = nlp.add_pipe('entity_ruler')
  # add patterns list to model
  ruler.add_patterns(patterns)

  # Save model with patterns
  nlp.to_disk('./ebay_ner')

# call function
generate_rules(patterns)

## Test on the created NER model

In [55]:
# Load model
nlp = spacy.load('./ebay_ner')

In [56]:
# Collecting testing dataset and convert testing data to list format
sentance_list = lt_sm['Title'].tolist()
test = sentance_list[35:45]
test

['Marc Jacobs Black Leather Crossbody Bag ( M0013555-001 ) $ 275.00 NWT',
 'Mimco Echo tote bag - Used but in New condition',
 'Laurel Burch purse hand bag cut out cats geometric 15x9x4 brown',
 "VERSACE JEANS women 's CLUTCH E1VVBBUX_71494",
 'Brighton Handbag Shoulder Bag Tote Raised Flowers Vines Enamel Butterfly',
 '2021 Solid Jacquemus Crocodile Pattern Mini Crossbody Handbag Shoulder Bag',
 'Fossil Small Green Leather Crossbody Shoulder Bag Brass Adjustable w / key Mint',
 'COACH F10909 Soho Black Leather Hobo Shoulder satchel FLAWLESS !',
 '( Velours-Grün ) - Bags4Less Messenger Bag , Velours-Grün ( Green ) - F3151_230 . Fr',
 'Tesco Cute Peter Rabbit Beatrix Potter Shopping Tote Bag New']

In [57]:
# Pass test data into built nlp model to extract entities and tags
for i in test:
    doc = nlp(i)
    results = []
    for ent in doc.ents:
      results.append([ent.text,ent.label_])
    print(results)

[['Marc', 'Product Line'], ['Jacobs', 'Brand'], ['Black', 'Brand'], ['Leather', 'Material'], ['Crossbody', 'Type'], ['Bag', 'Type'], ['(', 'No Tag'], ['-', 'No Tag'], ['001', 'No Tag'], [')', 'No Tag'], ['$', 'No Tag'], ['NWT', 'No Tag']]
[['tote', 'Brand'], ['bag', 'Brand'], ['-', 'No Tag'], ['Used', 'Obscure'], ['in', 'Obscure'], ['New', 'No Tag'], ['condition', 'No Tag']]
[['Laurel', 'Pattern'], ['Burch', 'Brand'], ['purse', 'Type'], ['hand', 'Brand'], ['bag', 'Obscure'], ['out', 'No Tag'], ['brown', 'Color']]
[['VERSACE', 'Brand'], ['JEANS', 'Brand'], ['women', 'Department'], ["'s", 'Brand'], ['CLUTCH', 'Brand']]
[['Brighton', 'Brand'], ['Handbag', 'Type'], ['Shoulder', 'Handle Style'], ['Bag', 'Brand'], ['Tote', 'Brand'], ['Raised', 'No Tag'], ['Flowers', 'Brand'], ['Vines', 'Pattern'], ['Enamel', 'Brand'], ['Butterfly', 'Pattern']]
[['2021', 'No Tag'], ['Solid', 'No Tag'], ['Crocodile', 'Pattern'], ['Pattern', 'No Tag'], ['Mini', 'Model'], ['Crossbody', 'Brand'], ['Handbag', 'Typ

In [49]:
# Optional: save tested data
# save_data('./test.json', results)

## Create NER training set

In [162]:
# Load model
nlp = spacy.load('./ebay_ner') 

TRAIN_DATA = []
for item in ttt['Title'].unique().tolist():
  doc = nlp(item)
  entities = []
  for ent in doc.ents:
    entities.append((ent.text, ent.label_)) 
  if len(entities) > 0:
    TRAIN_DATA = [item, {'entities': entities}]
    print(TRAIN_DATA)
  # Save training data
  save_data('./ebay_training_data.json', TRAIN_DATA)

  # Right format, but json file only has one set of data

['LOUIS VUITTON M40096 Handbag Priscilla Multi-color canvas Multi-color canvas', {'entities': [('LOUIS', 'Product Line'), ('VUITTON', 'Brand'), ('M40096', 'MPN'), ('Handbag', 'Type'), ('Priscilla', 'Model'), ('Multi-color', 'Color'), ('canvas', 'Brand'), ('Multi-color', 'Color'), ('canvas', 'Brand')]}]
['LOUIS VUITTON Petit Noe Drawstring Shoulder Bag Monogram Leather M42226 39SD442', {'entities': [('LOUIS', 'Product Line'), ('VUITTON', 'Brand'), ('Petit', 'Brand'), ('Noe', 'Brand'), ('Drawstring', 'Closure'), ('Shoulder', 'Handle Style'), ('Bag', 'Brand'), ('Monogram', 'Brand'), ('Leather', 'Trim Material'), ('M42226', 'MPN'), ('39SD442', 'No Tag')]}]
['LOUIS VUITTON Damier Azur Pochette Bosphore Shoulder Bag N51112 LV Auth yt523', {'entities': [('LOUIS', 'Product Line'), ('VUITTON', 'Brand'), ('Damier', 'Brand'), ('Azur', 'Color'), ('Pochette', 'Model'), ('Bosphore', 'Model'), ('Shoulder', 'Brand'), ('Bag', 'Type'), ('N51112', 'MPN'), ('LV', 'MPN'), ('Auth', 'No Tag'), ('yt523', 'No 

In [151]:
TRAIN_DATA = []
def train_data(model, text):
  for item in text:
    doc = nlp(item)
    entities = []
    for ent in doc.ents:
      entities.append((ent.text, ent.label_))
    if len(entities) > 0:
      TRAIN_DATA = [item, {'entities': entities}]
      print(TRAIN_DATA)
  return (TRAIN_DATA)  
    


# Load model
nlp = spacy.load('./ebay_ner')  

# Call function
train_data(nlp, ttt['Title'].unique().tolist())

# Save training data
save_data('./ebay_training_data.json', TRAIN_DATA)

print(len(TRAIN_DATA))

['LOUIS VUITTON M40096 Handbag Priscilla Multi-color canvas Multi-color canvas', {'entities': [('LOUIS', 'Product Line'), ('VUITTON', 'Brand'), ('M40096', 'MPN'), ('Handbag', 'Type'), ('Priscilla', 'Model'), ('Multi-color', 'Color'), ('canvas', 'Brand'), ('Multi-color', 'Color'), ('canvas', 'Brand')]}]
['LOUIS VUITTON Petit Noe Drawstring Shoulder Bag Monogram Leather M42226 39SD442', {'entities': [('LOUIS', 'Product Line'), ('VUITTON', 'Brand'), ('Petit', 'Brand'), ('Noe', 'Brand'), ('Drawstring', 'Closure'), ('Shoulder', 'Handle Style'), ('Bag', 'Brand'), ('Monogram', 'Brand'), ('Leather', 'Trim Material'), ('M42226', 'MPN'), ('39SD442', 'No Tag')]}]
['LOUIS VUITTON Damier Azur Pochette Bosphore Shoulder Bag N51112 LV Auth yt523', {'entities': [('LOUIS', 'Product Line'), ('VUITTON', 'Brand'), ('Damier', 'Brand'), ('Azur', 'Color'), ('Pochette', 'Model'), ('Bosphore', 'Model'), ('Shoulder', 'Brand'), ('Bag', 'Type'), ('N51112', 'MPN'), ('LV', 'MPN'), ('Auth', 'No Tag'), ('yt523', 'No 

In [146]:
TRAIN_DATA = []
def train_data(model, text):
  for item in text:
    doc = nlp(item)
    for ent in doc.ents:
      entities = [[ent.text, ent.label_]]
      for entity in entities:
        TRAIN_DATA.append(entity)   

# Load model
nlp = spacy.load('./ebay_ner')  
# Call function
train_data(nlp, ttt['Title'].unique().tolist())
# Save training data
save_data('./ebay_training_data.json', TRAIN_DATA)


print(len(TRAIN_DATA))

38327


## Train an NER model

In [154]:
# Load training dataset
TRAIN_DATA = load_data('./ebay_training_data.json')
TRAIN_DATA

['Botkier Sasha Medium Duffel Bag Coral Leather Zip Top Closure Retail $ 595',
 {'entities': [['Botkier', 'Brand'],
   ['Sasha', 'Model'],
   ['Medium', 'Size'],
   ['Duffel', 'Type'],
   ['Bag', 'Type'],
   ['Coral', 'Color'],
   ['Leather', 'Handle/Strap Material'],
   ['Zip', 'Closure'],
   ['Top', 'No Tag'],
   ['Closure', 'Brand'],
   ['Retail', 'No Tag'],
   ['$', 'No Tag'],
   ['595', 'No Tag']]}]

In [157]:
# Use saved TRAIN_DATA dataset to train a brand new spacy model
def train_spacy(data, iterations):
  TRAIN_DATA = data
  nlp = spacy.blank('en')
  if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe('ner', last = True)
  for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
      ner.add_label(ent[1])   
  other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
  with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(iterations):
      print('Starting iteration ' + str(itn))
      random.shuffle(TRAIN_DATA)
      losses = {}
      for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(text, annotations)
        print(text)
        nlp.update([example], drop = 0.2, sgd = optimizer, losses = losses)
      print(losses)
  return(nlp)
TRAIN_DATA = load_data('./ebay_training_data.json')
nlp = train_spacy(TRAIN_DATA, 30)
# nlp.to_disk('./ebay/ebay_ner_model')

ValueError: ignored

## Create train, test set to train NER model

In [None]:
output_path = './ebay'

In [None]:
import srsly
import typer
import warnings
from tqdm import tqdm
from pathlib import Path
from spacy.tokens import DocBin

def convert(lang: str, TRAIN_DATA, output_path: Path):
    nlp = spacy.blank(lang)
    db = DocBin()
    for text, annot in TRAIN_DATA:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is None:
                msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
                warnings.warn(msg)
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    db.to_disk(output_path)

In [None]:
sentance_list = lt_sm['Title'].tolist()


In [None]:
train = sentance_list[:4993]
valid = sentance_list[4994:]

In [None]:
nlp = spacy.blank('en')
def create_training(TRAIN_DATA):
  db = DocBin()
  for text, annot in tqdm(TRAIN_DATA):
    doc = nlp.make_doc(text)
    ents = []
    for label in annot['entities']:
      span = doc.char_span(label = label, alignment_mode = 'contract')
      if span is None:
        print('Skipping entity')
      else:
        ents.append(span)
    doc.ents = ents
    db.add(doc)
  return(db)

In [None]:
train = create_training(train)
train.to_disk('./ebay/train.spacy')

In [None]:
valid = create_training(valid)
valid.to_disk('./ebay/valid.spacy')