In [6]:
from re import L
from typing import Type
import xml.etree.cElementTree as ET
import spacy 
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.util import minibatch, compounding
from spacy.pipeline import EntityRuler
from spacy.matcher import PhraseMatcher
import pandas as pd
import re
import pytextrank 
import en_core_web_sm
import numpy as np
import random
import io
from collections import Counter

#create a dataframe to store the results of the experiments 
cols = ["dataset","model", "precision", "recall", "f1"]
df = pd.DataFrame(columns =cols)

#Shared functions 
def SaveModelResults(df, dataset, model, results):
    """Use this function to add the results of an experiments to the dataframe"""
    row = pd.Series([dataset,model,results['p'],results['r'],results['f']], index = df.columns)
    df = df.append(row, ignore_index=True)

    return df

def TrainAspectModel(dataset, modeloutput, iterations, drop):
    # Train Aspect detection model using Spacy
    nlp = spacy.blank("en")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
        # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    ner.add_label("ASPECT")
    sizes = compounding(1.0, 4.0, 1.001)

    optimizer = nlp.begin_training()
    for i in range(iterations):
        random.shuffle(dataset)
        batches = minibatch(dataset, size=sizes)
        losses = {}
        
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=drop, losses=losses)

        print("Losses", losses)

    nlp.to_disk(modeloutput,)

def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot)
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores

In [7]:
#Open the hotels xml file
with io.open("hotels.xml", 'r', encoding='utf-8-sig') as f:
    contents = f.read()
    escape_illegal_xml_characters = lambda x: re.sub(u'&', 'and', x)
    contents = escape_illegal_xml_characters(contents)
    tree = ET.fromstring(contents)

with io.open("hotels.xml", 'r', encoding='utf-8-sig') as f:
    contents = f.read()
    escape_illegal_xml_characters = lambda x: re.sub(u'&', 'and', x)
    contents = escape_illegal_xml_characters(contents)
    tree = ET.fromstring(contents)

reviews_output_train = []
reviews_output_test = []

train_term_list = []
test_term_list = []

for fold in tree:
    id = int(fold.attrib["id"])
    for instance in fold:
        sample = ()
        sentext = instance[0].text.lower()
        annotations = {}
        annotations['entities'] = []
        LABEL = "ASPECT"
        overlap = False   

        for aspect in instance[1]:
            aspect_text = aspect.attrib["content"].lower()
            try:
                loc = aspect_text, re.search(aspect_text, sentext).span()
                first = loc[1]

                for a in annotations['entities']:
                    #Make sure that the none of the entities overlap because 
                    # Spacy does not allow overlapping entities 
                    second = a
                    if (int(first[0]) < int(second[1]) and int(first[1]) > int(second[0])):
                        overlap = True
                        break
                    else:
                        overlap = False   

                if not overlap:
                    annotations['entities'].append((loc[1][0], loc[1][1], LABEL))
                    if id < 8:
                        train_term_list.append(loc[0])
                    else:
                        test_term_list.append(loc[0])
            except:
                continue
        
        if id < 8:
            sample = (sentext, annotations)
            reviews_output_train.append(sample)

        else:
            sample = (sentext, annotations['entities'])
            reviews_output_test.append(sample)

TRAIN_DATA_HOTELS = reviews_output_train
TEST_DATA_HOTELS = reviews_output_test

TRAIN_TERMS_HOTELS = train_term_list
TEST_TERMS_HOTELS = train_term_list

In [8]:
#Read the laptops XML File
def XMLToSpacyFormat(XMLFile, Type="Train"):
    tree = ET.parse(XMLFile)
    reviews_input = tree.getroot()
    reviews_output = []
    termlist = []

    for sentence in reviews_input:

        sample = ()
        sentext = sentence[0].text.lower()
        annotations = {}
        annotations['entities'] = []
        LABEL = "ASPECT"

        for aspectTerm in sentence.iter("aspectTerms"):
            for term in aspectTerm:
                termlist.append(term.attrib["term"].lower())
                startTerm = int(term.attrib["from"])
                endTerm = int(term.attrib["to"])
                annotations['entities'].append((startTerm, endTerm,  LABEL))

            if Type=="Train":
                sample = (sentext, annotations)
            else:
                sample = (sentext, annotations['entities'])

            reviews_output.append(sample)

    return reviews_output, termlist

TRAIN_DATA_LAPTOPS, TRAIN_TERMS_LAPTOPS = XMLToSpacyFormat("Laptop_Train_v2.xml", "Train")
TEST_DATA_LAPTOPS, TEST_TERMS_LAPTOPS = XMLToSpacyFormat("Laptops_Test_Data_phaseB.xml", "Test")

TRAIN_DATA_RESTAURANTS, TRAIN_TERMS_RESTAURANTS = XMLToSpacyFormat("Restaurants_Train_v2.xml", "Train")
TEST_DATA_RESTAURANTS, TEST_TERMS_RESTAURANTS = XMLToSpacyFormat("Restaurants_Test_Data_phaseB.xml", "Test")




In [3]:
# Training can take multiple hours so the training step is commented out
# Pretrained models are loaded in a later step
# TrainAspectModel(TRAIN_DATA_HOTELS,"./models_hotels_drop", 30, 0.35)
# TrainAspectModel(TRAIN_DATA_LAPTOPS,"./models_laptops_drop", 30, 0.35)
# TrainAspectModel(TRAIN_DATA_RESTAURANTS,"./models_restaurants_drop", 30, 0.35)

In [9]:
#load and score the training models 
hotel_trained_model = spacy.load("./models_hotels_drop")
results = evaluate(hotel_trained_model, TEST_DATA_HOTELS)["ents_per_type"]["ASPECT"]
df = SaveModelResults(df, "hotels","trained", results)

laptop_trained_model = spacy.load("./models_laptops_drop")
results = evaluate(laptop_trained_model, TEST_DATA_LAPTOPS)["ents_per_type"]["ASPECT"]
df = SaveModelResults(df, "laptops","trained", results)

restaurant_trained_model = spacy.load("./models_restaurants_drop")
results = evaluate(restaurant_trained_model, TEST_DATA_RESTAURANTS)["ents_per_type"]["ASPECT"]
df = SaveModelResults(df, "restaurants","trained", results)



In [10]:
# Use the rule based matcher to create a rule
# to extract nouns and assign them as ents
# The same model can be used for all datasets 
rule_based_model = spacy.load("en_core_web_sm")
ruler = EntityRuler(rule_based_model)
ruler.add_patterns([{"label": "ASPECT", "pattern": [{'POS': 'NOUN'}] }])
rule_based_model.add_pipe(ruler)

#Use the rule-based model on all of the datasets and save the results
results = evaluate(rule_based_model, TEST_DATA_HOTELS)["ents_per_type"]["ASPECT"]
df = SaveModelResults(df, "hotels","rules", results)

results = evaluate(rule_based_model, TEST_DATA_LAPTOPS)["ents_per_type"]["ASPECT"]
df = SaveModelResults(df, "laptops","rules", results)

results = evaluate(rule_based_model, TEST_DATA_RESTAURANTS)["ents_per_type"]["ASPECT"]
df = SaveModelResults(df, "restaurants","rules", results)

blocked_terms = ["hotel"]
component_cfg = {"RemoveWords": {'terms': blocked_terms}}

def RemoveWords(doc, blocked_terms=["hotel","laptop","computer","macbook","restaurant"]):
  new_ents = ()
  for ent in doc.ents:
    if str(ent) not in blocked_terms:
      new_ents += (ent,)
  doc.ents = new_ents
  return doc

rule_based_model2 = spacy.load("en_core_web_sm")
ruler = EntityRuler(rule_based_model2)
ruler.add_patterns([{"label": "ASPECT", "pattern": [{'POS': 'NOUN'}] }])
rule_based_model2.add_pipe(ruler)
rule_based_model2.add_pipe(RemoveWords)




In [11]:
#Term-based model

#Create a list of all unique aspects from the training data
TRAIN_TERMS_HOTELS_UNIQUE = set(TRAIN_TERMS_HOTELS)

nlp_hotel_ruler=spacy.load("en_core_web_sm")
ruler = EntityRuler(nlp_hotel_ruler)

#Add one pattern for each term in the list of aspects
for t in TRAIN_TERMS_HOTELS:
    pattern=[{"label": "ASPECT", "pattern": t}]
    ruler.add_patterns(pattern)

nlp_hotel_ruler.add_pipe(ruler)

results = evaluate(nlp_hotel_ruler, TEST_DATA_HOTELS)["ents_per_type"]["ASPECT"]

df = SaveModelResults(df, "hotels","terms", results)



In [12]:
#Term-based model

#Create a list of all unique aspects from the training data
TRAIN_TERMS_LAPTOPS_UNIQUE = set(TRAIN_TERMS_LAPTOPS)

nlp_laptop_ruler=spacy.load("en_core_web_sm")
ruler = EntityRuler(nlp_laptop_ruler)

#Add one pattern for each term in the list of aspects
for t in TRAIN_TERMS_LAPTOPS_UNIQUE:
    pattern=[{"label": "ASPECT", "pattern": t}]
    ruler.add_patterns(pattern)

nlp_laptop_ruler.add_pipe(ruler)

results = evaluate(nlp_laptop_ruler, TEST_DATA_LAPTOPS)["ents_per_type"]["ASPECT"]

df = SaveModelResults(df, "laptops","terms", results)


In [13]:
#Term-based model

#Create a list of all unique aspects from the training data
TRAIN_TERMS_RESTAURANTS_UNIQUE = set(TRAIN_TERMS_HOTELS)

nlp_restaurant_ruler=spacy.load("en_core_web_sm")
ruler = EntityRuler(nlp_laptop_ruler)

#Add one pattern for each term in the list of aspects
for t in TRAIN_TERMS_RESTAURANTS_UNIQUE:
    pattern=[{"label": "ASPECT", "pattern": t}]
    ruler.add_patterns(pattern)

nlp_restaurant_ruler.add_pipe(ruler)

results = evaluate(nlp_restaurant_ruler, TEST_DATA_RESTAURANTS)["ents_per_type"]["ASPECT"]

df = SaveModelResults(df, "restaurants","terms", results)

In [14]:
#Use the rule-based model on all of the datasets and save the results
results = evaluate(rule_based_model2, TEST_DATA_HOTELS)["ents_per_type"]["ASPECT"]
df = SaveModelResults(df, "hotels","rules2", results)

results = evaluate(rule_based_model2, TEST_DATA_LAPTOPS)["ents_per_type"]["ASPECT"]
df = SaveModelResults(df, "laptops","rules2", results)

results = evaluate(rule_based_model2, TEST_DATA_RESTAURANTS)["ents_per_type"]["ASPECT"]
df = SaveModelResults(df, "restaurants","rules2", results)


In [15]:
#output performance metrics
df

Unnamed: 0,dataset,model,precision,recall,f1
0,hotels,trained,70.771757,76.283186,73.424191
1,laptops,trained,71.451104,71.451104,71.451104
2,restaurants,trained,78.04878,76.460177,77.246312
3,hotels,rules,24.8394,82.123894,38.142211
4,laptops,rules,23.266023,41.798107,29.892837
5,restaurants,rules,42.251886,64.424779,51.033999
6,hotels,terms,48.17898,81.946903,60.68152
7,laptops,terms,57.096774,55.835962,56.45933
8,restaurants,terms,58.452722,36.106195,44.63895
9,hotels,rules2,26.550725,81.061947,40.0


In [16]:

def ExtractTerms(model, dataset):
    """Extract a list of entities that are created by a model and a dataset. 
    This used to understand what kind of entities are extracted to debug the model."""
    terms = []
    for r in dataset:
        doc = model(r[0])
        ents = doc.ents
        for e in ents:
            terms.append(str(e))
    return terms

def CountTerms(term_list):
    """Take a list of terms and return the counts of the most common terms as a 
    Pandas series."""
    # Creating the index 
    counts = pd.Index(term_list, name ='Terms').value_counts()
    return counts



In [17]:
# user to explore is a specific term is extracted
# terms = ExtractTerms(restaurant_hotel_trained_model, TEST_DATA_LAPTOPS)
# dd = CountTerms(terms)
# dd.index.contains('service')
terms = ExtractTerms(restaurant_trained_model, TEST_DATA_RESTAURANTS)
CountTerms(terms)[:10]

food          126
service        75
atmosphere     24
staff          21
menu           18
sushi          18
prices         17
meal           14
drinks         13
waiter         13
Name: Terms, dtype: int64

In [19]:
#Create blended models 
#1 Use the models trained on one dataset and evaulate it using other datasets
#The goal is to see how if the model that serve as a generic model that works
#Well on other datasets 

df_trainon1_testonothers = pd.DataFrame(columns =cols)

In [20]:
results = evaluate(hotel_trained_model, TEST_DATA_LAPTOPS)["ents_per_type"]["ASPECT"]
df_trainon1_testonothers = SaveModelResults(df_trainon1_testonothers, "laptops","trained on hotels", results)

results = evaluate(hotel_trained_model, TEST_DATA_RESTAURANTS)["ents_per_type"]["ASPECT"]
df_trainon1_testonothers = SaveModelResults(df_trainon1_testonothers, "restaurants","trained on hotels", results)

results = evaluate(laptop_trained_model, TEST_DATA_HOTELS)["ents_per_type"]["ASPECT"]
df_trainon1_testonothers = SaveModelResults(df_trainon1_testonothers, "hotels","trained on laptops", results)

results = evaluate(laptop_trained_model, TEST_DATA_RESTAURANTS)["ents_per_type"]["ASPECT"]
df_trainon1_testonothers = SaveModelResults(df_trainon1_testonothers, "restaurants","trained on laptops", results)

results = evaluate(restaurant_trained_model, TEST_DATA_HOTELS)["ents_per_type"]["ASPECT"]
df_trainon1_testonothers = SaveModelResults(df_trainon1_testonothers, "hotels","trained on restaurants", results)

results = evaluate(restaurant_trained_model, TEST_DATA_LAPTOPS)["ents_per_type"]["ASPECT"]
df_trainon1_testonothers = SaveModelResults(df_trainon1_testonothers, "laptops","trained on restaurants", results)

df_trainon1_testonothers

Unnamed: 0,dataset,model,precision,recall,f1
0,laptops,trained on hotels,36.567164,7.728707,12.760417
1,restaurants,trained on hotels,72.813688,33.893805,46.256039
2,hotels,trained on laptops,34.702259,29.911504,32.129278
3,restaurants,trained on laptops,53.597122,26.371681,35.349941
4,hotels,trained on restaurants,46.153846,58.40708,51.5625
5,laptops,trained on restaurants,39.473684,21.293375,27.663934


In [22]:
#Build datasets that are combinations of two different datasets 

TRAIN_DATA_LAPTOPS_HOTELS = TRAIN_DATA_LAPTOPS + TRAIN_DATA_HOTELS
TRAIN_DATA_RESTAURANTS_HOTELS = TRAIN_DATA_RESTAURANTS + TRAIN_DATA_HOTELS
TRAIN_DATA_RESTAURANTS_LAPTOPS = TRAIN_DATA_RESTAURANTS + TRAIN_DATA_LAPTOPS

In [24]:
# Train models on 2 datasets 
# Training can take multiple hours so the training step is commented out
# Pretrained models are loaded in a later step
# TrainAspectModel(TRAIN_DATA_LAPTOPS_HOTELS,"./models_hotels_laptops_drop", 30, 0.35)
# TrainAspectModel(TRAIN_DATA_RESTAURANTS_HOTELS,"./models_restaurants_hotels_drop", 30, 0.35)
# TrainAspectModel(TRAIN_DATA_RESTAURANTS_LAPTOPS,"./models_restaurants_laptops_drop", 30, 0.35)

In [25]:
#Create blended models 
#Train a model on two datasets and apply it to a third data set

df_trainon2_testonother = pd.DataFrame(columns =["dataset","trained on","precision","recall","f1"])

#load and score the training models 
hotel_laptop_trained_model = spacy.load("./models_hotels_laptops_drop")
results = evaluate(hotel_laptop_trained_model, TEST_DATA_RESTAURANTS)["ents_per_type"]["ASPECT"]
df_trainon2_testonother = SaveModelResults(df_trainon2_testonother, "restaurant","hotel+laptop", results)

restaurant_laptop_trained_model = spacy.load("./models_restaurants_laptops_drop")
results = evaluate(restaurant_laptop_trained_model, TEST_DATA_HOTELS)["ents_per_type"]["ASPECT"]
df_trainon2_testonother = SaveModelResults(df_trainon2_testonother, "hotel","restaurant+laptop", results)

restaurant_hotel_trained_model = spacy.load("./models_restaurants_hotels_drop")
results = evaluate(restaurant_hotel_trained_model, TEST_DATA_LAPTOPS)["ents_per_type"]["ASPECT"]
df_trainon2_testonother = SaveModelResults(df_trainon2_testonother, "laptops","restaurant+hotel", results)

In [27]:
df_trainon2_testonother

Unnamed: 0,dataset,trained on,precision,recall,f1
0,restaurant,hotel+laptop,72.340426,33.097345,45.415908
1,hotel,restaurant+laptop,48.547718,62.123894,54.503106
2,laptops,restaurant+hotel,41.935484,12.302839,19.02439


In [35]:
terms = ExtractTerms(restaurant_hotel_trained_model, TEST_DATA_LAPTOPS)
CountTerms(terms)[60:80]

settings                 1
apple removed the dvd    1
creation                 1
simplicity               1
quality                  1
customer service         1
priced                   1
graphics chipset         1
computers                1
seamless                 1
baterry                  1
network lights           1
customize setting        1
graphics stuff           1
laptop                   1
stable                   1
room                     1
acers                    1
experienced              1
retina                   1
Name: Terms, dtype: int64

In [28]:
#Create blended models 
#Train a model on 3 datasets and apply it to each of the test data sets
#Select the same number of samples from each dataset to minimize bias

random.shuffle(TRAIN_DATA_LAPTOPS)
random.shuffle(TRAIN_DATA_HOTELS)
random.shuffle(TRAIN_DATA_RESTAURANTS)


TRAIN_DATA_ALL = TRAIN_DATA_LAPTOPS[:1488] + TRAIN_DATA_HOTELS[:1488] + TRAIN_DATA_RESTAURANTS[:1488]

# Training can take multiple hours so the training step is commented out
# Pretrained models are loaded in a later step
# TrainAspectModel(TRAIN_DATA_ALL,"./models_all_drop", 30, 0.35)


In [29]:
df_trainon3_testonother = pd.DataFrame(columns =["dataset","trained on","precision","recall","f1"])

#load and score the training models 
trained_model_all = spacy.load("./models_all_drop")
results = evaluate(trained_model_all, TEST_DATA_RESTAURANTS)["ents_per_type"]["ASPECT"]
df_trainon3_testonother = SaveModelResults(df_trainon3_testonother, "restaurant","all", results)

results = evaluate(trained_model_all, TEST_DATA_HOTELS)["ents_per_type"]["ASPECT"]
df_trainon3_testonother = SaveModelResults(df_trainon3_testonother, "hotel","all", results)

results = evaluate(trained_model_all, TEST_DATA_LAPTOPS)["ents_per_type"]["ASPECT"]
df_trainon3_testonother = SaveModelResults(df_trainon3_testonother, "laptops","all", results)

df_trainon3_testonother

Unnamed: 0,dataset,trained on,precision,recall,f1
0,restaurant,all,77.276909,74.336283,75.778078
1,hotel,all,60.25825,74.336283,66.561014
2,laptops,all,73.848684,70.820189,72.302738


In [57]:
#Sample using displacy to visualize aspects 
import spacy
from spacy import displacy

doc = laptop_trained_model("I charge it at night and skip taking the cord with me because of the good battery life.")
displacy.render(doc, style="ent", jupyter=True, )
