In [2]:
import pandas as pd
import numpy as np
import spacy
import json

## Helper Functions

In [3]:
def highlight_entities(text, ents):
    """Print ``text`` with each entity span rendered in bold blue.

    Args:
        text: the string that the entity offsets index into.
        ents: mapping with an "entities" key; each item is a sequence whose
            first two elements are the start and end character offsets of
            a span. Spans are consumed in the order they are given.
    """
    style_on = '\033[94m\033[1m'  # ANSI escape codes: blue + bold
    style_off = '\033[0m'         # ANSI escape code: reset formatting

    pieces = []
    cursor = 0  # offset in `text` just past the last span already emitted
    for span in ents["entities"]:
        span_start, span_end = span[0], span[1]
        pieces.append(text[cursor:span_start])
        pieces.append(style_on + text[span_start:span_end] + style_off)
        cursor = span_end

    # Whatever follows the last span is emitted unstyled.
    pieces.append(text[cursor:])
    print("".join(pieces))

In [4]:
def text_to_entities(text, nlp):
    """Extract entities from ``text`` with ``nlp`` and print them highlighted.

    Args:
        text: input string to analyse.
        nlp: callable applied to ``text``; its result must expose ``ents``,
            an iterable of spans with ``start_char``, ``end_char`` and
            ``label_`` attributes.

    Returns:
        Tuple ``(text, {"entities": [(start_char, end_char, label, span_text), ...]})``.
    """
    spans = [
        (found.start_char, found.end_char, found.label_,
         text[found.start_char:found.end_char])
        for found in nlp(text).ents
    ]
    annotated = (text, {"entities": spans})

    # Side effect: show the text with the detected spans highlighted.
    highlight_entities(text, annotated[1])
    return annotated

In [5]:
def load_spacy_model(model_path, base_model = "en_core_web_md"):
    """Load a spaCy pipeline saved to ``model_path``.

    The config of ``base_model`` is used to build an empty pipeline of the
    matching language, which is then populated from the ``model_path``
    directory via ``from_disk``.

    Args:
        model_path: directory containing the saved pipeline.
        base_model: name of the installed spaCy pipeline whose config
            describes the saved pipeline.

    Returns:
        The pipeline object returned by ``from_disk``.
    """
    config = spacy.load(base_model).config

    # Take the language from the config instead of hard-coding "en", so the
    # language class always agrees with the config handed to from_config.
    lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"])
    nlp = lang_cls.from_config(config)

    # from_disk reads the model directory itself; no separate file handle is
    # opened here, so nothing is left unclosed.
    return nlp.from_disk(str(model_path))

In [6]:
def save_entities_to_json(ent_result, output_path, id = None):
    """Save a text and its found entities to a JSON file.

    Args:
        ent_result: result from text_to_entities - tuple
            (text, {"entities": [(start, end, label, entity_text), ...]}).
        output_path: path to the output JSON file (overwritten if present).
        id: optional identifier of the text. When given (not None) it is
            stored under an "id" key ahead of "text" and "entities"; when
            omitted the output has no "id" key.

    Returns:
        The dict that was written: {"text": ..., "entities": [{"label": ...,
        "text": ...}, ...]}, with a leading "id" entry when ``id`` was given.
    """
    text = ent_result[0]
    entities = ent_result[1]["entities"]

    # Character offsets are intentionally left out of the saved records.
    entities_list = [
        {"label": label, "text": entity_text}
        for _start, _end, label, entity_text in entities
    ]

    output_data = {}
    if id is not None:
        output_data["id"] = id
    output_data["text"] = text
    output_data["entities"] = entities_list

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print(f"Saved to: {output_path}")
    return output_data

## Load saved model

In [7]:
# Pipeline loaded from the saved model directory; used by the cells below.
geo_model = load_spacy_model(model_path="../models/ner_geo")

In [8]:
# Sample review text used as a one-off input for the loaded model in the next cell.
text = "excellence just ok boyfriend stayed hotel 8/11 time staying inclusive adults resort, peaceful relaxing vacation, property clean staff friendly, brush spanish problem communicating staff, beach absolutely beautiful, utopia, highly recommend taking helicoptor airport, took hour half hotel 2 hours airport bus tropical storm came day lot pot holes roads hotel.the food just ok ask premium alcohol cheap stuff drinks, bring buy bug spray, got eaten alive, loved saxaphone player played lobby evening, played jazz music romantic, liked music bought 2 cd 20.00, got home tried play cd players n't work, sent emails domingo hotel concierge, travel agent called hotel, told mail new cd, weeks ago, nice vacation,  "

In [9]:
# Run the model on the sample text (this also prints the highlighted text),
# then list each detected entity as "label: text".
ent = text_to_entities(text, geo_model)
for found in ent[1]["entities"]:
    # Each item is (start, end, label, entity_text).
    print(f"{found[2]}: {found[3]}")

excellence just ok boyfriend stayed hotel 8/11 time staying inclusive adults resort, peaceful relaxing vacation, property clean staff friendly, brush [94m[1mspanish[0m problem communicating staff, beach absolutely beautiful, utopia, highly recommend taking helicoptor airport, took hour half hotel 2 hours airport bus tropical storm came day lot pot holes roads hotel.the food just ok ask premium alcohol cheap stuff drinks, bring buy bug spray, got eaten alive, loved saxaphone player played lobby [94m[1mevening[0m, played jazz music romantic, liked music bought 2 cd 20.00, got home tried play cd players n't work, sent emails [94m[1mdomingo hotel[0m concierge, travel agent called hotel, told mail new cd, weeks ago, nice vacation,  
gpe: spanish
tim: evening
org: domingo hotel


In [17]:
# Persist the sample result; the returned dict is displayed as the cell output.
save_entities_to_json(ent, output_path="../data/output/NER_output.json")

Saved to: ../data/output/NER_output.json


{'text': "excellence just ok boyfriend stayed hotel 8/11 time staying inclusive adults resort, peaceful relaxing vacation, property clean staff friendly, brush spanish problem communicating staff, beach absolutely beautiful, utopia, highly recommend taking helicoptor airport, took hour half hotel 2 hours airport bus tropical storm came day lot pot holes roads hotel.the food just ok ask premium alcohol cheap stuff drinks, bring buy bug spray, got eaten alive, loved saxaphone player played lobby evening, played jazz music romantic, liked music bought 2 cd 20.00, got home tried play cd players n't work, sent emails domingo hotel concierge, travel agent called hotel, told mail new cd, weeks ago, nice vacation,  ",
 'entities': [{'label': 'gpe', 'text': 'spanish'},
  {'label': 'tim', 'text': 'evening'},
  {'label': 'org', 'text': 'domingo hotel'}]}

## Extract NER from customer_surveys_hotels_1k.json

In [12]:
# ToDo3: read data from file ../data/customer_surveys_hotels_1k.json, read the
# field "review" and extract entities using geo_model, save results to
# ../data/customer_surveys_hotels_1k_ner.json. Save id, text and entities found.

# Load the survey records from the input JSON file
with open('../data/customer_surveys_hotels_1k.json', mode='r', encoding='utf-8') as survey_file:
    surveys = json.load(survey_file)

print(f"Loaded {len(surveys)} reviews")

Loaded 1000 reviews


In [13]:
# Process all reviews and extract entities
results = []

for survey in surveys:
    review_text = survey["review"]
    doc = geo_model(review_text)

    # Built with a comprehension so the loop variable stays local: a plain
    # `for ent in doc.ents` loop here would overwrite the notebook-level
    # `ent` (the sample result from text_to_entities) and break a re-run of
    # the cell that saves it.
    entities = [
        {"label": span.label_, "text": span.text}
        for span in doc.ents
    ]

    results.append({
        "id": survey["id"],
        "text": review_text,
        "entities": entities
    })

print(f"Processed {len(results)} reviews")
# Only show an example when there is one; an empty input would otherwise
# raise IndexError on results[0].
if results:
    print(f"Example: {results[0]}")

Processed 1000 reviews
Example: {'id': '7a823fbc-e97b-4e8b-8fb0-f60dfb79a8ef', 'text': "hotel america nice hotel good location stayed 3 nights hotel america late december, rooms modern nice, really liked location hotel, located 3 blocks main area, excellent location base stay explore interesting parts city, able walk las ramblas neighborhoods gothic district, walked sacred familia cathedral no 15 minutes morning, breakfast adequate run things wait long breakfast, negatives street noise pretty loud room, 5th floor difficult sleep, complicated fact air conditioning not heating mode hotel halls warm floor, meant order sleep open windows cool room meant noise worse, methodology evolved opened windows evening room cold closed bed, problem beds did n't sheet nice comforter, means comforter exposed hard regulate sleeping comfort temperature, set typical spain hard fault hotel america, just aware,  ", 'entities': [{'label': 'org', 'text': 'hotel america'}, {'label': 'org', 'text': 'hotel ameri

In [14]:
# Write the per-review NER results to disk as indented UTF-8 JSON.
# NOTE(review): the ToDo3 comment in the loading cell names
# ../data/customer_surveys_hotels_1k_ner.json as the target, while this cell
# writes to the path below - confirm which one is intended.
output_path = '../data/output/NER_customer_surveys_hotels_1k.json'

with open(output_path, mode='w', encoding='utf-8') as results_file:
    json.dump(results, results_file, indent=2, ensure_ascii=False)

print(f"Saved {len(results)} results to {output_path}")

Saved 1000 results to ../data/output/NER_customer_surveys_hotels_1k.json
