In [None]:
import numpy as np
import pysolr

np.random.seed(42)
import spacy
import sys
import pandas as pd


In [None]:
# The data and output paths used further down are relative, so they only resolve
# when the working directory is the base repo. Print sys.path for inspection, then
# cd into the repo (adjust the path if your checkout lives somewhere else).
print(sys.path)
%cd '~/ml-solr-course'

In [None]:
# TODO (exercise): replace None with the spaCy pipeline loaded from the
# `en_core_web_trf` package (a transformer-based model).
nlp = None
# Keep only the first 100 rows and the five columns this lab works with.
dataset = pd.read_csv('dataset/new_york_reduced.csv')[:100][["id", "name", "description", "neighbourhood_cleansed", "property_type"]]
dataset.head()

In [None]:
# Pull out the description of the very first listing and display it.
first_description = dataset["description"].iat[0]
first_description



In [None]:
# TODO (exercise): call nlp on first_description and assign the result to `doc`.
# Until the placeholder is replaced, the loop below fails because None has no `.ents`.
doc = None

# Print each entity's text, its start/end character offsets, and its label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Each document will have one or more entities. Each entity has a specific type (`LOC`, `ORG`, `DATE`, `CARDINAL`, `GPE`, etc.), plus a starting character and an ending character that define where it sits in the text.

If we have a field that is too expensive to store or index in full, we can instead extract its main named entities at index time and index only those, enriching the index with the resulting fields.

In [None]:
# Module-level cache: update_caches appends one entry to it per call.
tags = []

# We will create a function that takes the descriptions one by one and appends their tags to the tags cache.
def update_caches(document):
    """Append the entity tags of one document to the module-level ``tags`` list.

    Exercise scaffold: the nlp call and the entity collection are left as TODOs,
    so as written ``inner_tags`` stays empty and ``None`` is appended on every call.

    Args:
        document: The text to process (unused until the TODOs are filled in).

    Returns:
        None. The result is recorded by appending to ``tags``.
    """
    doc = None  # TODO (exercise): run nlp on the document
    inner_tags = []
    # TODO (exercise): append each entity's text property to the inner_tags list
    # An empty entity list is stored as None rather than [].
    tags_to_append = inner_tags if len(inner_tags) else None
    tags.append(tags_to_append)

# np.vectorize is a convenience wrapper around a Python-level loop; it lets us map
# the function over an array but does not make it faster.
# otypes is passed explicitly on purpose: without it, np.vectorize makes one extra
# call on the first element to infer the output dtype. Because update_caches appends
# to `tags` as a side effect, that extra call would store the first document twice
# and shift every later tag one row out of alignment with `dataset`.
update_caches = np.vectorize(update_caches, otypes=[object])

In [None]:
# Feed every description through update_caches. Only the side effect on `tags`
# matters, so the returned array is discarded. As you can tell, this takes a while!
_ = update_caches(dataset[["description"]].to_numpy())

While we wait, a good point to notice is that NER is both hard and slow, so it is best to apply it and enrich the documents while we index, since that process already takes time.

Alternatively, we could look for a vectorized implementation of spaCy's `nlp` (I haven't found one), or check whether it can be cythonized (honestly, I haven't tried).

In [None]:
# Attach the collected tags as a new "tags" column, aligned on the row index.
# If `tags` holds more entries than `dataset` has rows, the concat pads the extra
# rows with NaN, which turns the id column into floats. Trim back to the original
# row count BEFORE the numeric conversion so the ids can downcast to integers
# (otherwise they would be written to the CSV as e.g. "2595.0").
row_count = len(dataset)
dataset = pd.concat([dataset, pd.Series(tags, name="tags")], axis=1).iloc[:row_count].copy()
dataset["id"] = pd.to_numeric(dataset["id"], downcast='integer')

In [None]:
# Save the enriched dataset (original columns plus "tags") so it can be indexed in a new core.
# index=False keeps the pandas row index out of the CSV. The path is relative to the
# current working directory.
dataset.to_csv("./4-ner/lab9/expanded_dataset.csv", index=False)