In [1]:
import pandas as pd

# Load Data

In [2]:
artists = pd.read_csv("data/artists.txt", sep="\t", header=None, names=['name'])
events_title = pd.read_csv("data/event_titles.txt", sep="\t", header=None, names=['title'])

## Convert all text to lowercase

In [3]:
#artists = artists.applymap(lambda s:s.lower() if type(s) == str else s)
#events_title = events_title.applymap(lambda s:s.lower() if type(s) == str else s)

In [4]:
artists.head()

Unnamed: 0,name
0,King Crimson
1,Beginner
2,Marcellus Pittman
3,Beck
4,Gästeliste Geisterbahn


In [5]:
events_title.head(n=10)

Unnamed: 0,title
0,Jamey Johnson
1,Alex the Astronaut & Stella Donnelly - Adelaid...
2,Bad Bunny - La Nueva Religion Tour
3,Julien Baker at The Burl
4,SWING pres. Sam Paganini & Zøe
5,Dinosaur Jr Unoffical After Party
6,Ed Sheeran at CenturyLink Field!
7,Sally•Can't•Dance - 40th Anniversary of The Ro...
8,"Teksupport_OTB: Adriatique, Brian Cid"
9,Bert Visscher - Hij wordt vanzelf moe


# Random Sample For Manual Annotation

In [6]:
sample_events_title['title'] = events_title['title'].sample(n=50, random_state=1)

NameError: name 'sample_events_title' is not defined

In [None]:
sample_events_title.to_csv("data/sample_events_titles.csv")

# Explore Preprocessing

After doing a general analysis on the event title dataset, here are some observations:

- The character "-" seems to separate information. More precisely, it looks like it could be useful to identify the location of the event, tour names or slogans
- The character "&" and "," seems to be separating artists.

In [7]:
chars_to_remove = ['•']
split_chars = ['-', '&']

In [8]:
def remove_characters(s: str):
    for char in chars_to_remove:
        s.replace(char, "");
    return s

In [9]:
events_title = events_title.applymap(lambda s: remove_characters(s) if type(s) == str else s)

In [10]:
events_title.head()

Unnamed: 0,title
0,Jamey Johnson
1,Alex the Astronaut & Stella Donnelly - Adelaid...
2,Bad Bunny - La Nueva Religion Tour
3,Julien Baker at The Burl
4,SWING pres. Sam Paganini & Zøe


In [11]:
def split_text(s: str):
    result = []
    for char in split_chars:
        result.extend(s.split(char))
    return result

In [12]:
events_title['splitted_text'] = events_title.applymap(lambda s: split_text(s) if type(s) == str else s)

In [13]:
events_title

Unnamed: 0,title,splitted_text
0,Jamey Johnson,"[Jamey Johnson, Jamey Johnson]"
1,Alex the Astronaut & Stella Donnelly - Adelaid...,"[Alex the Astronaut & Stella Donnelly , Adela..."
2,Bad Bunny - La Nueva Religion Tour,"[Bad Bunny , La Nueva Religion Tour, Bad Bunn..."
3,Julien Baker at The Burl,"[Julien Baker at The Burl, Julien Baker at The..."
4,SWING pres. Sam Paganini & Zøe,"[SWING pres. Sam Paganini & Zøe, SWING pres. S..."
...,...,...
1222,Clowns Celebrate Love w/ Matchmaker Sasha Silb...,[Clowns Celebrate Love w/ Matchmaker Sasha Sil...
1223,Aurora & the Betrayers // Loco Club,"[Aurora & the Betrayers // Loco Club, Aurora ,..."
1224,Desolat with Loco Dice - All Night Terrace Party,"[Desolat with Loco Dice , All Night Terrace P..."
1225,Jeff Dunham in Ontario,"[Jeff Dunham in Ontario, Jeff Dunham in Ontario]"


In [14]:
events_title['splitted_text'].loc[1]

['Alex the Astronaut & Stella Donnelly ',
 ' Adelaide, SA',
 'Alex the Astronaut ',
 ' Stella Donnelly - Adelaide, SA']

# Explore NER Models

## Hugging Face

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [None]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

In [None]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

### Test

In [None]:
import pprint

example = "Six Nations Rugby - Italy vs England"

ner_results = nlp(example)
pprint.pprint(ner_results)

In [None]:
event_list = events_title.title.to_list()

In [None]:
results = []
ENTITIES = ['B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
for event in event_list:
    ner_results = nlp(event)
    if ner_results:
        locations = [r for r in ner_results if r.get('entity') in ENTITIES]
        if len(locations) > 0:
            results.append(locations)


In [None]:
print(f"Number of entities recognized: {len(results)}")
print(f"Number of events in the dataset: {len(event_list)}")

## Spacy

In [15]:
import spacy
from spacy import displacy

In [20]:
NER2 = spacy.load("en_core_web_lg")

In [17]:
NER = spacy.load("en_core_web_trf")

## Test

Using en_core_web_trf

In [18]:
raw_text = "Future • Special guests THEMM / Bubba & Friends • £2 Drinks before midnight"
text1 = NER(raw_text)
for word in text1.ents:
    print(word.text,word.label_)

THEMM PERSON
midnight TIME


Using en_core_web_lg

In [34]:
print("using en_core_web_trf")
raw_text = ["Cirque Du Soleil: Toruk - Il Primo Volo / Torino"]
for text in raw_text:
    text_ner = NER(text)
    for word in text_ner.ents:
        print(word.text,word.label_)
    print("\n")
  

print("using en_core_web_lg")
raw_text = ["Cirque Du Soleil: Toruk - Il Primo Volo / Torino"]
for text in raw_text:
    text_ner = NER2(text)
    for word in text_ner.ents:
        print(word.text,word.label_)
    print("\n")
        
print("using en_core_web_trf")
raw_text = ["Lil Pump STORY - Sat. January 13th",
            "Cattle Decapitation Australian Tour February 2018 w/ Psycroptic",
            "Random Rab at The Chop Shop w/ Edamame - March 8th 2018"]
for text in raw_text:
    text_ner = NER(text)
    for word in text_ner.ents:
        print(word.text,word.label_)
    print("\n")
  

print("using en_core_web_lg")
raw_text = ["Lil Pump STORY - Sat. January 13th",
            "Cattle Decapitation Australian Tour February 2018 w/ Psycroptic",
            "Random Rab at The Chop Shop w/ Edamame - March 8th 2018"]
for text in raw_text:
    text_ner = NER2(text)
    for word in text_ner.ents:
        print(word.text,word.label_)
    

using en_core_web_trf
Cirque Du Soleil WORK_OF_ART
Toruk ORG
Torino GPE


using en_core_web_lg
Cirque Du Soleil ORG


using en_core_web_trf
Sat. January 13th DATE


Australian NORP
February 2018 DATE


Random Rab WORK_OF_ART
The Chop Shop WORK_OF_ART
Edamame PERSON
March 8th 2018 DATE


using en_core_web_lg
January 13th DATE
February 2018 DATE
Random Rab PERSON
The Chop Shop w/ Edamame - March 8th ORG
2018 DATE


## Apply Spacy NER to the entire dataset

In [None]:
events_title.head()

In [None]:
event_list = events_title['splitted_text'].to_list()
results = []
for event_text in event_list:
    for text in event_text:
        ner_results = NER(text)
        for word in ner_results.ents:
            results.append({"word":word, "entities": word.label_})

In [None]:
events_title['splitted_text'].loc[1]

In [None]:
results[:15]