In [1]:
import pandas as pd

# Load Data

In [2]:
artists = pd.read_csv("data/artists.txt", sep="\t", header=None, names=['name'])
events_title = pd.read_csv("data/event_titles.txt", sep="\t", header=None, names=['title'])

# Explore Processing

After doing a general analysis on the event title dataset, here are some observations:

- The character "-" seems to separate information. More precisely, it looks like it could be useful to identify the location of the event, tour names or slogans
- The character "&" and "," seems to be separating artists.

In [3]:
chars_to_remove = ['•']
split_chars = ['-', '&']

In [4]:
def remove_characters(s: str):
    for char in chars_to_remove:
        s.replace(char, "");
    return s

In [5]:
events_title = events_title.applymap(lambda s: remove_characters(s) if type(s) == str else s)

In [6]:
import re

def split_text(s: str):
    result = re.split('&|-', s)
    return [r.strip() for r in result]

In [7]:
events_title['splitted_text'] = events_title.applymap(lambda s: split_text(s) if type(s) == str else s)

In [8]:
events_title.head()

Unnamed: 0,title,splitted_text
0,Jamey Johnson,[Jamey Johnson]
1,Alex the Astronaut & Stella Donnelly - Adelaid...,"[Alex the Astronaut, Stella Donnelly, Adelaide..."
2,Bad Bunny - La Nueva Religion Tour,"[Bad Bunny, La Nueva Religion Tour]"
3,Julien Baker at The Burl,[Julien Baker at The Burl]
4,SWING pres. Sam Paganini & Zøe,"[SWING pres. Sam Paganini, Zøe]"


In [27]:
class Entity:

    def __init__(self, text, label):
        self.text = text
        self.label = label


class EventTitle:

    def __init__(self, title, splitted_text):
        self.title = title
        self.splitted_text = splitted_text
        self.entities = []

In [28]:
event_titles = []
for title, splitted in zip(events_title['title'].to_list(), events_title['splitted_text'].to_list(),):
    event_titles.append(EventTitle(title=title, splitted_text=splitted))

# Spacy Model

In [29]:
import spacy
from spacy import displacy

In [30]:
NER = spacy.load("en_core_web_lg")

In [35]:
for event_title in event_titles:
    for text in event_title.splitted_text:
        ner_results = NER(text)
        for entity in ner_results.ents:
            new_entity = Entity(text=entity, label=entity.label_ )
            event_title.entities.append(new_entity)

In [39]:
print(event_titles[0].entities[0].label)
print(event_titles[0].entities[0].text)

PERSON
Jamey Johnson


## Different Entities Identified

In [18]:
set(entities)

{'CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'TIME',
 'WORK_OF_ART'}

## Filter Results

In [19]:
filter_key = 'PERSON'
[result for result in results if result.get('entities') == filter_key]

[{'word': Jamey Johnson, 'entities': 'PERSON'},
 {'word': Alex the Astronaut, 'entities': 'PERSON'},
 {'word': Stella Donnelly, 'entities': 'PERSON'},
 {'word': Julien Baker, 'entities': 'PERSON'},
 {'word': Sam Paganini, 'entities': 'PERSON'},
 {'word': Dinosaur Jr Unoffical, 'entities': 'PERSON'},
 {'word': Ed Sheeran, 'entities': 'PERSON'},
 {'word': Brian Cid, 'entities': 'PERSON'},
 {'word': Bert Visscher, 'entities': 'PERSON'},
 {'word': Hij wordt, 'entities': 'PERSON'},
 {'word': moe, 'entities': 'PERSON'},
 {'word': Billie Eilish, 'entities': 'PERSON'},
 {'word': Rhiannon Giddens, 'entities': 'PERSON'},
 {'word': Bon Iver, 'entities': 'PERSON'},
 {'word': Madison, 'entities': 'PERSON'},
 {'word': Craig Charles, 'entities': 'PERSON'},
 {'word': Alazka, 'entities': 'PERSON'},
 {'word': Butch Anja Schneider La, 'entities': 'PERSON'},
 {'word': Jimi Jules, 'entities': 'PERSON'},
 {'word': Lacey Sturm, 'entities': 'PERSON'},
 {'word': Péterfy Bori, 'entities': 'PERSON'},
 {'word': S