# Meetup type: generation of training dataset

Approach:

- A curated list of keywords for each one of the meetup types
- Randomly sample biography files from the corpus and select the sentences whose lemmas match any of those keywords
- Assign the relevant meetup type to each one of those sentences
- Save to `outputFile`

In [1]:
# Input parameters for this run
outputIndex = 2    # numeric suffix that distinguishes the output file of this run
sample_size = 500  # how many files are drawn at random from the corpus folder
# Destination CSV for the generated prototype sentences
outputFile = f"meetupType/prototypeSentences_{outputIndex}.csv"

In [2]:
# Seed terms for each meetup type.
# Maps the meetup type label to its curated list of keywords.

meetup_types = {
    "Music making": [
        "Conduct", "Perform", "Join", "Produce",
        "Write", "Sing", "Play",
        # "Song" is deliberately left out of the seeds
        "Record",
    ],
    "Business meeting": [
        "Sign", "Agree", "Contract", "Commit",
        "Retire", "Career", "Hire", "Debut",
    ],
    "Personal life": [
        "Born", "Die", "Marry", "Divorce",
        "Adult", "Child", "Elder", "Young",
        "Father", "Mother", "Family",
    ],
    "Coincidence": [
        "Meet", "Stumble", "Encounter",
        "Find", "Discover", "Trigger",
    ],
    "Public celebration": [
        "Award", "Ceremony", "Marriage",
        "Funeral", "Public", "Celebration",
    ],
    "Education": [
        "Learn", "Teach", "Mentor", "Degree",
        "University", "Academia", "Conservatoire",
    ],
}

print(meetup_types)



{'Music making': ['Conduct', 'Perform', 'Join', 'Produce', 'Write', 'Sing', 'Play', 'Record'], 'Business meeting': ['Sign', 'Agree', 'Contract', 'Commit', 'Retire', 'Career', 'Hire', 'Debut'], 'Personal life': ['Born', 'Die', 'Marry', 'Divorce', 'Adult', 'Child', 'Elder', 'Young', 'Father', 'Mother', 'Family'], 'Coincidence': ['Meet', 'Stumble', 'Encounter', 'Find', 'Discover', 'Trigger'], 'Public celebration': ['Award', 'Ceremony', 'Marriage', 'Funeral', 'Public', 'Celebration'], 'Education': ['Learn', 'Teach', 'Mentor', 'Degree', 'University', 'Academia', 'Conservatoire']}


In [3]:
# Gather prototype sentences and assign them to types
#### Sentence -> lemmas -> match any seed
# Compare query sentences with seed via sentence embeddings



In [4]:
import glob
import random

# Pick a random subset of the files found in the corpus folder.
folder = "indexedSentences"
# random.sample() needs a sequence: passing a set is deprecated since
# Python 3.9 and raises TypeError from 3.11 on. glob already returns unique
# paths, so a sorted list keeps the same population in a reproducible order.
files = sorted(glob.glob(folder + "/*"))
print(len(files), " files" )
# Cap the request at the population size; asking random.sample() for more
# items than are available raises ValueError.
samples = random.sample(files, min(sample_size, len(files))) # take n random biographies
# Report the number of files actually drawn (equals sample_size unless capped).
print(len(samples), " samples" )
print(samples)

1002  files
500  samples
['indexedSentences/295286.csv', 'indexedSentences/1515991.csv', 'indexedSentences/156414.csv', 'indexedSentences/2308152.csv', 'indexedSentences/2928274.csv', 'indexedSentences/362109.csv', 'indexedSentences/1089533.csv', 'indexedSentences/623861.csv', 'indexedSentences/267627.csv', 'indexedSentences/833172.csv', 'indexedSentences/558968.csv', 'indexedSentences/1372632.csv', 'indexedSentences/1828359.csv', 'indexedSentences/1338619.csv', 'indexedSentences/141330.csv', 'indexedSentences/366362.csv', 'indexedSentences/788959.csv', 'indexedSentences/37931.csv', 'indexedSentences/3547359.csv', 'indexedSentences/1790990.csv', 'indexedSentences/3081864.csv', 'indexedSentences/1517676.csv', 'indexedSentences/820730.csv', 'indexedSentences/1354212.csv', 'indexedSentences/3136059.csv', 'indexedSentences/50798971.csv', 'indexedSentences/82413.csv', 'indexedSentences/50674114.csv', 'indexedSentences/1420817.csv', 'indexedSentences/782651.csv', 'indexedSentences/113049.csv

In [5]:
# Look for sentences matching any seed, using spaCy.
#
# One-off setup (run in a terminal/prompt, or un-comment in the notebook):
#import sys
#!{sys.executable} -m pip install spacy
#
# Download the spaCy English model that is loaded below
#!{sys.executable} -m spacy download en_core_web_sm

#### Sentence -> lemmas -> match any seed
import spacy

# Load the pipeline with the parser and NER components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Quick sanity check: the lemma "hang" should be found for "hanging".
sentence = "The striped bats are hanging on their feet for best"
doc = nlp(sentence)
lemmas = [token.lemma_ for token in doc]
print("hang" in lemmas)
#
# Next: for each meetup type, load sentences matching any of the seed terms,
# in order to get a dictionary of types and prototype sentences.
# `nlp` stays in scope for the cells that follow.

True


In [6]:
# Requires `nlp` (the loaded spaCy pipeline) and `meetup_types` to be in scope.

# Clean seeds: replace every seed list, in place, by the lemmas spaCy
# produces for the lower-cased, space-joined seed terms.
for type_name in meetup_types:
    lowered_seeds = " ".join(meetup_types[type_name]).lower()
    meetup_types[type_name] = [tok.lemma_ for tok in nlp(lowered_seeds)]

# Show the cleaned seeds and the container type held for each meetup type.
print(meetup_types)
for type_name, lemma_list in meetup_types.items():
    print(type_name, type(lemma_list))

{'Music making': ['conduct', 'perform', 'join', 'produce', 'write', 'sing', 'play', 'record'], 'Business meeting': ['sign', 'agree', 'contract', 'commit', 'retire', 'career', 'hire', 'debut'], 'Personal life': ['bear', 'die', 'marry', 'divorce', 'adult', 'child', 'elder', 'young', 'father', 'mother', 'family'], 'Coincidence': ['meet', 'stumble', 'encounter', 'find', 'discover', 'trigger'], 'Public celebration': ['award', 'ceremony', 'marriage', 'funeral', 'public', 'celebration'], 'Education': ['learn', 'teach', 'mentor', 'degree', 'university', 'academia', 'conservatoire']}
Music making <class 'list'>
Business meeting <class 'list'>
Personal life <class 'list'>
Coincidence <class 'list'>
Public celebration <class 'list'>
Education <class 'list'>


In [7]:
import csv

# Collect every sentence of the sampled files whose lemmas match at least one
# seed term. Each kept sentence becomes one dict holding the sentence text,
# its position fields, the source file and one boolean column per meetup type.
# Requires `samples`, `meetup_types` (lemmatised seeds) and `nlp` in scope.

# Build the seed sets once instead of once per row and per type.
seed_sets = {mtype: set(mseeds) for mtype, mseeds in meetup_types.items()}

prototypeSentences = []
for sample_file in samples:
    # newline='' is what the csv module asks for, so that newlines embedded
    # in quoted fields are parsed correctly.
    with open(sample_file, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        # The first row is treated as the column-name header and skipped.
        next(csv_reader, None)
        for row in csv_reader:
            # Blank or truncated rows cannot provide the five fields read
            # below; skip them instead of failing with IndexError.
            if len(row) < 5:
                continue
            text, sentence, paragraph, section, wikiId = row[:5]
            # Lemmas of the sentence, as a set for fast overlap tests.
            terms = {token.lemma_ for token in nlp(text)}
            # A type is a candidate when it shares at least one lemma with
            # the sentence.
            candidateTypes = {
                mtype: not seeds.isdisjoint(terms)
                for mtype, seeds in seed_sets.items()
            }
            if not any(candidateTypes.values()):
                continue  # no seed matched: not a prototype sentence
            # Key order matters: it defines the column order of the export.
            data = {
                "sentences": text,
                "sentenceIndex": sentence,
                "paragraphIndex": paragraph,
                "section": section,
                "file": sample_file,
                "wikiId": wikiId,
            }
            data.update(candidateTypes)
            prototypeSentences.append(data)
print( len(prototypeSentences) )

14447


In [8]:
# Write prototypes to file
# Requires `prototypeSentences` (list of dicts), `outputFile` and the `csv`
# module to be in scope.
import os

if prototypeSentences:
    # The keys of the first record define the CSV columns and their order.
    columns = list(prototypeSentences[0].keys())

    # Make sure the destination folder exists; open() does not create it.
    output_dir = os.path.dirname(outputFile)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(outputFile, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file,  fieldnames=columns)
        dict_writer.writeheader()
        dict_writer.writerows(prototypeSentences)
else:
    # Without any record there is no header to derive; indexing [0] would
    # raise IndexError, so report the situation and write nothing.
    print("No prototype sentences to write; skipping", outputFile)

In [9]:
# print(prototypeSentences[1])
# print(prototypeSentences[2])
# print(prototypeSentences[3])