In [3]:
!pip install gender-guesser
import json
import re
import nltk
from nltk.corpus import wordnet
import calendar
import gender_guesser.detector as gender
import random

You should consider upgrading via the 'c:\users\movvam\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


## Band Name Placeholder

In [4]:
def band_placeholder(data, text):
    band_name = data['Name']
    if '(' in band_name: # some bands have disambiguation in the title such as "Mother Earth (American band)"
        band_name = ' '.join(band_name.split(' (')[:-1])
    text = re.sub(band_name, '[BAND_NAME]', text)
    return text

## Year Placeholder

In [5]:
def year_placeholder(text):
    return re.sub("[0-9]{4}", '[YEAR]', text)

## Month Placeholder

In [6]:
def month_placeholder(text):
    months = [calendar.month_name[i] for i in range(1,13)] + [calendar.month_abbr[i] for i in range(1,13)]
    for month in months:
        text = re.sub(month, '[MONTH]', text)
        
    return text

## Name Placeholder
https://stackoverflow.com/questions/20290870/improving-the-extraction-of-human-names-with-nltk
Shivansh bhandari's answer

In [7]:
def get_person_names(data, text):
    person_list = []
    person_names=person_list
    def get_human_names(text):
        tokens = nltk.tokenize.word_tokenize(text)
        pos = nltk.pos_tag(tokens)
        sentt = nltk.ne_chunk(pos, binary = False)

        person = []
        name = ""
        for subtree in sentt.subtrees(filter=lambda t: t.label() == 'PERSON'):
            for leaf in subtree.leaves():
                person.append(leaf[0])
            if len(person) > 1: #avoid grabbing lone surnames
                for part in person:
                    name += part + ' '
                if name[:-1] not in person_list:
                    person_list.append(name[:-1])
                name = ''
            person = []

    names_ = get_human_names(text)
    for person in person_list:
        person_split = person.split(" ")
        for name in person_split:
            if wordnet.synsets(name):
                if(name in person):
                    person_names.remove(person)
                    break
                    
    # add member names from infobox
    if 'infobox' in data.keys():
        members = []
        if 'Members' in data['infobox'].keys():
            members += data['infobox']['Members']

        if 'Past members' in data['infobox'].keys():
            members += data['infobox']['Past members']
            
        # some members have extra information that needs to be removed such as 'Earl Yager - bass'
        for member in members:
            member = member.split()
            if len(member) > 1:
                member = member[0] + member[1]
            else:
                member = member[0]

            person_names.append(member)
            
    person_names = set(person_names)
    return person_names

In [14]:
def get_name_genders(person_names):
    name_genders = {}
    d = gender.Detector()
    for name in person_names:
        g = d.get_gender(name.split()[0])
        if 'female' in g:
            name_genders[name] = 'FEMALE'
        elif 'male' in g:
            name_genders[name] = 'MALE'
        elif g == 'andy':
            name_genders[name] = random.choice(['MALE', 'FEMALE'])
        else:
            continue # unknown name
            
    return name_genders

In [8]:
def person_name_placeholder(data, text):
    person_names = get_person_names(data, text)
    name_genders = get_name_genders(person_names)
    i = 0
    for name,gender in name_genders.items():
        text = re.sub(name, '[PERSON_NAME_FULL_' + str(i) + '_' + gender + ']', text)
        if len(name.split()) > 1: # some only have first names
            text = re.sub(name.split()[0], '[PERSON_NAME_FIRST_' + str(i) + ']', text) # replace first name
            text = re.sub(name.split()[1], '[PERSON_NAME_LAST_' + str(i) + ']', text) # replace last name
        i += 1
        
    return text

## Genre Placeholder

### Get all genres

In [9]:
with open('data/consolidatedData.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
    
genres = set()
for genre in data['allGenres']:
    if isinstance(genre, str):
        g = re.split(',|\[', genre)[0]
        genres.add(g)
        
genres = list(genres) # need to convert to list to be able to take a random choice from it

In [10]:
def genre_placeholder(text):
    part2 = re.split(' a | an ', text)
    genre = None
    if len(part2) > 1:
        genre = re.split(' band', part2[1])[0]
    print(part2)   
    print(genre)
    if genre:
        text = re.sub(genre, '[GENRE]', text)
    
    return text

## Paragraph Placeholder

In [19]:
def get_paragraph_placeholders(json_file):
     with open(json_file, 'r', encoding='utf-8') as f:
         data = json.load(f)
        
    prunedData = []
    for para in data['rawData']:
        prunedPara = band_placeholder(data, para)
        prunedPara = year_placeholder(prunedPara)
        prunedPara = month_placeholder(prunedPara)
        prunedPara = person_name_placeholder(data, prunedPara)
        try:
            prunedPara = genre_placeholder(prunedPara)
        except:
            pass
        prunedData.append(prunedPara)
    return prunedData

In [20]:
x = get_paragraph_placeholder('data/4.json')

size of data:  3
['[BAND_NAME] was', 'short-lived band from Seattle, Washington formed in [YEAR] that has been described as sludge metal or grunge. Rock journalist [PERSON_NAME_FULL_0_MALE] writing in Allmusic describes the band\'s music as "rough and ready, sludgy guitar rock with', 'bad attitude".\n']
short-lived
["The band's first release was", 'single, [YEAR]\'s "Two Way Street"/"Six Foot Under", on Sub Pop. Both Mudhoney and Nirvana performed their first shows in Seattle opening for [BAND_NAME] at the Vogue in [YEAR].\n']
single, [YEAR]'s "Two Way Street"/"Six Foot Under", on Sub Pop. Both Mudhoney and Nirvana performed their first shows in Seattle opening for [BAND_NAME] at the Vogue in [YEAR].

["In [YEAR] Sub Pop released the band's first and only album,", 'five-track EP called Primal Rock Therapy which, despite being now recognised as', 'milestone record of that time and place, was at the time panned by the critics and ignored by the public. This contributed to the split of th

In [24]:
x[2]

"In [YEAR] Sub Pop released the band's first and only album, a [GENRE] a milestone record of that time and place, was at the time panned by the critics and ignored by the public. This contributed to the split of the band in [YEAR] after a North American tour with the French band Les Thugs.  Both bands played in San Francisco in [MONTH] [YEAR].\n"

In [13]:
for i in range(1657):
    try:
        get_paragraph_placeholder('data/' + str(i) + '.json')
    except:
        print(i)

 is cinematic in scope with soothing soundscapes of atmospheric, ambient, and shimmering chimes interspersed with crashing interludes...This isn\'t average post rock that drags and bores the listener, this is tight and well thought out songs that burst with color and expand with repeated listens." Their most recent release is', 'remix EP called Translations.\n']
American Indie rock
['[BAND_NAME] were', 'American synthpunk band, founded in [PERSON_NAME_FULL_0_FEMALE] in [YEAR] and active until [YEAR]. They were one of America\'s earliest electronic new wave bands, and have been cited (along with The Screamers and Suicide) as pioneers of synthpunk, also retrospectively known as "electropunk". [BAND_NAME] were notable for their use of synthesizers in place of guitars, and multimedia performances featuring multiple projections of satirical, instructional films critical of conformity and consumerism.\n[BAND_NAME] was one of the most popular bands of the [PERSON_NAME_FULL_0_FEMALE] punk and 