In [1]:
!pip install gender-guesser
import json
import re
import nltk
from nltk.corpus import wordnet
import calendar
import gender_guesser.detector as gender
import random

You should consider upgrading via the 'c:\users\movvam\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.




In [3]:

with open("data/consolidatedData.json", 'r', encoding='utf-8') as f:
    consolidatedData = json.load(f)

p = random.choice(consolidatedData['allPrunedParaComplete'])

In [15]:

a = ['p1','p2','p3']
b = p[:2]
dict(zip(a, b))

{'p1': "[BAND_NAME] is a [GENRE] band formed in [MONTH] [YEAR]. After Ellegarden decided to go into hiatus in [YEAR], the group's guitarist [PERSON_NAME_FULL_0_MALE] created [BAND_NAME] as his solo project. Their debut album Parallel Lives was released on [MONTH] 6, [YEAR], debuting at No.\xa011 on the Japanese Oricon weekly album charts.\n",
 'p2': 'In [YEAR], the popular pop punk group Ellegarden announced that they would enter an indefinite hiatus. Guitarist [PERSON_NAME_FULL_0_MALE] used the break as an opportunity to start a new band called Nothing\'s Carved In Stone. He quickly recruited [PERSON_NAME_FULL_2_MALE] of Straightener to be his new bassist and Takanori "Oniy" Ohkita from FULLARMOR to join on drums, and they began holding sessions together. However, the band struggled to find a vocalist for nearly half a year. They eventually settled on [PERSON_NAME_FULL_1_MALE], the singer of an indie rock band called ABSTRACT MASH, who [PERSON_NAME_LAST_0] discovered through his band\

## Band Name Placeholder

In [3]:
def band_placeholder(data, texts):
    band_name = data['Name']
    if '(' in band_name: # some bands have disambiguation in the title such as "Mother Earth (American band)"
        band_name = ' '.join(band_name.split(' (')[:-1])
        
    return_texts = []
    for text in texts:
        text = re.sub(band_name, '[BAND_NAME]', text)
        return_texts.append(text)
        
    return return_texts

## Year Placeholder

In [5]:
def year_placeholder(texts):
    return_texts = []
    for text in texts:
        text = re.sub("[0-9]{4}", '[YEAR]', text)
        return_texts.append(text)
    return return_texts

## Month Placeholder

In [6]:
def month_placeholder(texts):
    return_texts = []
    for text in texts:
        months = [calendar.month_name[i] for i in range(1,13)] + [calendar.month_abbr[i] for i in range(1,13)]
        for month in months:
            text = re.sub(month, '[MONTH]', text)
        return_texts.append(text)
    return return_texts

## Name Placeholder
https://stackoverflow.com/questions/20290870/improving-the-extraction-of-human-names-with-nltk
Shivansh bhandari's answer

In [7]:
def get_person_names(data, text):
    person_list = []
    person_names=person_list
    def get_human_names(text):
        tokens = nltk.tokenize.word_tokenize(text)
        pos = nltk.pos_tag(tokens)
        sentt = nltk.ne_chunk(pos, binary = False)

        person = []
        name = ""
        for subtree in sentt.subtrees(filter=lambda t: t.label() == 'PERSON'):
            for leaf in subtree.leaves():
                person.append(leaf[0])
            if len(person) > 1: #avoid grabbing lone surnames
                for part in person:
                    name += part + ' '
                if name[:-1] not in person_list:
                    person_list.append(name[:-1])
                name = ''
            person = []

    names_ = get_human_names(text)
    for person in person_list:
        person_split = person.split(" ")
        for name in person_split:
            if wordnet.synsets(name):
                if(name in person):
                    person_names.remove(person)
                    break
                    
    # add member names from infobox
    if 'infobox' in data.keys():
        members = []
        if 'Members' in data['infobox'].keys():
            members += data['infobox']['Members']

        if 'Past members' in data['infobox'].keys():
            members += data['infobox']['Past members']
            
        # some members have extra information that needs to be removed such as 'Earl Yager - bass'
        for member in members:
            member = member.split()
            if len(member) > 1:
                member = member[0] + member[1]
            else:
                member = member[0]

            person_names.append(member)
            
    person_names = set(person_names)
    return person_names

In [8]:
def get_name_genders(person_names):
    name_genders = {}
    d = gender.Detector()
    for name in person_names:
        g = d.get_gender(name.split()[0])
        if 'female' in g:
            name_genders[name] = 'FEMALE'
        elif 'male' in g:
            name_genders[name] = 'MALE'
        elif g == 'andy':
            name_genders[name] = random.choice(['MALE', 'FEMALE'])
        else:
            continue # unknown name
            
    return name_genders

In [9]:
def person_name_placeholder(data, texts):
    person_names = get_person_names(data, ''.join(texts))
    name_genders = get_name_genders(person_names)
    i = 0
    for name,gender in name_genders.items():
        return_texts = []
        for text in texts:
            text = re.sub(name, '[PERSON_NAME_FULL_' + str(i) + '_' + gender + ']', text)
            if len(name.split()) > 1: # some only have first names
                first = name.split()[0]
                last = name.split()[-1]
                # don't want abbreviations to be subsituted
                if len(first) > 2:
                    text = re.sub(first, '[PERSON_NAME_FIRST_' + str(i) + ']', text) # replace first name

                if len(last) > 2:
                    text = re.sub(last, '[PERSON_NAME_LAST_' + str(i) + ']', text) # replace last name
            return_texts.append(text)
        texts = return_texts
        i += 1
        
    return texts

## Genre Placeholder

### Get all genres

In [10]:
with open('data/consolidatedData.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
    
genres = set()
for genre in data['allGenres']:
    if isinstance(genre, str):
        g = re.split(',|\[', genre)[0]
        genres.add(g)
        
genres = list(genres) # need to convert to list to be able to take a random choice from it

In [11]:
def genre_placeholder(texts):
    return_texts = []
    for text in texts:
        part2 = re.split(' a | an ', text)
        genre = None
        if len(part2) > 1:
            genre = re.split(' band', part2[1])[0]
        if genre:
            text = re.sub(genre, '[GENRE]', text)

        return_texts.append(text)

    return return_texts

## Paragraph Placeholder

In [18]:
def get_paragraph_placeholders(json_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        
    texts = data['rawData']
    texts = band_placeholder(data, texts)
    texts = year_placeholder(texts)
    texts = month_placeholder(texts)
    texts = person_name_placeholder(data, texts)
    try:
        texts = genre_placeholder(texts)
    except:
        pass
    #prunedData = []
    #for para in data['rawData']:
    #    prunedPara = band_placeholder(data, para)
    #    prunedPara = year_placeholder(prunedPara)
    #    prunedPara = month_placeholder(prunedPara)
    #    prunedPara = person_name_placeholder(data, prunedPara)
    #    try:
    #        prunedPara = genre_placeholder(prunedPara)
    #    except:
    #        pass
    #    prunedData.append(prunedPara)
    return texts

In [19]:
x = get_paragraph_placeholders('data/452.json')


["\n[BAND_NAME] was a [PERSON_NAME_FIRST_8]tian rock band that recorded during the early [YEAR]s.  The band's name is an acronym for the band's founder and lead singer, [PERSON_NAME_FIRST_1] [PERSON_NAME_FULL_7_MALE].\n", '[PERSON_NAME_FIRST_1] "[PERSON_NAME_FIRST_1]ny" [PERSON_NAME_FULL_7_MALE], Jr. ([MONTH] 11, [YEAR] – [MONTH] 5, [YEAR]) was born in Houston, Texas to [PERSON_NAME_FIRST_1] A. "Oscar" [PERSON_NAME_LAST_7], Sr. and [PERSON_NAME_FULL_0_FEMALE]. (Gray) [PERSON_NAME_LAST_7], one of three children ([PERSON_NAME_FIRST_9] and Wendy, his two siblings).  [PERSON_NAME_LAST_7] started playing drums at age 10 and taught himself to play guitar and piano. Graduating in [YEAR] from Channelview High School in Harris County, Texas, [PERSON_NAME_FIRST_1] attended Texas A&M University. He married his wife, Ricka Cox around [YEAR], with whom he had two children, [PERSON_NAME_FULL_1_MALE] and Annie Grace.\n', "Building on his passion for music, [PERSON_NAME_LAST_7] formed [BAND_NAME] in t

In [None]:
with open('data/consolidatedData.json', 'r', encoding='utf-8') as inf:
    data = json.load(inf)
    
complete_paras = []    
for i in range(1657):
    para_placeholder = None
    try:
        complete_paras.append(get_paragraph_placeholder('data/' + str(i) + '.json'))
    except:
        print(i)

data['allRawParaComplete'] = complete_paras

with open('data/consolidatedData.json', 'w', encoding='utf-8') as inf:
    json.dump(inf, data)