In [107]:
import json
import re
from datetime import datetime
import random
import nltk
import pickle
from nameparser.parser import HumanName
from nltk.corpus import wordnet
import names
import gender_guesser.detector as gender
import calendar

## Replace Band Name

In [16]:
with open('titles', 'rb') as inf:
    titles = pickle.load(inf)
    
determiners = []
nouns = []
adjectives = []
for title in titles:
    tagged_title = nltk.pos_tag(nltk.word_tokenize(title.lower()))
    for tagged_word in tagged_title:
        word = tagged_word[0]
        pos = tagged_word[1]
        if pos == 'DT':
            determiners.append(word)
        elif pos == 'NN' or pos == 'NNS':
            nouns.append(word.capitalize())
        elif pos == 'JJ':
            adjectives.append(word.capitalize())

In [34]:
def replace_band_name(text):
    def getRandName():
        determiner = random.choice(determiners).capitalize()
        [adjective1, adjective2] = random.sample(adjectives, 2)
        [noun1, noun2] = random.sample(nouns, 2)

        title_format = random.randrange(3)
        if title_format == 0:
            return determiner + ' ' + adjective1 + ' ' + noun1
        elif title_format == 1:
            return determiner + ' ' + adjective1 + ' ' + adjective2 + ' ' + noun1
        elif title_format == 2:
            return determiner + ' ' + noun1 + ' and ' + determiner + ' ' + noun2
        
    new_name = getRandName()
    text = re.sub('\[BAND_NAME\]', new_name, text)
    return text

## Replace Years

In [83]:
def replace_years(text):
    num_years = text.count('[YEAR]')
    first_year = datetime.now().year - (5 * num_years)
    years = [first_year]
    for i in range(1, num_years):
        years.append(years[i-1] + random.randint(0, 5))
        
    j = -1
    def get_year(matchobj):
        nonlocal j
        j += 1
        return years[j]

    return re.sub("\[YEAR\]", lambda x: str(get_year(x)), text)

## Replace Months

In [104]:
def replace_months(text):
    months = [calendar.month_name[i] for i in range(1,13)] + [calendar.month_abbr[i] for i in range(1,13)]
    return re.sub('\[MONTH\]', lambda x: random.choice(months), text)

## Replace Names (doesn't work currently)
https://stackoverflow.com/questions/20290870/improving-the-extraction-of-human-names-with-nltk
Shivansh bhandari's answer

In [23]:
person_list = []
person_names=person_list
def get_human_names(text):
    tokens = nltk.tokenize.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    sentt = nltk.ne_chunk(pos, binary = False)

    person = []
    name = ""
    for subtree in sentt.subtrees(filter=lambda t: t.label() == 'PERSON'):
        for leaf in subtree.leaves():
            person.append(leaf[0])
        if len(person) > 1: #avoid grabbing lone surnames
            for part in person:
                name += part + ' '
            if name[:-1] not in person_list:
                person_list.append(name[:-1])
            name = ''
        person = []
#     print (person_list)

names_ = get_human_names(text)
for person in person_list:
    person_split = person.split(" ")
    for name in person_split:
        if wordnet.synsets(name):
            if(name in person):
                person_names.remove(person)
                break

print(person_names)

['Stefanie Sargent', 'Valerie Agnew', 'Ben London', "Sick 'Em", 'Roisin Dunne', 'Home Alive', 'Miami Beach']


TODO: function to replace names with autogen names
maybe create dict that matches old to new
also need to replace last names as well as first names (maybe first names too?)
i think can replace all full names, then can look for last or first because shouldn't be an issue that we already replaced some

In [24]:
name_genders = {}
d = gender.Detector()
for name in person_names:
    g = d.get_gender(name.split()[0])
    print(name, g)
    if g == 'male' or g == 'female':
        name_genders[name] = g
name_genders

Stefanie Sargent female
Valerie Agnew female
Ben London male
Sick 'Em unknown
Roisin Dunne female
Home Alive unknown
Miami Beach unknown


{'Stefanie Sargent': 'female',
 'Valerie Agnew': 'female',
 'Ben London': 'male',
 'Roisin Dunne': 'female'}

In [25]:
name_replacements = {}
for k,v in name_genders.items():
    name_replacements[k] = names.get_full_name(gender=v)
name_replacements

{'Stefanie Sargent': 'Danielle Grady',
 'Valerie Agnew': 'Lucy Nelson',
 'Ben London': 'David Gambino',
 'Roisin Dunne': 'Emily Dancy'}

In [26]:
for k,v in name_replacements.items():
    text = re.sub(k.split()[0], v.split()[0], text) # replace first name
    text = re.sub(k.split()[1], v.split()[1], text) # replace last name

## Replace Genre

### Get all Genres

In [84]:
with open('data/consolidatedData.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
    
genres = set()
for genre in data['allGenres']:
    if isinstance(genre, str):
        g = re.split(',|\[', genre)[0]
        genres.add(g)
        
genres = list(genres) # need to convert to list to be able to take a random choice from it

In [86]:
def replace_genre(text):
    return re.sub('\[GENRE\]', random.choice(genres), text)

## Replace All Placeholders

In [119]:
with open('data/consolidatedData.json', 'r', encoding='utf-8') as inf:
    paras = json.load(inf)

#with open('data/0.json', 'r', encoding='utf-8') as inf:
#    data = json.load(inf)
    
para = paras['allPrunedParaComplete'][1]
para += '[MONTH]'
print(para)
para = replace_band_name(para)
para = replace_years(para)
para = replace_months(para)
para = replace_genre(para)
print(para)

[BAND_NAME] is the third studio EP by the American rock band Alice in Chains, released on [MONTH] 25, [YEAR], through Columbia Records. This is Alice in Chains' second acoustic EP, preceded by [YEAR]'s Sap, and it is the first EP in music history to debut at No. 1 on the Billboard 200 chart, with the first week sales exceeding 141,000 copies in the United States. The self-produced EP was written and recorded over the course of just one week at the London Bridge Studio in Seattle. The tracks "No Excuses", "I Stay Away" and "Don't Follow" were released as singles to promote the album. [BAND_NAME] was nominated for two Grammy Awards in [YEAR]; Best Recording Package and Best Hard Rock Performance for "I Stay Away".
The EP was well received by critics and has been certified triple-platinum by the RIAA, selling 4 million copies worldwide, making [BAND_NAME] one of the band's most successful releases. In Canada, [BAND_NAME] was certified double-platinum for the sale of 200,000 copies. In Gre