In [42]:
import csv
import json
import glob
import numpy as np
import pandas as pd
import re
import requests
import string

from bs4 import BeautifulSoup
from pathlib import Path

# Constructing syllable data

In [54]:
def get_syllables(word):
    '''
    Returns a word separated into syllables from www.howmanysyllables.com
    '''
    URL = "https://www.howmanysyllables.com/syllables/" + word
    page = requests.get(URL)

    soup = BeautifulSoup(page.content, "html.parser")
    syllable_content = soup.find(id="SyllableContentContainer")
    syllabified_w = syllable_content.find("span", class_="Answer_Red").text
    
    return syllabified_w

def get_syllable_stress(syllables, cmu_entry):
    '''
    Returns a list of tuples of the form (syllable, stress#).
    This represents the stress pattern of the given word.
    '''
    syl_stress = []
    for i in range(len(cmu_entry)):
        stress = re.sub('[^0-9]','', cmu_entry[i])
        syl_stress.append((syllables[i], stress))
    return syl_stress

In [23]:
cmu_comb = 'K AH2 M - B IH0 N - AH0 - B IH1 - L AH0 - T IY0'
syl_comb = get_syllables('combinability').split('-')

In [20]:
syl_comb.split('-')

['com', 'bin', 'a', 'bil', 'i', 'ty']

In [29]:
cmu = cmu_comb.split('-')
syl_stress = []

for s in range(len(cmu)):
    # delete everything except stress number, zip with syl_comb
    print(cmu[s])
    stress = re.sub('[^0-9]','', cmu[s])
    syl_stress.append((syl_comb[s], stress))
    
syl_stress

K AH2 M 
 B IH0 N 
 AH0 
 B IH1 
 L AH0 
 T IY0


[('com', '2'), ('bin', '0'), ('a', '0'), ('bil', '1'), ('i', '0'), ('ty', '0')]

In [42]:
# load cmudict
with open('cmudict.rep') as f:
    lines = f.readlines()

# now exclude any strings containing punctuation besides -, 
exclude = set(string.punctuation.replace('-', ''))

# filter and clean strings
cmu_dict = [s for s in lines if not any(c in s for c in exclude)]
cmu_cleaned = []
for entry in cmu_dict:
    entry = entry.replace('\n', '')
    e = entry.split('  ')
    cmu_cleaned.append((e[0], e[1]))

In [78]:
def process_data(batch, filename):
    data = []
    with open(filename, 'a') as f:
        for i in range(len(batch)):
            print(str(i) + "/" + str(len(batch)))
            entry = batch[i]
            word = entry[0]
            cmu = entry[1].split('-')
            try:
                syllables = get_syllables(word).split('-')
                if len(syllables) == len(cmu):
                    stress_pattern = get_syllable_stress(syllables, cmu)
                    info = {'word': word, 
                            'num_syllables': str(len(syllables)), 
                            'syllables': syllables, 
                            'stress_pattern': stress_pattern}
                    print(info)
                    data.append(info)
                    f.write(json.dumps(info) + '\n')
            except AttributeError:
                pass
        return data


# Reading syllable data

In [9]:
# reading the data from the files and concatenating into a single df

folder_path = 'syllable_data/'
file_list = glob.glob(folder_path + '*.txt')
data = []

for file in file_list:
    with open(file, 'r') as f:
        for line in f:
            d = json.loads(line.strip())
            data.append(d)
            
df = pd.DataFrame(data)

In [10]:
df['num_syllables'] = pd.to_numeric(df['num_syllables'])

In [13]:
df.to_csv('all_syllable_data.csv')

# Generating infix data

In [356]:
def insert_diddly(df_row):
    syllables = df_row['syllables']
    stress_pattern = df_row['stress_pattern']

    # bisyllabic w/ no reduplication
    if is_bisyllabic(df_row) and not has_initial_stress(df_row):
        return syllables[0] + '-diddly-' + syllables[1]
    # bisyllabic w/ reduplication 
    elif is_bisyllabic(df_row) and has_initial_stress(df_row):
        return diddly_reduplication(syllables, stress_pattern)
        
def diddly_reduplication(syllables, stress_pattern):
    consonants = 'bcdfghjklmnpqrstvwxz'
    vowels = 'aeiouy'
    new_syllables = []
    prefix = ''
    nuc = ''
    primary_stress = False # prevents double infixation for bisyllabic entries with both syllables labeled '1'
    
    for s in stress_pattern:
        syllable = s[0].lower()
        
        if s[1] == '1' and primary_stress == False: 
            primary_stress = True
            # reduplicate entire syllable if starts with vowel
            if syllable.startswith(tuple(vowels)):
                new_syllables.extend([syllable, '-diddly-', syllable])
            else:
                # otherwise reduplicate just the nucleus/coda
                for i in range(len(syllable)):
                    if syllable[i] in consonants:
                        prefix += syllable[i]
                    else:
                        nuc += syllable[i:len(syllable)]
                        break
                new_syllables.extend([prefix + nuc, '-diddly-', nuc])
        else:
            new_syllables.append(syllable)
    
    return ''.join(new_syllables)       
        
def insert_expletive(df_row):
    syllables = df_row['syllables'].copy()
    stress = df_row['stress_pattern'].copy()

    for i in range(len(syllables)):
        if stress[i][1] == '1':
            syllables.insert(i, '-fuckin-')
    return ''.join(syllables)


def insert_iz(df_row):
    word = df_row['word'].lower()
    syllables = df_row['syllables'].copy()
    stress = df_row['stress_pattern'].copy()
    
    if df_row['num_syllables'] == 1:
        # insert between nucleus and onset
        return btwn_nucleus_onset(word, '-iz-')

    elif df_row['num_syllables'] == 2:
        # insert to left of stressed vowel 
        for i in range(len(syllables)):
            if stress[i][1] == '1': 
                syllables[i] = btwn_nucleus_onset(syllables[i], '-iz-')
                print(word, syllables)
        if None in syllables:
            pass
        else:
            return ''.join(syllables)
    
            
def btwn_nucleus_onset(syllable, infix):
    consonants = 'bcdfghjklmnpqrstvwxz'
    vowels = 'aeiouy'
    prefix = ''
    for i in range(len(syllable)):
        if syllable[i] in consonants:
            prefix += syllable[i]
        elif syllable[i] in vowels:
            return syllable[:i] + infix + syllable[i:]

def has_initial_stress(df_row):
    stress = df_row['stress_pattern']
    if stress[0][1] == '1':
        return True
    else:
        return False
    
def is_monosyllabic(df_row):
    if df_row['num_syllables'] == 1:
        return True
    else:
        return False
    
def is_bisyllabic(df_row):
    if df_row['num_syllables'] == 2:
        return True
    else:
        return False       

In [99]:
'''
finding data that's good for expletive infixation:
- minimum bisyllabic
-- bisyllabic: no primary stress on first syllable
-- trisyllabic: primary stress is internal
'''
expletive_data = df.loc[df['num_syllables']>1]
# exclude everything where primary stress is on first syllable
expletive_data = expletive_data[expletive_data['stress_pattern'].apply(lambda x: x[0][1] != '1')] 
expletive_data['infixed'] = expletive_data.apply(insert_expletive, axis=1)
expletive_data.to_csv('expletive_data.csv')

In [198]:
'''
finding data that's good for iz infixation:
- btwn onset and nucleus of monosyllabic word
-- to the left of a stressed vowel in bisyllabic words
more than that isn't included bc rare
'''
iz_data = df.loc[df['num_syllables']<3]
iz_data['infixed'] = iz_data.apply(insert_iz, axis=1)
iz_data.to_csv('iz_data.csv')

tiber ['T-iz-i', 'ber']
tibet ['Ti', 'b-iz-et']
ticker ['t-iz-ick', 'er']
ticket ['t-iz-ick', 'et']
tickets ['t-iz-ick', 'ets']
ticking ['t-iz-ick', 'ing']
tickle ['t-iz-ick', 'le']
tickled ['t-iz-ick', 'led']
ticknor ['T-iz-ick', 'nor']
tidal ['t-iz-id', 'al']
tidbit ['t-iz-id', 'bit']
tiding ['t-iz-id', 'ing']
tidings ['t-iz-i', 'dings']
tidy ['t-iz-i', 'dy']
tiffin ['T-iz-if', 'fin']
tiger ['t-iz-i', 'ger']
tigers ['t-iz-i', 'gers']
tighten ['t-iz-ight', 'en']
tighter ['t-iz-ight', 'er']
tightest ['t-iz-ight', 'est']
tightly ['t-iz-ight', 'ly']
tightness ['t-iz-ight', 'ness']
tightrope ['t-iz-ight', 'rope']
tightwad ['t-iz-ight', 'wad']
tigon ['t-iz-i', 'gon']
tigris ['T-iz-i', 'gris']
tilden ['T-iz-il', 'den']
tiller ['t-iz-ill', 'er']
tilley ['t-iz-il', 'ley']
tillman ['t-iz-ill', 'man']
tilly ['T-iz-il', 'ly']
tilted ['t-iz-ilt', 'ed']
tilting ['t-iz-ilt', 'ing']
timber ['t-iz-im', 'ber']
timbre ['t-iz-im', 'bre']
timeless ['t-iz-ime', 'less']
timeline ['t-iz-ime', 'line']
timeli

sedum ['s-iz-e', 'dum']
seedling ['s-iz-eed', 'ling']
seedpod ['s-iz-eed', 'pod']
seedsman ['s-iz-eeds', 'man']
seedy ['s-iz-eed', 'y']
seeger ['S-iz-ee', 'ger']
lamely ['l-iz-ame', 'ly']
lament ['la', 'm-iz-ent']
lamer ['l-iz-am', 'er']
lamping ['l-iz-amp', 'ing']
lampoon ['lam', 'p-iz-oon']
lamppost ['l-iz-amp', 'post']
lamprey ['l-iz-am', 'prey']
lampreys ['l-iz-am', 'preys']
lampron ['l-iz-am', 'pron']
lancelet ['l-iz-ance', 'let']
lancer ['l-iz-anc', 'er']
lancers ['l-iz-anc', 'ers']
lancet ['l-iz-an', 'cet']
lancing ['l-iz-anc', 'ing']
landau ['l-iz-an', 'dau']
landed ['l-iz-and', 'ed']
landen ['L-iz-an', 'den']
lander ['l-iz-and', 'er']
landfall ['l-iz-and', 'fall']
landfill ['l-iz-and', 'fill']
landform ['l-iz-and', 'form']
landgrave ['l-iz-and', 'grave']
landing ['l-iz-and', 'ing']
landless ['l-iz-and', 'less']
landlock ['l-iz-and', 'lock']
landlocked ['l-iz-and', 'locked']
landlord ['l-iz-and', 'lord']
landman ['l-iz-and', 'man']
landmark ['l-iz-and', 'mark']
landmass ['l-iz-

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iz_data['infixed'] = iz_data.apply(insert_iz, axis=1)


In [355]:
'''
finding data good for diddly infixation
reduplication on bisyllabic initial stress 
no reduplication on bisyllabic non-initial stress
exclude monosyllabic because coda is contested
'''

# bisyllabic + initial stress -- RED
bisyll = df[df.apply(is_bisyllabic, axis=1)]
initial_s = df[df.apply(has_initial_stress, axis=1)]
bisyll_init = pd.merge(bisyll, initial_s, how='inner', on=['word'], suffixes=('', '_remove'))
bisyll_init.drop([i for i in bisyll_init.columns if 'remove' in i], axis=1, inplace=True)

# bisyllabic + non-initial stress -- NO RED
no_initial_s = df[~df.apply(has_initial_stress, axis=1)]
bisyll_no_init = pd.merge(bisyll, no_initial_s, how='inner', on=['word'], suffixes=('', '_remove'))
bisyll_no_init.drop([i for i in bisyll_no_init.columns if 'remove' in i], axis=1, inplace=True)

diddly_data = pd.concat([bisyll_init, bisyll_no_init])
diddly_data['infixed'] = diddly_data.apply(insert_diddly, axis=1)
diddly_data.to_csv('diddly_data.csv')

# Creating subsets of data

In [23]:
expletive_df = pd.read_csv('infix_dataset/expletive_data.csv')
iz_df = pd.read_csv('infix_dataset/iz_data.csv')
diddly_df = pd.read_csv('infix_dataset/diddly_data.csv')

In [32]:
pd.set_option('display.max_rows', None)

In [41]:
expletive_df.sample(n=200)

Unnamed: 0,word,num_syllables,syllables,stress_pattern,infixed
6329,SEDATED,3.0,"['se', 'dat', 'ed']","[['se', '0'], ['dat', '1'], ['ed', '0']]",se-fuckin-dated
543,AMUSED,2.0,"['a', 'mused']","[['a', '0'], ['mused', '1']]",a-fuckin-mused
1098,CONVERGE,2.0,"['con', 'verge']","[['con', '0'], ['verge', '1']]",con-fuckin-verge
6957,INTRASTATE,3.0,"['in', 'tra', 'state']","[['in', '2'], ['tra', '0'], ['state', '1']]",intra-fuckin-state
10690,HYPOTENSION,4.0,"['hy', 'po', 'ten', 'sion']","[['hy', '2'], ['po', '0'], ['ten', '1'], ['sio...",hypo-fuckin-tension
3771,BELITTLE,3.0,"['be', 'lit', 'tle']","[['be', '0'], ['lit', '1'], ['tle', '0']]",be-fuckin-little
13379,EDITORIALLY,6.0,"['ed', 'i', 'to', 'ri', 'al', 'ly']","[['ed', '2'], ['i', '0'], ['to', '1'], ['ri', ...",edi-fuckin-torially
1726,PANACHE,2.0,"['pa', 'nache']","[['pa', '0'], ['nache', '1']]",pa-fuckin-nache
8284,GELATINOUS,4.0,"['ge', 'lat', 'i', 'nous']","[['ge', '0'], ['lat', '1'], ['i', '0'], ['nous...",ge-fuckin-latinous
11769,COMMUNICATOR,5.0,"['com', 'mu', 'ni', 'ca', 'tor']","[['com', '0'], ['mu', '1'], ['ni', '0'], ['ca'...",com-fuckin-municator


In [44]:
indices = []

with open('expletive_indices.csv', 'r') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        indices.append(int(row[0]))

expletive_subset = expletive_df.iloc[indices]

In [54]:
es_df = expletive_subset[['word', 'infixed']]
es_df['word'] = es_df['word'].str.lower()
es_df['infixed'] = es_df['infixed'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  es_df['word'] = es_df['word'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  es_df['infixed'] = es_df['infixed'].str.lower()


In [55]:
es_df.to_csv('200_expletive_examples.csv', index=False, header=False)