In [1]:
import pandas as pd 
import zipfile
import plotly.graph_objects as go
import plotly.express as px
import pickle as pkl
import spacy
from nltk.corpus import wordnet
from word_forms.word_forms import get_word_forms
import collections
nlp = spacy.load("en_core_web_sm")

## MetaData

In [35]:
metadata = pd.read_json("../data/gutenberg-dammit-files/gutenberg-metadata.json")
metadata.columns

Index(['Author', 'Author Birth', 'Author Death', 'Author Given',
       'Author Surname', 'Copyright Status', 'Language', 'LoC Class', 'Num',
       'Subject', 'Title', 'charset', 'gd-num-padded', 'gd-path', 'href'],
      dtype='object')

In [36]:
# top authors
unique_authors = {}
def count_authors(author_list):
    if not isinstance(author_list, list):
        author_list = ['None Available']
    for val in author_list:
        if val in unique_authors:
            unique_authors[val] +=1 
        else: 
            unique_authors[val] = 1
    return len(author_list)
            
metadata['Number of Authors'] = metadata.apply(lambda row : count_authors(row['Author']), axis = 1)
top_20_authors = sorted(unique_authors.items(), key=lambda x: x[1], reverse = True)[:20]
top_20_authors =  [list(x) for x in top_20_authors]
top_20_authors_names = [x[0] for x in top_20_authors]
top_20_authors_counts = [x[1] for x in top_20_authors]

fig = go.Figure(data=[go.Table(header=dict(values=['Author', 'Count']),
                 cells=dict(values=[top_20_authors_names,top_20_authors_counts] ))
                     ])
fig.show()

In [37]:
# top genres
unique_genres = {}

def count_genres(genre_list):
    if not isinstance(genre_list, list):
        genre_list = ['None Available']
    for val in genre_list:
        if val in unique_genres:
            unique_genres[val] +=1 
        else: 
            unique_genres[val] = 1
    return len(genre_list)

metadata['Number of Genres'] = metadata.apply(lambda row : count_genres(row['Subject']), axis = 1)
top_20_genres = sorted(unique_genres.items(), key=lambda x: x[1], reverse = True)[:20]
top_20_genres =  [list(x) for x in top_20_genres]
top_20_genre_name = [x[0] for x in top_20_genres]
top_20_genre_counts = [x[1] for x in top_20_genres]

fig = go.Figure(data=[go.Table(header=dict(values=['Genre', 'Count']),
                 cells=dict(values=[top_20_genre_name,top_20_genre_counts] ))
                     ])
fig.show()

In [38]:
# entire corpus stats 
print('Total number of books: ' + str(metadata.shape[0]))
print('Books without a subject: ' + str(metadata[metadata['Subject'].isnull()].shape[0]))
print('Books with English language: ' + str(metadata.loc[metadata['Language'].isin([['English']])].shape[0]))
print('Unique authors: ' + str(len(unique_authors)))

Total number of books: 50729
Books without a subject: 8535
Books with English language: 41485
Unique authors: 18462


In [39]:
# narrow down corpus metadata to English, fiction, valid author birth 
metadata['is_english_fiction'] = metadata.apply(lambda row : True if isinstance(row['Subject'], list) and (any('fiction' in subject for subject in row['Subject']) or any('Fiction' in subject for subject in row['Subject'])) and (row['Language'] == ['English']) and (isinstance(row['Author Birth'], list) and None not in row['Author Birth'] and row['Author Birth'] != ['?'] and row['Author Birth'] != ['?', '?'] and len(row['Author Birth']) != 0) else False, axis = 1)
english_fiction_metadata = metadata.loc[metadata['is_english_fiction'] == True]
english_fiction_metadata['original_path'] = '../data/gutenberg-dammit-files/' + english_fiction_metadata['gd-path']
english_fiction_metadata['tokenized_path'] = '../data/gutenberg-dammit-files/' + english_fiction_metadata['gd-path'].str[:-4] + '_tagged.pickle' 
english_fiction_metadata['cw_df_path'] = '../data/gutenberg-dammit-files/' + english_fiction_metadata['gd-path'].str[:-4] + '_cw_df.pickle'
english_fiction_metadata.shape



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(7588, 21)

In [40]:
# extract author birth century and save
def find_century(row):
    birth_years = row['Author Birth']
    century_list = []
    for year in birth_years:
        if year == '?':
            continue
        else:
            century = (int(year)) // 100
        if century < 15:
            century =  str('Before 15')
        if century not in century_list:
            century_list.append(century)
    return str(century_list[0]) + '00'
        
english_fiction_metadata['Author Birth Century'] = english_fiction_metadata.apply(lambda row: find_century(row), axis = 1)
english_fiction_metadata = english_fiction_metadata.sort_values(by=['Author Birth Century'])
english_fiction_metadata.shape



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(7588, 22)

In [41]:
# plot out corpus by century 
fig = px.histogram(data_frame = english_fiction_metadata, x="Author Birth Century", title='Distribution of Author Birth Century').update_layout(yaxis_title="Frequency in Corpus")
fig.show()

In [42]:
english_fiction_metadata = english_fiction_metadata.sort_values(by=['gd-path'])
english_fiction_metadata.to_csv('../data/english_fiction_metadata.csv', index = False)

In [43]:
# exclude texts outside 1700-1900 range 
english_fiction_metadata = english_fiction_metadata.loc[english_fiction_metadata['Author Birth Century'].isin(['1700', '1800', '1900'])]

In [44]:
# top authors- english_fiction_metadata
unique_authors = {}
def count_authors(author_list):
    if not isinstance(author_list, list):
        author_list = ['None Available']
    for val in author_list:
        if val in unique_authors:
            unique_authors[val] +=1 
        else: 
            unique_authors[val] = 1
    return len(author_list)
            
english_fiction_metadata['Number of Authors'] = english_fiction_metadata.apply(lambda row : count_authors(row['Author']), axis = 1)
top_20_authors = sorted(unique_authors.items(), key=lambda x: x[1], reverse = True)[:20]
top_20_authors =  [list(x) for x in top_20_authors]
top_20_authors_names = [x[0] for x in top_20_authors]
top_20_authors_counts = [x[1] for x in top_20_authors]

fig = go.Figure(data=[go.Table(header=dict(values=['Author', 'Count']),
                 cells=dict(values=[top_20_authors_names,top_20_authors_counts] ))
                     ])
fig.show()

In [45]:
# top genres - english_fiction_metadata
unique_genres = {}

def count_genres(genre_list):
    if not isinstance(genre_list, list):
        genre_list = ['None Available']
    for val in genre_list:
        if val in unique_genres:
            unique_genres[val] +=1 
        else: 
            unique_genres[val] = 1
    return len(genre_list)

english_fiction_metadata['Number of Genres'] = english_fiction_metadata.apply(lambda row : count_genres(row['Subject']), axis = 1)
top_20_genres = sorted(unique_genres.items(), key=lambda x: x[1], reverse = True)[:20]
top_20_genres =  [list(x) for x in top_20_genres]
top_20_genre_name = [x[0] for x in top_20_genres]
top_20_genre_counts = [x[1] for x in top_20_genres]

fig = go.Figure(data=[go.Table(header=dict(values=['Genre', 'Count']),
                 cells=dict(values=[top_20_genre_name,top_20_genre_counts] ))
                     ])
fig.show()

In [46]:
# entire corpus stats 
print('Total number of books: ' + str(english_fiction_metadata.shape[0]))
print('Books without a subject: ' + str(english_fiction_metadata[english_fiction_metadata['Subject'].isnull()].shape[0]))
print('Books with English as language: ' + str(english_fiction_metadata.loc[english_fiction_metadata['Language'].isin([['English']])].shape[0]))
print('Unique authors: ' + str(len(unique_authors)))

Total number of books: 7487
Books without a subject: 0
Books with English as language: 7487
Unique authors: 1956


## Seed Words

In [88]:
# create smell seed words 
# Smell smell, scent, odor, odour, perfume, fragrance, essence, inhale, aroma, olfaction

smell_seed_words = []

base_set = [synset.name() for synset in wordnet.synsets('smell') if '_' not in synset.name().split('.')[0] ] +\
           [synset.name() for synset in wordnet.synsets('scent') if '_' not in synset.name().split('.')[0]] +\
           [synset.name() for synset in wordnet.synsets('odor')  if '_' not in synset.name().split('.')[0]] +\
           [synset.name() for synset in wordnet.synsets('perfume')  if '_' not in synset.name().split('.')[0]] +\
           [synset.name() for synset in wordnet.synsets('inhale')  if '_' not in synset.name().split('.')[0]] +\
           [synset.name() for synset in wordnet.synsets('fragrance')  if '_' not in synset.name().split('.')[0]]

remove = ['spirit.n.02', 'effect.n.04', 'kernel.n.03', 'smack.v.02']

smell_seed_words = [word for word in base_set if word not in remove and word not in smell_seed_words]


hyponyms_set = [y.name() for x in smell_seed_words for y in wordnet.synset(x).hyponyms() if '_' not in y.name().split('.')[0]]

remove = ['salute.v.04']

smell_seed_words = smell_seed_words + [word for word in hyponyms_set if word not in remove and word not in smell_seed_words]


hypernyms_set = [y.name() for x in smell_seed_words for y in wordnet.synset(x).hypernyms() if '_' not in y.name().split('.')[0]]

remove = ['sensation.n.01', 'exteroception.n.01', 'modality.n.03', 'suggest.v.05', 'sensing.n.02', 'perceive.v.01', 'groom.v.03', 'sensation.n.01', 'perceive.v.02']

smell_seed_words = smell_seed_words + [word for word in hypernyms_set if word not in remove and word not in smell_seed_words]

smell_seed_words = list(set(smell_seed_words))

print(smell_seed_words)

variations_set = []
for word in smell_seed_words: 
    variations= get_word_forms(word.split('.')[0])
    for key in variations: 
        for value in variations[key]: 
            if value not in variations_set: 
                variations_set.append(value + '.' + key)
                
smell_seed_words = [word.split('.')[0] + '.' + word.split('.')[1] for word in smell_seed_words]
smell_seed_words = list(set(smell_seed_words))
smell_seed_words = smell_seed_words + [word for word in variations_set if word not in smell_seed_words and '_' not in word.split('.')[0]]

smell_seed_words = list(set(smell_seed_words))

textfile = open("../data/seed_words/smell_words_list.txt", "w")
for element in smell_seed_words:
    textfile.write(element + "\n")
textfile.close()

len(smell_seed_words)

['toiletry.n.01', 'sniff.v.01', 'snuffle.v.02', 'cense.v.01', 'smell.n.05', 'inhale.v.02', 'breathe.v.01', 'snuff.v.02', 'aspirate.v.03', 'inhale.v.01', 'sniff.v.02', 'smell.v.03', 'malodor.n.01', 'nose.n.06', 'bouquet.n.02', 'snuff.v.01', 'musk.n.02', 'smell.v.01', 'huff.v.01', 'aroma.n.02', 'odorize.v.01', 'puff.v.02', 'potpourri.n.03', 'perfume.v.02', 'smell.n.01', 'smell.v.02', 'acridity.n.01', 'patchouli.n.02', 'cologne.n.02', 'smell.n.04', 'incense.n.02', 'perfumery.n.01', 'sniff.n.01', 'reek.v.02', 'scent.n.02', 'perfume.v.01', 'perfume.n.02', 'smoke.v.01', 'scent.v.02', 'smell.v.05']


184

In [89]:
# create hear seed words 
# Hearing hear, listen, sound, loud, quiet, soft, audible, audio, voice, silence

hear_seed_words = []

base_set = [synset.name() for synset in wordnet.synsets('hear') if '_' not in synset.name().split('.')[0] ] +\
           [synset.name() for synset in wordnet.synsets('loud')  if '_' not in synset.name().split('.')[0]] +\
           [synset.name() for synset in wordnet.synsets('listen') if '_' not in synset.name().split('.')[0]] +\
           [synset.name() for synset in wordnet.synsets('voice')  if '_' not in synset.name().split('.')[0]] +\
           [synset.name() for synset in wordnet.synsets('quiet')  if '_' not in synset.name().split('.')[0]] +\
           [synset.name() for synset in wordnet.synsets('silence')  if '_' not in synset.name().split('.')[0]] 


remove = ['secrecy.n.01', 'spokesperson.n.01', 'part.n.11', 'heed.v.01', 'repose.n.03', 'tranquillity.n.01',
          'learn.v.02', 'calm.v.01', 'placid.s.01', 'phone.n.02', 'strait.n.01', 'fathom.v.02', 'healthy.s.04',
          'good.s.17', 'reasoned.s.01', 'legal.s.03', 'heavy.s.26', 'prepare.v.07', 'communication.n.02', 'means.n.01'
          'gag.v.01', 'pierce.v.03', 'play.v.07', 'play.v.13', 'ultrasound.n.01', 'unison.n.03', 'expression.n.03', 
          'catch.v.14', 'catch.v.21', 'incline.v.02', 'attend.v.05', 'incline.v.02', 'pat.n.01', 'express.v.02',
          'suppress.v.01', 'uncommunicativeness.n.01', 'condition.n.01', 'perceive.v.01', 'component.n.03', 'androglossia.n.01',
          'probe.v.01', 'concentrate.v.02', 'sensation.n.01', 'happening.n.01', 'look.v.02', 'calmness.n.02'
         ]

hear_seed_words = [word for word in base_set if word not in remove and word not in hear_seed_words]

hyponyms_set = [y.name() for x in hear_seed_words for y in wordnet.synset(x).hyponyms() if '_' not in y.name().split('.')[0]]

hear_seed_words = hear_seed_words + [word for word in hyponyms_set if word not in remove and word not in hear_seed_words]

hypernyms_set = [y.name() for x in hear_seed_words for y in wordnet.synset(x).hypernyms() if '_' not in y.name().split('.')[0]]

hear_seed_words = hear_seed_words + [word for word in hypernyms_set if word not in remove and word not in hear_seed_words]

hear_seed_words = list(set(hear_seed_words))

print(hear_seed_words)

variations_set = []
for word in hear_seed_words: 
    variations= get_word_forms(word.split('.')[0])
    for key in variations: 
        for value in variations[key]: 
            if value not in variations_set: 
                variations_set.append(value + '.' + key)
                
                
hear_seed_words = [word.split('.')[0] + '.' + word.split('.')[1] for word in hear_seed_words]
hear_seed_words = list(set(hear_seed_words))
hear_seed_words = hear_seed_words + [word for word in variations_set if word not in hear_seed_words and '_' not in word.split('.')[0]]

hear_seed_words = list(set(hear_seed_words))

textfile = open("../data/seed_words/hear_words_list.txt", "w")
for element in hear_seed_words:
    textfile.write(element + "\n")
textfile.close()

len(hear_seed_words)

['quietness.n.01', 'silence.n.01', 'voice.n.03', 'pronounce.v.01', 'listen.v.02', 'voice.n.07', 'silence.n.02', 'hushed.s.01', 'hush.n.01', 'articulation.n.03', 'chirk.v.01', 'quaver.v.01', 'gag.v.01', 'quiet.a.02', 'muteness.n.02', 'hark.v.01', 'sprechgesang.n.01', 'hear.v.01', 'voice.n.10', 'shush.v.01', 'voice.n.06', 'rehear.v.01', 'hear.v.04', 'quiet.a.06', 'quietly.r.03', 'means.n.01', 'forte.a.01', 'hear.v.03', 'quiet.s.03', 'quiet.a.01', 'hush.v.02', 'silence.v.02', 'lull.n.02', 'lull.v.02', 'loud.a.01', 'voice.v.01', 'lung-power.n.01', 'sound.n.01', 'voice.n.02', 'voice.n.05', 'speechlessness.n.01', 'listen.v.01', 'voice.n.09', 'voice.n.01', 'sound.n.04', 'singer.n.01', 'brassy.s.02', 'loudly.r.01', 'voice.v.02', 'quieten.v.01']


192

In [90]:
# create touch seed words 
# Touch touch, feel, sense, sensation, rub, perceive, grasp, press, gentle, light

touch_seed_words = []

base_set = [synset.name() for synset in wordnet.synsets('touch') if '_' not in synset.name().split('.')[0] ] 
#            [synset.name() for synset in wordnet.synsets('rub')  if '_' not in synset.name().split('.')[0]]
#            [synset.name() for synset in wordnet.synsets('rub')  if '_' not in synset.name().split('.')[0]] +\
#            [synset.name() for synset in wordnet.synsets('grasp')  if '_' not in synset.name().split('.')[0]] 


remove = ['upset.v.01', 'get.v.29', 'border.v.05', 'crawl.v.02', 
           'sense.v.04', 'sense.v.02', 'sense.n.05', 'sense.n.03', 
           'sense.n.02', 'sense.n.01', 'find.v.05', 'catch.n.09', 
           'cover.v.02', 'upset.v.01', 'reason.v.01', 'foreplay.n.01', 
           'change.v.01', 'be.v.01', 'find.v.03', 'search.v.01', 
           'experience.v.01', 'think.v.01', 'be.v.03', 'look.v.02', 
           'awareness.n.01', 'meaning.n.01', 'faculty.n.01', 'detect.v.01',
           'understand.v.01', 'guide.v.05', 'irritate.v.02', 'influence.n.01',
           'manage.v.02', 'affect.v.05', 'adeptness.n.01', 'perception.n.03',
           'attack.n.07', 'act.n.02', 'examination.n.01', 'manner.n.01',
           'suggestion.n.02', 'exteroception.n.01', 'perceive.v.01',
           'solicitation.n.01', 'worry.v.06', 'puree.v.01', 'grate.v.03',
           'gauge.v.02', 'sensitivity.n.01', 'modality.n.03', 'handling.n.02', 
           'suffocate.v.06', 'sympathize.v.01', 'suffer.v.03', 'tag.n.05', 'tap.n.08',
           'smolder.v.02', 'sadden.v.02', 'repent.v.02', 'die.v.05','engage.v.06',
           'fume.v.01', 'glow.v.04', 'glow.v.05', 'harbor.v.01', 'attach.v.02',
           'incline.v.05', 'pride.v.01', 'recapture.v.01', 'rejoice.v.01',
           'fume.v.01', 'burn.v.06', 'tentacle.n.01', 'anger.v.02', 'snuff.n.02',
           'violate.v.03', 'sensitivity.n.01', 'creepiness.n.01', 'surround.v.01',
           'smell.v.05', 'grok.v.01', 'compass.n.03', 'appreciation.n.01', 'hang-up.n.02',
           'contact.n.08', 'refer.v.02', 'affect.v.01', 'equal.v.02', 'allude.v.01',
           'partake.v.03', 'tint.v.01', 'spirit.n.02', 'dig.n.05',
           'worry.v.06', 'puree.v.01', 'grate.v.03', 'gauge.v.02', 'sensitivity.n.01',
           'modality.n.03', 'suffocate.v.06', 'sympathize.v.01', 'suffer.v.03','grazing.n.02', 
           'smolder.v.02', 'sadden.v.02', 'repent.v.02', 'die.v.05', 'fume.v.01',
           'glow.v.04', 'glow.v.05', 'harbor.v.01', 'incline.v.05', 'pride.v.01', 'recapture.v.01',
           'rejoice.v.01', 'fume.v.01', 'burn.v.06', 'tentacle.n.01', 'anger.v.02',
           'violate.v.03', 'sensitivity.n.01', 'creepiness.n.01', 'surround.v.01', 'contact.n.04',
           'bid.v.03', 'crusade.v.01', 'imperativeness.n.01', 'iron.v.01', 'urge.v.01', 'mouth.v.03',
           'wardrobe.n.01', 'weigh.v.05', 'weight-lift.v.01', 'compress.v.02', 'compress.v.02']


touch_seed_words = [word for word in base_set if word not in remove and word not in touch_seed_words]

hyponyms_set = [y.name() for x in touch_seed_words for y in wordnet.synset(x).hyponyms() if '_' not in y.name().split('.')[0]]

touch_seed_words = touch_seed_words + [word for word in hyponyms_set if word not in remove and word not in touch_seed_words]

hypernyms_set = [y.name() for x in touch_seed_words for y in wordnet.synset(x).hypernyms() if '_' not in y.name().split('.')[0]]
    
touch_seed_words = touch_seed_words + [word for word in hypernyms_set if word not in remove and word not in touch_seed_words]

touch_seed_words = list(set(touch_seed_words))

print(touch_seed_words)

variations_set = []
for word in touch_seed_words: 
    variations= get_word_forms(word.split('.')[0])
    for key in variations: 
        for value in variations[key]: 
            if value not in variations_set: 
                variations_set.append(value + '.' + key)
                      
        
touch_seed_words = [word.split('.')[0] + '.' + word.split('.')[1] for word in touch_seed_words]
touch_seed_words = list(set(touch_seed_words))
touch_seed_words = touch_seed_words + [word for word in variations_set if word not in touch_seed_words and '_' not in word.split('.')[0]]

touch_seed_words = list(set(touch_seed_words))

textfile = open("../data/seed_words/touch_words_list.txt", "w")
for element in touch_seed_words:
    textfile.write(element + "\n")
textfile.close()

len(touch_seed_words)

['kiss.n.04', 'toe.v.05', 'touch.n.09', 'handle.v.04', 'hit.v.02', 'tickle.n.02', 'stroke.n.05', 'touch.v.08', 'kiss.n.01', 'touch.n.05', 'hit.v.03', 'touch.n.12', 'contact.n.02', 'hit.n.02', 'snog.v.01', 'lick.n.02', 'touch.n.02', 'grope.n.01', 'touch.v.07', 'strike.v.10', 'touch.v.11', 'touch.n.03', 'brush.v.02', 'touch.v.01', 'rub.v.02', 'reach.v.06', 'touch.v.13', 'palpate.v.01', 'fingering.n.02', 'finger.v.01', 'stroke.v.01', 'tag.v.02', 'hug.v.02', 'somatosense.n.01', 'kiss.v.02', 'brush.n.03', 'touch.n.01', 'press.v.01', 'feel.v.13', 'touch.n.06', 'strike.v.01', 'touch.v.02', 'touch.n.04', 'touch.v.03', 'touch.n.10', 'touch.n.11', 'cling.v.01', 'touch.v.05', 'palpation.n.01', 'stroke.n.04', 'touch.n.08']


195

In [91]:
# create taste seed words 
# Taste taste, flavor, savor, savour, palate, bite, mouthful, morsel, eat, teeth

taste_seed_words = []

base_set = [synset.name() for synset in wordnet.synsets('taste') if '_' not in synset.name().split('.')[0] ] +\
           [synset.name() for synset in wordnet.synsets('flavor')  if '_' not in synset.name().split('.')[0]] +\
           [synset.name() for synset in wordnet.synsets('savor')  if '_' not in synset.name().split('.')[0]] +\
           [synset.name() for synset in wordnet.synsets('palate')  if '_' not in synset.name().split('.')[0]]  +\
           [synset.name() for synset in wordnet.synsets('bite')  if '_' not in synset.name().split('.')[0]]

remove = ['preference.n.01', 'sample.v.01', 'spirit.n.02', 'enjoy.v.01', 'sting.n.03', 'pungency.n.01', 'pungency.n.02',
         'sting.v.02', 'vogue.n.01', 'surface.n.02', 'culture.n.02', 'nettle.v.01', 'flatness.n.03', 'strangeness.n.02', 
          'sensation.n.01', 'success.n.01', 'experience.n.03', 'sensing.n.02', 'pierce.v.05', 'finish.n.08', 'grip.v.01',
          'clip.n.05', 'know.v.05', 'exteroception.n.01', 'modality.n.03', 'kind.n.01', 'subtraction.n.02', 'discrimination.n.02',
          'virtu.n.01', 'identify.v.06', 'perceive.v.01', 'wound.n.01', 'ache.v.03', 'charm.n.04', 'lemon.n.04', 'curry.v.01',
          'season.v.01', 'refreshment.n.01', 'salt.v.01', 'crumb.n.03', 'snakebite.n.01', 'vanilla.n.03', 'sop.n.01', 'snap.v.12'
         ]

taste_seed_words = [word for word in base_set if word not in remove and word not in taste_seed_words]

hyponyms_set = [y.name() for x in taste_seed_words for y in wordnet.synset(x).hyponyms() if '_' not in y.name().split('.')[0]]

taste_seed_words = taste_seed_words + [word for word in hyponyms_set if word not in remove and word not in taste_seed_words]

hypernyms_set = [y.name() for x in taste_seed_words for y in wordnet.synset(x).hypernyms() if '_' not in y.name().split('.')[0]]
    
taste_seed_words = taste_seed_words + [word for word in hypernyms_set if word not in remove and word not in taste_seed_words]

taste_seed_words = list(set(taste_seed_words))

print(taste_seed_words)

variations_set = []
for word in taste_seed_words: 
    variations= get_word_forms(word.split('.')[0])
    for key in variations: 
        for value in variations[key]: 
            if value not in variations_set: 
                variations_set.append(value + '.' + key)
                      
        
taste_seed_words = [word.split('.')[0] + '.' + word.split('.')[1] for word in taste_seed_words]
taste_seed_words = list(set(taste_seed_words))
taste_seed_words = taste_seed_words + [word for word in variations_set if word not in taste_seed_words and '_' not in word.split('.')[0]]

taste_seed_words = list(set(taste_seed_words))

textfile = open("../data/seed_words/taste_words_list.txt", "w")
for element in taste_seed_words:
    textfile.write(element + "\n")
textfile.close()

len(taste_seed_words)

['nip.n.06', 'taste.n.05', 'flavor.n.03', 'chew.v.01', 'gnaw.v.01', 'savor.v.04', 'bite.n.01', 'eating.n.01', 'relish.n.03', 'morsel.n.02', 'bite.n.04', 'bitter.n.02', 'nosh.n.01', 'taste.v.06', 'taste.n.07', 'smack.v.03', 'astringency.n.01', 'nibble.n.02', 'palate.n.01', 'taste.n.03', 'bite.v.01', 'bite.v.03', 'swallow.n.01', 'munch.n.02', 'sweet.n.04', 'taste.v.05', 'salt.n.04', 'savor.v.03', 'nibble.v.02', 'bite.n.05', 'meal.n.01', 'bite.n.08', 'mellowness.n.02', 'taste.n.04', 'delicacy.n.03', 'bite.v.02', 'nip.v.02', 'taste.n.01', 'taste.v.01', 'taste.v.02', 'sour.n.02', 'nibble.v.01', 'taste.n.06', 'chew.n.01', 'bite.n.09']


194

In [92]:
# create sight seed words 
# Sight see, look, visual, glance, stare, gaze, view, observe, notice, watch

sight_seed_words = []

base_set = [synset.name() for synset in wordnet.synsets('see') if '_' not in synset.name().split('.')[0] ] +\
           [synset.name() for synset in wordnet.synsets('look')  if '_' not in synset.name().split('.')[0]]

remove = ['be.v.01', 'reinterpret.v.02', 'reconsider.v.01', 'bet.v.02', 'tour.v.01', 'favor.v.02', 'accompany.v.02', 'consider.v.05',
          'appreciate.v.02', 'sparkle.n.01', 'identify.v.03', 'intrude.v.03', 'card.v.02', 'perceive.v.01',' trust.v.01', 'zeitgeist.n.01',
          'come.v.20', 'relativize.v.01', 'learn.v.02','control.v.06', 'visit.v.01', 'minister.v.01', 'read.v.01', 'count.v.08', 
          'hollywood.n.02', 'interpret.v.01', 'proofread.v.01', 'check.v.02', 'attend.v.02', 'idealize.v.01', 'gloat.v.02',
          'prize.v.01', 'glow.v.02', 'deem.v.01', 'intersect.v.01', 'undergo.v.01', 'seat.n.05', 'glitter.v.01', 
          'front.v.01', 'dekko.n.01', 'determine.v.08', 'cross-check.v.01', 'meet.v.01', 'experience.v.01', 'rise.v.04'
          'preview.v.01', 'atmosphere.n.01', 'enjoy.v.04', 'sound.v.01', 'feel.v.06', 'think.v.01',  'anticipate.v.05'
          'search.v.02', 'receive.v.13', 'autopsy.v.01', 'convey.v.01', 'perceive.v.02', 'make.v.43', 'understand.v.02',
          'feel.v.12', 'disrespect.v.02', 'candle.v.01', 'expect.v.05', 'catch.v.28', 'reconsider.v.02', 'reify.v.01',
          'respect.v.01', 'make.v.35', 'call.v.27', 'like.v.04', 'capitalize.v.05', 'regard.v.02', 'anticipate.v.05'
          'expression.n.01', 'spot-check.v.01', 'expect.v.04', 'cruise.v.03', 'cinch.v.02', 'expression.n.01',
          'detect.v.01', 'auscultate.v.01',  'match.v.01', 'behold.v.01', 'imagine.v.01', 'visit.v.03', 'receive.v.05',
          'cut.v.13', 'search.v.04', 'sensing.n.02', 'verify.v.01', 'expect.v.03', 'x-ray.v.01', 'loom.v.01', 'feel.v.07', 'cover.v.13',
          'countenance.n.01',  'prospect.v.01','include.v.02', 'spirit.n.02', 'rise.v.04', 'observation.n.02', 'consider.v.09',
          'hunt.v.07',  'admire.v.02', 'abstract.v.01', 'double-check.v.01', 'appearance.n.01', 'preview.v.01', 'sightseeing.n.01',
          'search.v.02', 'survey.v.02'
         ]

sight_seed_words = [word for word in base_set if word not in remove and word not in sight_seed_words]

hyponyms_set = [y.name() for x in sight_seed_words for y in wordnet.synset(x).hyponyms() if '_' not in y.name().split('.')[0]]

sight_seed_words = sight_seed_words + [word for word in hyponyms_set if word not in remove and word not in sight_seed_words]

hypernyms_set = [y.name() for x in sight_seed_words for y in wordnet.synset(x).hypernyms() if '_' not in y.name().split('.')[0]]
    
sight_seed_words = sight_seed_words + [word for word in hypernyms_set if word not in remove and word not in sight_seed_words]

sight_seed_words = list(set(sight_seed_words))

print(sight_seed_words)

variations_set = []
for word in sight_seed_words: 
    variations= get_word_forms(word.split('.')[0])
    for key in variations: 
        for value in variations[key]: 
            if value not in variations_set: 
                variations_set.append(value + '.' + key)
                      
        
sight_seed_words = [word.split('.')[0] + '.' + word.split('.')[1] for word in sight_seed_words]
sight_seed_words = list(set(sight_seed_words))
sight_seed_words = sight_seed_words + [word for word in variations_set if word not in sight_seed_words and '_' not in word.split('.')[0]]

sight_seed_words = list(set(sight_seed_words))

textfile = open("../data/seed_words/sight_words_list.txt", "w")
for element in sight_seed_words:
    textfile.write(element + "\n")
textfile.close()

len(sight_seed_words)

['look.v.02', 'peruse.v.01', 'see.v.05', 'visualize.v.02', 'see.v.10', 'see.v.17', 'look.v.09', 'see.v.22', 'squint.n.02', 'look.n.03', 'watch.v.03', 'see.v.12', 'lookout.n.04', 'leer.v.01', 'see.v.23', 'peer.v.01', 'look.v.07', 'peep.v.01', 'peek.n.01', 'goggle.v.01', 'stare.v.02', 'stare.n.01', 'scan.v.01', 'squint.v.02', 'scan.v.02', 'scrutiny.n.02', 'spectate.v.01', 'examine.v.02', 'squint.v.03', 'inspect.v.01', 'look.v.01', 'eye.v.01', 'look.n.02', 'view.n.03', 'see.n.01', 'watch.v.01', 'witness.v.02', 'gaze.v.01', 'ogle.v.01', 'visualize.v.01', 'see.v.19', 'glimpse.v.01', 'see.v.11', 'see.v.15', 'see.v.18', 'look.v.03', 'glance.v.01', 'glance.n.01', 'see.v.01']


195

In [97]:
# find duplicates 
all_seed_words =  sight_seed_words + taste_seed_words + hear_seed_words + smell_seed_words + touch_seed_words
print([item for item, count in collections.Counter(all_seed_words).items() if count > 1])
print(set([word.split('.')[1] for word in all_seed_words]))

[]
{'r', 'n', 'v', 'a', 's'}


In [99]:
# assign POS using spacy's POS tags
list_of_seed_words = {'sight': sight_seed_words, 
                      'taste' : taste_seed_words, 
                      'hear' : hear_seed_words, 
                      'smell': smell_seed_words, 
                      'touch' : touch_seed_words}

seed_words = pd.DataFrame(columns = ['seed_word', 'word', 'pos', 'sense_name'])
pos_dict = {'n': 'NOUN', 'v': 'VERB', 'r': 'ADV', 's': 'ADJ', 'a': 'ADJ'}

for key, value in list_of_seed_words.items():
    count = 0 
    for w in value:
        count += 1
        word = w.split('.')[0]
        pos = pos_dict[w.split('.')[1]]
        seed_word = (word, pos)
        sense_name = key
        seed_words = seed_words.append({'seed_word' : seed_word,
                                        'word' : word,
                                        'pos' : pos,
                                        'sense_name' : sense_name}, ignore_index = True)
    print(key + ": " + str(count))
    
print('Total: ' + str(len(all_seed_words)))

sight: 195
taste: 194
hear: 192
smell: 184
touch: 195
Total: 960


In [101]:
seed_words.tail()

Unnamed: 0,seed_word,word,pos,sense_name
955,"(somatosense, NOUN)",somatosense,NOUN,touch
956,"(pressuring, VERB)",pressuring,VERB,touch
957,"(kisses, NOUN)",kisses,NOUN,touch
958,"(striking, NOUN)",striking,NOUN,touch
959,"(hitter, NOUN)",hitter,NOUN,touch


In [102]:
seed_words.to_csv('../data/seed_words/seed_words.csv', index = False)

with open('../data/seed_words/seed_words.pickle', 'wb') as f:
        pkl.dump(seed_words, f)

In [None]:
with open(input_path, "rb") as f:
    input_obj = pkl.load(f)

### Old Seed Words Method

In [None]:
smell_seed_words = [synset.name().split('.')[0] for synset in wordnet.synsets('smell')] + \
                   [synset.name().split('.')[0] for synset in wordnet.synsets('scent')] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('smell.v.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('smell.v.01').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('smell.n.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('smell.n.01').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('smell.n.02').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('smell.n.02').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('smell.n.04').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('smell.n.04').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('scent.n.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('scent.n.01').hypernyms()] +\
                   [synset.name().split('.')[0] for synset in wordnet.synset('odor.n.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('odor.n.01').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('odor.n.02').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('odor.n.02').hypernyms()]

remove = ['spirit', 'perceive', 'sensation', 'property', 'exteroception', 'modality']

smell_seed_words = [word for word in smell_seed_words if word not in remove]

for word in smell_seed_words: 
    variations = get_word_forms(word)
    for key in variations: 
        for value in variations[key]: 
            if value not in smell_seed_words: 
                smell_seed_words.append(value)
                
smell_seed_words = [word for word in smell_seed_words if '_' not in word]    
    
smell_seed_words = list(set(smell_seed_words))

remove = ['spirit', 'perceive', 'sensation', 'property', 'exteroception', 'modality', 'nosinesses', 'nosiness']

smell_seed_words = [word for word in smell_seed_words if word not in remove]

smell_seed_words = [word for word in smell_seed_words if word not in all_seed_words]

all_seed_words = all_seed_words + smell_seed_words 

textfile = open("../data/seed_words/smell_words_list.txt", "w")
for element in smell_seed_words:
    textfile.write(element + "\n")
textfile.close()

print(smell_seed_words)
print(len(smell_seed_words))

In [None]:
# create hear seed words list
hear_seed_words = [synset.name().split('.')[0] for synset in wordnet.synsets('hear')] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('hear.v.01').hyponyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('hear.v.01').hypernyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('listen.v.01').hyponyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('listen.v.01').hypernyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('sound.v.06').hyponyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('sound.v.06').hypernyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('sound.n.01').hyponyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('sound.n.01').hypernyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('sound.n.02').hyponyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('sound.n.02').hypernyms()]

for word in hear_seed_words: 
    variations = get_word_forms(word)
    for key in variations: 
        for value in variations[key]: 
            if value not in hear_seed_words: 
                hear_seed_words.append(value)
            
hear_seed_words = [word for word in hear_seed_words if '_' not in word]
            
hear_seed_words = list(set(hear_seed_words))

hear_seed_words = [word for word in hear_seed_words if word not in all_seed_words]

all_seed_words = all_seed_words + hear_seed_words 

textfile = open("../data/seed_words/hear_words_list.txt", "w")
for element in hear_seed_words:
    textfile.write(element + "\n")
textfile.close()

len(hear_seed_words)

In [None]:
# create touch seed words list
touch_seed_words = [synset.name().split('.')[0] for synset in wordnet.synsets('touch')] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('touch.v.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('touch.v.01').hypernyms()] +\
                   [synset.name().split('.')[0] for synset in wordnet.synset('touch.n.02').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('touch.n.02').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('feel.n.03').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('feel.n.03').hypernyms()]

for word in touch_seed_words: 
    variations = get_word_forms(word)
    for key in variations: 
        for value in variations[key]: 
            if value not in touch_seed_words: 
                touch_seed_words.append(value)
                
touch_seed_words = [word for word in touch_seed_words if '_' not in word]
                
touch_seed_words = list(set(touch_seed_words))

touch_seed_words = [word for word in touch_seed_words if word not in all_seed_words]

all_seed_words = all_seed_words + touch_seed_words 

textfile = open("../data/seed_words/touch_words_list.txt", "w")
for element in touch_seed_words:
    textfile.write(element + "\n")
textfile.close()

len(touch_seed_words)

In [None]:
# create taste seed words 
taste_seed_words = [synset.name().split('.')[0] for synset in wordnet.synsets('taste')] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('taste.v.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('taste.v.01').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('taste.n.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('taste.n.01').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('taste.n.03').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('taste.n.03').hypernyms()] +\
                   [synset.name().split('.')[0] for synset in wordnet.synset('flavor.v.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('flavor.v.01').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('flavor.n.02').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('flavor.n.02').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('savor.v.04').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('savor.v.04').hypernyms()]

for word in taste_seed_words: 
    variations = get_word_forms(word)
    for key in variations: 
        for value in variations[key]: 
            if value not in taste_seed_words: 
                taste_seed_words.append(value)
                
taste_seed_words = [word for word in taste_seed_words if '_' not in word]                
                
taste_seed_words = list(set(taste_seed_words))

taste_seed_words = [word for word in taste_seed_words if word not in all_seed_words]

all_seed_words = all_seed_words + taste_seed_words 

textfile = open("../data/seed_words/taste_words_list.txt", "w")
for element in taste_seed_words:
    textfile.write(element + "\n")
textfile.close()

len(taste_seed_words)

In [None]:
# create sight seed words list 
# Sight see, look, visual, glance, stare, gaze, view, observe, notice, watch
sight_seed_words = [synset.name().split('.')[0] for synset in wordnet.synsets('see')] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('see.v.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('look.v.01').hyponyms()]
                   

for word in sight_seed_words: 
    variations = get_word_forms(word)
    for key in variations: 
        for value in variations[key]: 
            if value not in sight_seed_words: 
                sight_seed_words.append(value)
                
sight_seed_words = [word for word in sight_seed_words if '_' not in word]

sight_seed_words = list(set(sight_seed_words))

sight_seed_words = [word for word in sight_seed_words if word not in all_seed_words]

all_seed_words = all_seed_words + sight_seed_words 

textfile = open("../data/seed_words/sight_words_list.txt", "w")
for element in sight_seed_words:
    textfile.write(element + "\n")
textfile.close()

len(sight_seed_words)

In [None]:
# POS tag words 
def pos_tag_words(word):
    for token in nlp(word):
        return (str(token.text).lower(), str(token.pos_))
    
seed_words['word_pos'] = seed_words.apply(lambda row: pos_tag_words(row['word']), axis = 1)

In [None]:
# read seed words 
modalities = ['sight', 'hear', 'touch', 'taste', 'smell']
seed_words = pd.DataFrame(columns = ['word', 'sense_name'])
overlapping_words = []

for sense_name in modalities:
            with open('../data/seed_words/' + sense_name + '_words_list.txt', 'r') as filehandle:
                count = 0 
                for line in filehandle:
                    count += 1
                    currentPlace = line[:-1]
                    if seed_words['word'].str.contains(currentPlace).any():
                        overlapping_words.append(sense_name + '_' + currentPlace)
                    seed_words = seed_words.append({'word' : currentPlace, 'sense_name' : sense_name}, ignore_index = True)
                print(sense_name + ": " + str(count))
print('Total: ' + str(seed_words.shape[0]))