In [34]:
import pandas as pd 
import zipfile
import plotly.graph_objects as go
import plotly.express as px
import pickle as pkl
import spacy
from nltk.corpus import wordnet
from word_forms.word_forms import get_word_forms
nlp = spacy.load("en_core_web_sm")

## MetaData

In [35]:
metadata = pd.read_json("../data/gutenberg-dammit-files/gutenberg-metadata.json")
metadata.columns

Index(['Author', 'Author Birth', 'Author Death', 'Author Given',
       'Author Surname', 'Copyright Status', 'Language', 'LoC Class', 'Num',
       'Subject', 'Title', 'charset', 'gd-num-padded', 'gd-path', 'href'],
      dtype='object')

In [36]:
# top authors
unique_authors = {}
def count_authors(author_list):
    if not isinstance(author_list, list):
        author_list = ['None Available']
    for val in author_list:
        if val in unique_authors:
            unique_authors[val] +=1 
        else: 
            unique_authors[val] = 1
    return len(author_list)
            
metadata['Number of Authors'] = metadata.apply(lambda row : count_authors(row['Author']), axis = 1)
top_20_authors = sorted(unique_authors.items(), key=lambda x: x[1], reverse = True)[:20]
top_20_authors =  [list(x) for x in top_20_authors]
top_20_authors_names = [x[0] for x in top_20_authors]
top_20_authors_counts = [x[1] for x in top_20_authors]

fig = go.Figure(data=[go.Table(header=dict(values=['Author', 'Count']),
                 cells=dict(values=[top_20_authors_names,top_20_authors_counts] ))
                     ])
fig.show()

In [37]:
# top genres
unique_genres = {}

def count_genres(genre_list):
    if not isinstance(genre_list, list):
        genre_list = ['None Available']
    for val in genre_list:
        if val in unique_genres:
            unique_genres[val] +=1 
        else: 
            unique_genres[val] = 1
    return len(genre_list)

metadata['Number of Genres'] = metadata.apply(lambda row : count_genres(row['Subject']), axis = 1)
top_20_genres = sorted(unique_genres.items(), key=lambda x: x[1], reverse = True)[:20]
top_20_genres =  [list(x) for x in top_20_genres]
top_20_genre_name = [x[0] for x in top_20_genres]
top_20_genre_counts = [x[1] for x in top_20_genres]

fig = go.Figure(data=[go.Table(header=dict(values=['Genre', 'Count']),
                 cells=dict(values=[top_20_genre_name,top_20_genre_counts] ))
                     ])
fig.show()

In [38]:
# entire corpus stats 
print('Total number of books: ' + str(metadata.shape[0]))
print('Books without a subject: ' + str(metadata[metadata['Subject'].isnull()].shape[0]))
print('Books with English language: ' + str(metadata.loc[metadata['Language'].isin([['English']])].shape[0]))
print('Unique authors: ' + str(len(unique_authors)))

Total number of books: 50729
Books without a subject: 8535
Books with English language: 41485
Unique authors: 18462


In [39]:
# narrow down corpus metadata to English, fiction, valid author birth 
metadata['is_english_fiction'] = metadata.apply(lambda row : True if isinstance(row['Subject'], list) and (any('fiction' in subject for subject in row['Subject']) or any('Fiction' in subject for subject in row['Subject'])) and (row['Language'] == ['English']) and (isinstance(row['Author Birth'], list) and None not in row['Author Birth'] and row['Author Birth'] != ['?'] and row['Author Birth'] != ['?', '?'] and len(row['Author Birth']) != 0) else False, axis = 1)
english_fiction_metadata = metadata.loc[metadata['is_english_fiction'] == True]
english_fiction_metadata['original_path'] = '../data/gutenberg-dammit-files/' + english_fiction_metadata['gd-path']
english_fiction_metadata['tokenized_path'] = '../data/gutenberg-dammit-files/' + english_fiction_metadata['gd-path'].str[:-4] + '_tagged.pickle' 
english_fiction_metadata['cw_df_path'] = '../data/gutenberg-dammit-files/' + english_fiction_metadata['gd-path'].str[:-4] + '_cw_df.pickle'
english_fiction_metadata.shape



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(7588, 21)

In [40]:
# extract author birth century and save
def find_century(row):
    birth_years = row['Author Birth']
    century_list = []
    for year in birth_years:
        if year == '?':
            continue
        else:
            century = (int(year)) // 100
        if century < 15:
            century =  str('Before 15')
        if century not in century_list:
            century_list.append(century)
    return str(century_list[0]) + '00'
        
english_fiction_metadata['Author Birth Century'] = english_fiction_metadata.apply(lambda row: find_century(row), axis = 1)
english_fiction_metadata = english_fiction_metadata.sort_values(by=['Author Birth Century'])
english_fiction_metadata.shape



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



(7588, 22)

In [41]:
# plot out corpus by century 
fig = px.histogram(data_frame = english_fiction_metadata, x="Author Birth Century", title='Distribution of Author Birth Century').update_layout(yaxis_title="Frequency in Corpus")
fig.show()

In [42]:
english_fiction_metadata = english_fiction_metadata.sort_values(by=['gd-path'])
english_fiction_metadata.to_csv('../data/english_fiction_metadata.csv', index = False)

In [43]:
# exclude texts outside 1700-1900 range 
english_fiction_metadata = english_fiction_metadata.loc[english_fiction_metadata['Author Birth Century'].isin(['1700', '1800', '1900'])]

In [44]:
# top authors- english_fiction_metadata
unique_authors = {}
def count_authors(author_list):
    if not isinstance(author_list, list):
        author_list = ['None Available']
    for val in author_list:
        if val in unique_authors:
            unique_authors[val] +=1 
        else: 
            unique_authors[val] = 1
    return len(author_list)
            
english_fiction_metadata['Number of Authors'] = english_fiction_metadata.apply(lambda row : count_authors(row['Author']), axis = 1)
top_20_authors = sorted(unique_authors.items(), key=lambda x: x[1], reverse = True)[:20]
top_20_authors =  [list(x) for x in top_20_authors]
top_20_authors_names = [x[0] for x in top_20_authors]
top_20_authors_counts = [x[1] for x in top_20_authors]

fig = go.Figure(data=[go.Table(header=dict(values=['Author', 'Count']),
                 cells=dict(values=[top_20_authors_names,top_20_authors_counts] ))
                     ])
fig.show()

In [45]:
# top genres - english_fiction_metadata
unique_genres = {}

def count_genres(genre_list):
    if not isinstance(genre_list, list):
        genre_list = ['None Available']
    for val in genre_list:
        if val in unique_genres:
            unique_genres[val] +=1 
        else: 
            unique_genres[val] = 1
    return len(genre_list)

english_fiction_metadata['Number of Genres'] = english_fiction_metadata.apply(lambda row : count_genres(row['Subject']), axis = 1)
top_20_genres = sorted(unique_genres.items(), key=lambda x: x[1], reverse = True)[:20]
top_20_genres =  [list(x) for x in top_20_genres]
top_20_genre_name = [x[0] for x in top_20_genres]
top_20_genre_counts = [x[1] for x in top_20_genres]

fig = go.Figure(data=[go.Table(header=dict(values=['Genre', 'Count']),
                 cells=dict(values=[top_20_genre_name,top_20_genre_counts] ))
                     ])
fig.show()

In [46]:
# entire corpus stats 
print('Total number of books: ' + str(english_fiction_metadata.shape[0]))
print('Books without a subject: ' + str(english_fiction_metadata[english_fiction_metadata['Subject'].isnull()].shape[0]))
print('Books with English as language: ' + str(english_fiction_metadata.loc[english_fiction_metadata['Language'].isin([['English']])].shape[0]))
print('Unique authors: ' + str(len(unique_authors)))

Total number of books: 7487
Books without a subject: 0
Books with English as language: 7487
Unique authors: 1956


## Seed Words

In [18]:
all_seed_words = []

In [19]:
# create smell seed words 
smell_seed_words = [synset.name().split('.')[0] for synset in wordnet.synsets('smell')] + \
                   [synset.name().split('.')[0] for synset in wordnet.synsets('scent')] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('smell.v.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('smell.v.01').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('smell.n.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('smell.n.01').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('smell.n.02').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('smell.n.02').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('smell.n.04').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('smell.n.04').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('scent.n.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('scent.n.01').hypernyms()] +\
                   [synset.name().split('.')[0] for synset in wordnet.synset('odor.n.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('odor.n.01').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('odor.n.02').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('odor.n.02').hypernyms()]

for word in smell_seed_words: 
    variations = get_word_forms(word)
    for key in variations: 
        for value in variations[key]: 
            if value not in smell_seed_words: 
                smell_seed_words.append(value)
                
smell_seed_words = [word for word in smell_seed_words if '_' not in word]    
    
smell_seed_words = list(set(smell_seed_words))

smell_seed_words = [word for word in smell_seed_words if word not in all_seed_words]

all_seed_words = all_seed_words + smell_seed_words 

textfile = open("../data/seed_words/smell_words_list.txt", "w")
for element in smell_seed_words:
    textfile.write(element + "\n")
textfile.close()

len(smell_seed_words)

154

In [20]:
# create hear seed words list
hear_seed_words = [synset.name().split('.')[0] for synset in wordnet.synsets('hear')] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('hear.v.01').hyponyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('hear.v.01').hypernyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('listen.v.01').hyponyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('listen.v.01').hypernyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('sound.v.06').hyponyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('sound.v.06').hypernyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('sound.n.01').hyponyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('sound.n.01').hypernyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('sound.n.02').hyponyms()] + \
                  [synset.name().split('.')[0] for synset in wordnet.synset('sound.n.02').hypernyms()]

for word in hear_seed_words: 
    variations = get_word_forms(word)
    for key in variations: 
        for value in variations[key]: 
            if value not in hear_seed_words: 
                hear_seed_words.append(value)
            
hear_seed_words = [word for word in hear_seed_words if '_' not in word]
            
hear_seed_words = list(set(hear_seed_words))

hear_seed_words = [word for word in hear_seed_words if word not in all_seed_words]

all_seed_words = all_seed_words + hear_seed_words 

textfile = open("../data/seed_words/hear_words_list.txt", "w")
for element in hear_seed_words:
    textfile.write(element + "\n")
textfile.close()

len(hear_seed_words)

174

In [21]:
# create touch seed words 
touch_seed_words = [synset.name().split('.')[0] for synset in wordnet.synsets('touch')] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('touch.v.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('touch.v.01').hypernyms()] +\
                   [synset.name().split('.')[0] for synset in wordnet.synset('touch.n.02').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('touch.n.02').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('feel.n.03').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('feel.n.03').hypernyms()]

for word in touch_seed_words: 
    variations = get_word_forms(word)
    for key in variations: 
        for value in variations[key]: 
            if value not in touch_seed_words: 
                touch_seed_words.append(value)
                
touch_seed_words = [word for word in touch_seed_words if '_' not in word]
                
touch_seed_words = list(set(touch_seed_words))

touch_seed_words = [word for word in touch_seed_words if word not in all_seed_words]

all_seed_words = all_seed_words + touch_seed_words 

textfile = open("../data/seed_words/touch_words_list.txt", "w")
for element in touch_seed_words:
    textfile.write(element + "\n")
textfile.close()

len(touch_seed_words)

192

In [22]:
# create taste seed words 
taste_seed_words = [synset.name().split('.')[0] for synset in wordnet.synsets('taste')] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('taste.v.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('taste.v.01').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('taste.n.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('taste.n.01').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('taste.n.03').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('taste.n.03').hypernyms()] +\
                   [synset.name().split('.')[0] for synset in wordnet.synset('flavor.v.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('flavor.v.01').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('flavor.n.02').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('flavor.n.02').hypernyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('savor.v.04').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('savor.v.04').hypernyms()]

for word in taste_seed_words: 
    variations = get_word_forms(word)
    for key in variations: 
        for value in variations[key]: 
            if value not in taste_seed_words: 
                taste_seed_words.append(value)
                
taste_seed_words = [word for word in taste_seed_words if '_' not in word]                
                
taste_seed_words = list(set(taste_seed_words))

taste_seed_words = [word for word in taste_seed_words if word not in all_seed_words]

all_seed_words = all_seed_words + taste_seed_words 

textfile = open("../data/seed_words/taste_words_list.txt", "w")
for element in taste_seed_words:
    textfile.write(element + "\n")
textfile.close()

len(taste_seed_words)

163

In [23]:
# create sight seed words list 
sight_seed_words = [synset.name().split('.')[0] for synset in wordnet.synsets('see')] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('see.v.01').hyponyms()] + \
                   [synset.name().split('.')[0] for synset in wordnet.synset('look.v.01').hyponyms()]
                   

for word in sight_seed_words: 
    variations = get_word_forms(word)
    for key in variations: 
        for value in variations[key]: 
            if value not in sight_seed_words: 
                sight_seed_words.append(value)
                
sight_seed_words = [word for word in sight_seed_words if '_' not in word]

sight_seed_words = list(set(sight_seed_words))

sight_seed_words = [word for word in sight_seed_words if word not in all_seed_words]

all_seed_words = all_seed_words + sight_seed_words 

textfile = open("../data/seed_words/sight_words_list.txt", "w")
for element in sight_seed_words:
    textfile.write(element + "\n")
textfile.close()

len(sight_seed_words)

190

In [24]:
# read seed words 
modalities = ['sight', 'hear', 'touch', 'taste', 'smell']
seed_words = pd.DataFrame(columns = ['word', 'sense_name'])
overlapping_words = []

for sense_name in modalities:
            with open('../data/seed_words/' + sense_name + '_words_list.txt', 'r') as filehandle:
                count = 0 
                for line in filehandle:
                    count += 1
                    currentPlace = line[:-1]
                    if seed_words['word'].str.contains(currentPlace).any():
                        overlapping_words.append(sense_name + '_' + currentPlace)
                    seed_words = seed_words.append({'word' : currentPlace, 'sense_name' : sense_name}, ignore_index = True)
                print(sense_name + ": " + str(count))
print('Total: ' + str(seed_words.shape[0]))

sight: 190
hear: 174
touch: 192
taste: 163
smell: 154
Total: 873


In [25]:
# POS tag words 
def pos_tag_words(word):
    for token in nlp(word):
        return (str(token.text).lower(), str(token.pos_))
    
seed_words['word_pos'] = seed_words.apply(lambda row: pos_tag_words(row['word']), axis = 1)

In [26]:
seed_words.head()

Unnamed: 0,word,sense_name,word_pos
0,starer,sight,"(starer, VERB)"
1,glances,sight,"(glances, VERB)"
2,ogled,sight,"(ogled, VERB)"
3,saw,sight,"(saw, VERB)"
4,sawyer,sight,"(sawyer, NOUN)"


In [27]:
seed_words.to_csv('../data/seed_words/seed_words.csv', index = False)

In [28]:
 with open('../data/seed_words/seed_words.pickle', 'wb') as f:
            pkl.dump(seed_words, f)

In [None]:
with open(input_path, "rb") as f:
    input_obj = pkl.load(f)