# Model #1 Synonym & Library Extraction - Exact Match

In [1]:
import s3fs
import boto3
import pandas as pd
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')

import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Pull in Data From S3 Bucket

In [2]:
# csv file
#df_books_filtered = pd.read_csv('s3://ec2-jupyter-notebook-us-west-2-8c94c42abbd5478ca9a1a477613965a7/books_filtered.csv')
text_data = pd.read_csv('s3://book-data-ucb-capstone-s2022/books_filtered_colsreduced.csv')
print('Total Text DF size:', len(text_data))

model_data = pd.read_csv('s3://book-data-ucb-capstone-s2022/tokenized_data.csv')
print('Tokenized Text DF Size:', len(model_data))

Total Text DF size: 29652
Tokenized Text DF Size: 29652


In [3]:
model_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,bookId,title,series,author,description,isbn,genres,pages,...,Childrens,New Adult,Fantasy,History,Dystopia,Manga,Thriller,Graphic Novels,Romance,tokens
0,0,0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,9780440000000.0,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",374,...,,,Fantasy,,Dystopia,,,,Romance,"{'sentence', 'america', 'take', 'life', 'conte..."
1,1,1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",There is a door at the end of a silent corrido...,9780440000000.0,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",870,...,Childrens,,Fantasy,,,,,,,"{'strength', 'lot', 'ordinary', 'end', 'things..."
2,2,2,2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,10000000000000.0,"['Classics', 'Fiction', 'Historical Fiction', ...",324,...,,,,,,,,,,"{'published', 'experience', 'roots', 'classicc..."
3,3,3,1885.Pride_and_Prejudice,Pride and Prejudice,,"Jane Austen, Anna Quindlen (Introduction)",Alternate cover edition of ISBN 9780679783268S...,10000000000000.0,"['Classics', 'Fiction', 'Romance', 'Historical...",279,...,,,,,,,,,Romance,"{'austens', 'wit', 'cover', 'austen', 'popular..."
4,4,4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,About three things I was absolutely positive.\...,9780320000000.0,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",501,...,,,Fantasy,,,,,,Romance,"{'positivefirst', 'himand', 'things', 'part', ..."


In [4]:
#sample df
#df_samp = model_data.sample(n=100)
#df_samp.to_csv('100_random_sample.csv')

## Preprocessing
Skip this section if using preprocessed dataset

In [5]:
def preprocess_text(text):

  #lowercase text
  text_preprocessed = text.lower()
  #remove punctuation
  text_preprocessed = re.sub(r'[^a-zA-Z ]+', '', text_preprocessed)
  #tokenize for stopword removal
  text_preprocessed = word_tokenize(text_preprocessed)
  #remove stopwords
  text_preprocessed = [word for word in text_preprocessed if word not in stopwords.words('english')]
  #join to make string again
  #text_preprocessed = (" ").join(text_preprocessed)

  return text_preprocessed

In [6]:
%%time
model_data['tokens'] = model_data['description'].apply(lambda x: preprocess_text(x))

CPU times: user 5min 13s, sys: 23.4 s, total: 5min 36s
Wall time: 5min 36s


In [7]:
model_data['tokens'].head()

0    [winning, means, fame, fortunelosing, means, c...
1    [door, end, silent, corridor, haunting, harry,...
2    [unforgettable, novel, childhood, sleepy, sout...
3    [alternate, cover, edition, isbn, since, immed...
4    [three, things, absolutely, positivefirst, edw...
Name: tokens, dtype: object

In [8]:
#save dataset to csv
model_data.to_csv('tokenized_data.csv')
#upload dataset to s3 & google drive

## Synsets

In [9]:
def create_synsets(event):
  
  synonym = [] 
    
  for synset in wordnet.synsets(event): 
      for i in synset.lemmas(): 
          synonym.append(i.name()) # add all the synonyms available 
    
  return synonym

In [10]:
#creating library dataframe

#creating library dataframe


life_events = ['university', 'relationships', 'break ups', 'divorce', 'wedding', 
               'death', 'family', 'friendship', 'marriage']

#create synsets for select events where decent synsets exist
relationship_list = create_synsets('go_steady') + ['relationship', 'kinship', 'romance', 'dating']
marriage_list = create_synsets('marriage')
wedding_list = create_synsets('wedding') + ['matrimony']

#replace underscore (_) with space
relationship_list = [i.replace("_", " ") for i in relationship_list]
marriage_list = [i.replace("_", " ") for i in marriage_list]
wedding_list = [i.replace("_", " ") for i in wedding_list]

#remove certain words
wedding_list.remove('tie')
wedding_list.remove('marriage')
relationship_list.remove('see')

synsets = [['college', 'university', 'campus', 'academia', 'professor', 'colleges', 'universities', 'professors'], 
           relationship_list, 
           ['breakup', 'break up', 'split up', 'broken up', 'dumped', 'breaks up', 'splits up', 'dumps', 'dump', 'breaks off', 'break off'], 
           ['divorce', 'divorced', 'divorces'], 
           wedding_list,  
           ['death', 'decease', 'deceased', 'dying'],
           ['family', 'mother', 'father', 'brother', 'sister', 'mom', 'dad'],
           ['friends', 'friend', 'friendship', 'friendships'],
           marriage_list]

# Create the pandas DataFrame with column name is provided explicitly
df_lib = pd.DataFrame(life_events, columns=['life_event'])
df_lib['synsets'] = synsets
 
# print dataframe.
df_lib

Unnamed: 0,life_event,synsets
0,university,"[college, university, campus, academia, profes..."
1,relationships,"[go steady, go out, date, relationship, kinshi..."
2,break ups,"[breakup, break up, split up, broken up, dumpe..."
3,divorce,"[divorce, divorced, divorces]"
4,wedding,"[wedding, wedding ceremony, nuptials, hymeneal..."
5,death,"[death, decease, deceased, dying]"
6,family,"[family, mother, father, brother, sister, mom,..."
7,friendship,"[friends, friend, friendship, friendships]"
8,marriage,"[marriage, matrimony, union, spousal relations..."


## Match Tokens to Libraries

In [11]:
def match_tokens_to_libraries(df_syn, life_event, df_model, token_col):
    
    #match list of synsets to events
    syn_list = df_syn['synsets'][df_syn.life_event==life_event].item()
    print(syn_list)
    #pull out synset matches
    df_model[life_event] = df_model[token_col].apply(lambda x: [item for item in x if item in syn_list])
    #dedupe events
    df_model[life_event] = df_model[life_event].apply(lambda x: list(set(x)))
    
    return df_model[life_event]
    

In [12]:
#test one example
model_data['university'] = match_tokens_to_libraries(df_lib, 'university', model_data, 'tokens')
model_data['university'].value_counts()

['college', 'university', 'campus', 'academia', 'professor', 'colleges', 'universities', 'professors']


[]                                                 28407
[college]                                            611
[university]                                         196
[professor]                                          169
[campus]                                              46
[college, university]                                 37
[college, professor]                                  36
[professor, university]                               32
[college, campus]                                     28
[professors]                                          12
[professor, professors]                               11
[universities]                                         9
[colleges]                                             8
[college, professor, university]                       6
[university, campus]                                   6
[college, university, campus]                          6
[colleges, college]                                    4
[colleges, professor]          

In [13]:
#apply function for all life events
life_events = ['university', 'relationships', 'break ups', 'divorce', 'wedding', 
               'death', 'family', 'friendship']

for event in life_events:
    print(event)
    model_data[event] = match_tokens_to_libraries(df_lib, event, model_data, 'tokens')

university
['college', 'university', 'campus', 'academia', 'professor', 'colleges', 'universities', 'professors']
relationships
['go steady', 'go out', 'date', 'relationship', 'kinship', 'romance', 'dating']
break ups
['breakup', 'break up', 'split up', 'broken up', 'dumped', 'breaks up', 'splits up', 'dumps', 'dump', 'breaks off', 'break off']
divorce
['divorce', 'divorced', 'divorces']
wedding
['wedding', 'wedding ceremony', 'nuptials', 'hymeneals', 'wedding', 'marriage ceremony', 'wedding', 'wedding party', 'marry', 'get married', 'wed', 'conjoin', 'hook up with', 'get hitched with', 'espouse', 'marry', 'wed', 'splice', 'matrimony']
death
['death', 'decease', 'deceased', 'dying']
family
['family', 'mother', 'father', 'brother', 'sister', 'mom', 'dad']
friendship
['friends', 'friend', 'friendship', 'friendships']


### Check Events

In [14]:
model_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,bookId,title,series,author,description,isbn,genres,pages,...,Romance,tokens,university,relationships,break ups,divorce,wedding,death,family,friendship
0,0,0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,9780440000000.0,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",374,...,Romance,"[winning, means, fame, fortunelosing, means, c...",[],[],[],[],[],[death],[],[]
1,1,1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",There is a door at the end of a silent corrido...,9780440000000.0,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",870,...,,"[door, end, silent, corridor, haunting, harry,...",[],[],[],[],[],[],[],[friends]
2,2,2,2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,10000000000000.0,"['Classics', 'Fiction', 'Historical Fiction', ...",324,...,,"[unforgettable, novel, childhood, sleepy, sout...",[],[],[],[],[],[],[],[]
3,3,3,1885.Pride_and_Prejudice,Pride and Prejudice,,"Jane Austen, Anna Quindlen (Introduction)",Alternate cover edition of ISBN 9780679783268S...,10000000000000.0,"['Classics', 'Fiction', 'Romance', 'Historical...",279,...,Romance,"[alternate, cover, edition, isbn, since, immed...",[],[],[],[],[],[],[],[]
4,4,4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,About three things I was absolutely positive.\...,9780320000000.0,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",501,...,Romance,"[three, things, absolutely, positivefirst, edw...",[],[],[],[],[],[],[],[]


## Next Steps - Model 1
* add bigrams & trigrams to tokens (nltk)
* revisit stopword removal
* remove coming of age category
* restructure moving category
* add family category (familial relationships, likely manual list)
* library optimization
* create programmatic evaluation set / test set
* similarity scoring

## Next Steps - Code-wise
* EDA (Amber)
* Model 2 discussion - Tuesday

## Next Steps - UI

In [17]:
#save results to csv
model_data.to_csv('results_model1v1.csv')

## Evaluation

In [15]:
#import eval set from s3
test_set = pd.read_csv('s3://book-data-ucb-capstone-s2022/100_random_sample - 100_random_sample.csv')

#reduce test set cols
test_set = test_set[['bookId', 'Life Event Categories', 'University_T', 'Relationships_T',
       'Break_ups_T', 'Divorce_T', 'Wedding_T', 'Moving_T', 'Coming_of_age_T',
       'Death_T', 'Family_T', 'Friendship_T']]

#join model results on test set
df_test_results = pd.merge(model_data, test_set, on='bookId', how='inner')
print(len(df_test_results))

100


In [16]:
df_test_results.to_csv('test_results_model1v1.csv')