# Model #1 Synonym & Library Extraction - Exact Match

In [1]:
import s3fs
import boto3
import pandas as pd
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')

import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Pull in Data From S3 Bucket

In [2]:
# csv file
#df_books_filtered = pd.read_csv('s3://ec2-jupyter-notebook-us-west-2-8c94c42abbd5478ca9a1a477613965a7/books_filtered.csv')
text_data = pd.read_csv('s3://book-data-ucb-capstone-s2022/books_filtered_colsreduced.csv')
print('Total Text DF size:', len(text_data))

model_data = pd.read_csv('s3://book-data-ucb-capstone-s2022/tokenized_data.csv')
print('Tokenized Text DF Size:', len(model_data))

Total Text DF size: 29652
Tokenized Text DF Size: 29652


In [3]:
model_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,bookId,title,series,author,description,isbn,genres,pages,...,Childrens,New Adult,Fantasy,History,Dystopia,Manga,Thriller,Graphic Novels,Romance,tokens
0,0,0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,9780440000000.0,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",374,...,,,Fantasy,,Dystopia,,,,Romance,"{'sentence', 'america', 'take', 'life', 'conte..."
1,1,1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",There is a door at the end of a silent corrido...,9780440000000.0,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",870,...,Childrens,,Fantasy,,,,,,,"{'strength', 'lot', 'ordinary', 'end', 'things..."
2,2,2,2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,10000000000000.0,"['Classics', 'Fiction', 'Historical Fiction', ...",324,...,,,,,,,,,,"{'published', 'experience', 'roots', 'classicc..."
3,3,3,1885.Pride_and_Prejudice,Pride and Prejudice,,"Jane Austen, Anna Quindlen (Introduction)",Alternate cover edition of ISBN 9780679783268S...,10000000000000.0,"['Classics', 'Fiction', 'Romance', 'Historical...",279,...,,,,,,,,,Romance,"{'austens', 'wit', 'cover', 'austen', 'popular..."
4,4,4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,About three things I was absolutely positive.\...,9780320000000.0,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",501,...,,,Fantasy,,,,,,Romance,"{'positivefirst', 'himand', 'things', 'part', ..."


## Preprocessing

In [17]:
def preprocess_text(text):

  #lowercase text
  text_preprocessed = text.lower()
  #remove punctuation
  text_preprocessed = re.sub(r'[^a-zA-Z ]+', '', text_preprocessed)
  #tokenize for stopword removal
  text_preprocessed = word_tokenize(text_preprocessed)
  #remove stopwords
  text_preprocessed = [word for word in text_preprocessed if word not in stopwords.words('english')]
  #join to make string again
  #text_preprocessed = (" ").join(text_preprocessed)

  return text_preprocessed

In [18]:
%%time
model_data['tokens'] = model_data['description'].apply(lambda x: preprocess_text(x))

CPU times: user 5min 14s, sys: 22.9 s, total: 5min 37s
Wall time: 5min 37s


In [19]:
model_data['tokens'].head()

0    [winning, means, fame, fortunelosing, means, c...
1    [door, end, silent, corridor, haunting, harry,...
2    [unforgettable, novel, childhood, sleepy, sout...
3    [alternate, cover, edition, isbn, since, immed...
4    [three, things, absolutely, positivefirst, edw...
Name: tokens, dtype: object

In [13]:
#save dataset to csv
model_data.to_csv('tokenized_data.csv')
#download & send to Amber

In [8]:
from rake_nltk import Rake
r = Rake()
keyword_col = []
for d in model_data['description']:
  my_text = d
  r.extract_keywords_from_text(my_text)
  keywordList = []
  rankedList = r.get_ranked_phrases_with_scores()
  for keyword in rankedList:
    keyword_updated = keyword[1].split()
    keyword_updated_string = " ".join(keyword_updated[:2])
    keywordList.append(keyword_updated_string)
    if(len(keywordList)>9):
      break
  keyword_col.append(set(keywordList))
model_data['RAKE'] = keyword_col

In [9]:
model_data.head()

Unnamed: 0.1,Unnamed: 0,bookId,title,series,author,description,isbn,genres,pages,publishDate,...,Childrens,New Adult,Fantasy,History,Dystopia,Manga,Thriller,Graphic Novels,Romance,RAKE
0,0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,9780440000000.0,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",374,9/14/2008,...,,,Fantasy,,Dystopia,,,,Romance,"{old katniss, without really, shining capitol,..."
1,1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",There is a door at the end of a silent corrido...,9780440000000.0,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",870,9/28/2004,...,Childrens,,Fantasy,,,,,,,"{unbearable sacrifice, silent corridor, haunti..."
2,2,2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,10000000000000.0,"['Classics', 'Fiction', 'Historical Fiction', ...",324,5/23/2006,...,,,,,,,,,,"{18 million, young alabama, winning film, slee..."
3,3,1885.Pride_and_Prejudice,Pride and Prejudice,,"Jane Austen, Anna Quindlen (Introduction)",Alternate cover edition of ISBN 9780679783268S...,10000000000000.0,"['Classics', 'Fiction', 'Romance', 'Historical...",279,10/10/2000,...,,,,,,,,,Romance,"{romantic clash, regency england, radiant wit,..."
4,4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,About three things I was absolutely positive.\...,9780320000000.0,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",501,9/6/2006,...,,,Fantasy,,,,,,Romance,"{deeply seductive, part might, love story, ext..."


## Libraries

In [36]:
def create_synsets(event):
  
  synonym = [] 
    
  for synset in wordnet.synsets(event): 
      for i in synset.lemmas(): 
          synonym.append(i.name()) # add all the synonyms available 
    
  return synonym

In [37]:
#creating library dataframe


life_events = ['university', 'relationships', 'break ups', 'divorce', 'wedding', 
               'moving', 'coming of age', 'death']

#create synsets for select events where decent synsets exist
relationship_list = create_synsets('go_steady') + ['relationship', 'kinship']
marriage_list = create_synsets('wedding') + create_synsets('marriage')

#replace underscore (_) with space
relationship_list = [i.replace("_", " ") for i in relationship_list]
marriage_list = [i.replace("_", " ") for i in marriage_list]

synsets = [['college', 'university', 'campus', 'academia'], 
           relationship_list, 
           ['breakup', 'break up', 'split', 'split up', 'broken up', 'dumped'], 
           ['divorce', 'divorced', 'separate', 'separated'], 
           marriage_list, 
           ['move', 'moving'], 
           ['coming of age', 'growing up'], 
           ['death', 'decease', 'deceased', 'dying']]

# Create the pandas DataFrame with column name is provided explicitly
df_lib = pd.DataFrame(life_events, columns=['life_event'])
df_lib['synsets'] = synsets
 
# print dataframe.
df_lib

Unnamed: 0,life_event,synsets
0,university,"[college, university, campus, academia]"
1,relationships,"[go steady, go out, date, see, relationship, k..."
2,break ups,"[breakup, break up, split, split up, broken up..."
3,divorce,"[divorce, divorced, separate, separated]"
4,wedding,"[wedding, wedding ceremony, nuptials, hymeneal..."
5,moving,"[move, moving]"
6,coming of age,"[coming of age, growing up]"
7,death,"[death, decease, deceased, dying]"


## Match Tokens to Libraries

In [52]:
def library_match(model_data, library_data):
  for index, row in library_data.iterrows(): #per life event
    LE = row['life_event']
    LE_words = row['synsets']
    LE_list = []
    for tokens in model_data['tokens']: #for each book 
        tokenss = set(tokens)
      #find set intersection (common words)
        matches = tokenss.intersection(LE_words)
        #matches = list(matches)
        LE_list.append(matches)
    model_data[LE] = LE_list

In [53]:
library_match(model_data, df_lib)

In [54]:
model_data

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,bookId,title,series,author,description,isbn,genres,pages,...,Romance,tokens,university,relationships,break ups,divorce,wedding,moving,coming of age,death
0,0,0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,9.78044E+12,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",374,...,Romance,"{'sentence', 'america', 'take', 'life', 'conte...",{},{},{},{},{},{},{},{}
1,1,1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",There is a door at the end of a silent corrido...,9.78044E+12,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",870,...,,"{'strength', 'lot', 'ordinary', 'end', 'things...",{},{},{},{},{},{},{},{}
2,2,2,2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,1E+13,"['Classics', 'Fiction', 'Historical Fiction', ...",324,...,,"{'published', 'experience', 'roots', 'classicc...",{},{},{},{},{},{},{},{}
3,3,3,1885.Pride_and_Prejudice,Pride and Prejudice,,"Jane Austen, Anna Quindlen (Introduction)",Alternate cover edition of ISBN 9780679783268S...,1E+13,"['Classics', 'Fiction', 'Romance', 'Historical...",279,...,Romance,"{'austens', 'wit', 'cover', 'austen', 'popular...",{},{},{},{},{},{},{},{}
4,4,4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,About three things I was absolutely positive.\...,9.78032E+12,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",501,...,Romance,"{'positivefirst', 'himand', 'things', 'part', ...",{},{},{},{},{},{},{},{}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29647,29647,52473,11492014-fractured,Fractured,Fateful #2,Cheri Schmidt (Goodreads Author),The Fateful Trilogy continues with Fractured. ...,2.94001E+12,"['Vampires', 'Paranormal', 'Young Adult', 'Rom...",0,...,Romance,"{'trilogy', 'continues', 'want', 'destroy', 'd...",{},{},{},{},{},{},{},{}
29648,29648,52474,11836711-anasazi,Anasazi,Sense of Truth #2,Emma Michaels,"'Anasazi', sequel to 'The Thirteenth Chime' by...",1E+13,"['Mystery', 'Young Adult']",190,...,,"{'david', 'desert', 'arrived', 'wants', 'cant'...",{},{},{},{},{},{},{},{}
29649,29649,52475,10815662-marked,Marked,Soul Guardians #1,Kim Richardson (Goodreads Author),--READERS FAVORITE AWARDS WINNER 2011--Sixteen...,9.78146E+12,"['Fantasy', 'Young Adult', 'Paranormal', 'Ange...",280,...,Romance,"{'hurtles', 'yearold', 'life', 'david', 'night...",{},{},{},{},{},{},{},{}
29650,29650,52476,11330278-wayward-son,Wayward Son,,"Tom Pollack (Goodreads Author), John Loftus (G...",A POWERFUL TREMOR UNEARTHS AN ANCIENT SECRETBu...,9.78145E+12,"['Fiction', 'Mystery', 'Historical Fiction', '...",507,...,,"{'suspense', 'relic', 'near', 'vesuvius', 'evi...",{},{},{},{},{},{},{},{}


In [26]:
#filter df to rows with non-empty sets
model_data['university'].value_counts()

{}    29652
Name: university, dtype: int64

In [8]:
print('hello world')

hello world


In [33]:
uni_set = ['college', 'university', 'campus', 'academia']
model_data['tokens_list'] = list(model_data['tokens'])
model_data['tokens_list'].head()

0    [winning, means, fame, fortunelosing, means, c...
1    [door, end, silent, corridor, haunting, harry,...
2    [unforgettable, novel, childhood, sleepy, sout...
3    [alternate, cover, edition, isbn, since, immed...
4    [three, things, absolutely, positivefirst, edw...
Name: tokens_list, dtype: object

In [27]:
model_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,bookId,title,series,author,description,isbn,genres,pages,...,Fantasy,History,Dystopia,Manga,Thriller,Graphic Novels,Romance,tokens,tokens_list,university
0,0,0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,9780440000000.0,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",374,...,Fantasy,,Dystopia,,,,Romance,"[winning, means, fame, fortunelosing, means, c...","{'sentence', 'america', 'take', 'life', 'conte...","[W, I, N, N, I, N, G, , M, E, A, N, S, , F, ..."
1,1,1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",There is a door at the end of a silent corrido...,9780440000000.0,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",870,...,Fantasy,,,,,,,"[door, end, silent, corridor, haunting, harry,...","{'strength', 'lot', 'ordinary', 'end', 'things...","[T, h, e, r, e, , i, s, , a, , d, o, o, r, ..."
2,2,2,2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,10000000000000.0,"['Classics', 'Fiction', 'Historical Fiction', ...",324,...,,,,,,,,"[unforgettable, novel, childhood, sleepy, sout...","{'published', 'experience', 'roots', 'classicc...","[T, h, e, , u, n, f, o, r, g, e, t, t, a, b, ..."
3,3,3,1885.Pride_and_Prejudice,Pride and Prejudice,,"Jane Austen, Anna Quindlen (Introduction)",Alternate cover edition of ISBN 9780679783268S...,10000000000000.0,"['Classics', 'Fiction', 'Romance', 'Historical...",279,...,,,,,,,Romance,"[alternate, cover, edition, isbn, since, immed...","{'austens', 'wit', 'cover', 'austen', 'popular...","[A, l, t, e, r, n, a, t, e, , c, o, v, e, r, ..."
4,4,4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,About three things I was absolutely positive.\...,9780320000000.0,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",501,...,Fantasy,,,,,,Romance,"[three, things, absolutely, positivefirst, edw...","{'positivefirst', 'himand', 'things', 'part', ...","[A, b, o, u, t, , t, h, r, e, e, , t, h, i, ..."


In [34]:
model_data['university'] = model_data['tokens'].apply(lambda x: [item for item in x if item in uni_set])

In [35]:
model_data['university'].value_counts()

[]                                                                    28620
[college]                                                               589
[university]                                                            206
[college, college]                                                       59
[campus]                                                                 41
[university, university]                                                 23
[college, university]                                                    19
[college, campus]                                                        18
[university, college]                                                    14
[university, campus]                                                      6
[campus, campus]                                                          6
[college, college, college]                                               6
[academia]                                                                5
[college, un