# Model #1 Synonym & Library Extraction - Exact Match

In [4]:
pip install 'rake-nltk'

Note: you may need to restart the kernel to use updated packages.


In [5]:
import s3fs
import boto3
import pandas as pd
import nltk
# Download the NLTK data packages this notebook relies on.
for _resource in ('wordnet', 'omw-1.4', 'stopwords', 'punkt'):
  nltk.download(_resource)

import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Pull in Data From S3 Bucket

In [6]:
# Both CSV inputs live in the same S3 bucket; keep its URI in one place.
S3_BUCKET_URI = 's3://ec2-jupyter-notebook-us-west-2-8c94c42abbd5478ca9a1a477613965a7'

# books_filtered.csv
df_books_filtered = pd.read_csv(f'{S3_BUCKET_URI}/books_filtered.csv')
# books_filtered_colsreduced.csv: the frame the rest of this notebook works on.
model_data = pd.read_csv(f'{S3_BUCKET_URI}/books_filtered_colsreduced.csv')

## Preprocessing

In [7]:
def preprocess_text(text):
  """Turn free text into a set of lowercase word tokens without English stop words.

  Args:
    text: The string to process. Anything that is not a ``str`` (for example
      the NaN pandas uses for an empty CSV cell) is treated as empty text.

  Returns:
    set: The distinct tokens that remain after lowercasing, replacing every
    run of characters other than ASCII letters and spaces with a space,
    tokenizing, and dropping English stop words.
  """
  if not isinstance(text, str):
    return set()

  # Build the stop-word set once per call instead of re-reading the list for
  # every token; set membership is also a constant-time check.
  english_stopwords = set(stopwords.words('english'))

  #lowercase text
  text_preprocessed = text.lower()
  #replace (rather than delete) punctuation, digits and newlines with a space so
  #words separated only by them ("fortune.losing") are not fused into one token
  text_preprocessed = re.sub(r'[^a-zA-Z ]+', ' ', text_preprocessed)
  #tokenize for stopword removal
  tokens = word_tokenize(text_preprocessed)

  #remove stopwords and de-duplicate
  return {word for word in tokens if word not in english_stopwords}

In [None]:
#model_data['tokens'] = model_data['description'].apply(lambda x: preprocess_text(x))

In [8]:
from rake_nltk import Rake

# Keep at most this many leading words of each RAKE phrase.
MAX_PHRASE_WORDS = 2
# Keep at most this many of the first-ranked phrases per description.
MAX_KEYWORDS = 10

r = Rake()
keyword_col = []
for d in model_data['description']:
  if not isinstance(d, str):
    # Non-text description (e.g. NaN for an empty CSV cell): store an empty
    # set so keyword_col stays aligned with the rows of model_data.
    keyword_col.append(set())
    continue
  r.extract_keywords_from_text(d)
  # Ranked list of (score, phrase) pairs; only the phrase is used.
  rankedList = r.get_ranked_phrases_with_scores()
  # Truncate each of the first MAX_KEYWORDS phrases to its leading words;
  # the set drops phrases that become identical after truncation.
  keyword_col.append({
      " ".join(phrase.split()[:MAX_PHRASE_WORDS])
      for _score, phrase in rankedList[:MAX_KEYWORDS]
  })
model_data['RAKE'] = keyword_col

In [9]:
# Preview the first rows of model_data, which now carries the 'RAKE' column.
model_data.head()

Unnamed: 0.1,Unnamed: 0,bookId,title,series,author,description,isbn,genres,pages,publishDate,...,Childrens,New Adult,Fantasy,History,Dystopia,Manga,Thriller,Graphic Novels,Romance,RAKE
0,0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,9780440000000.0,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",374,9/14/2008,...,,,Fantasy,,Dystopia,,,,Romance,"{old katniss, without really, shining capitol,..."
1,1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",There is a door at the end of a silent corrido...,9780440000000.0,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",870,9/28/2004,...,Childrens,,Fantasy,,,,,,,"{unbearable sacrifice, silent corridor, haunti..."
2,2,2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,10000000000000.0,"['Classics', 'Fiction', 'Historical Fiction', ...",324,5/23/2006,...,,,,,,,,,,"{18 million, young alabama, winning film, slee..."
3,3,1885.Pride_and_Prejudice,Pride and Prejudice,,"Jane Austen, Anna Quindlen (Introduction)",Alternate cover edition of ISBN 9780679783268S...,10000000000000.0,"['Classics', 'Fiction', 'Romance', 'Historical...",279,10/10/2000,...,,,,,,,,,Romance,"{romantic clash, regency england, radiant wit,..."
4,4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,About three things I was absolutely positive.\...,9780320000000.0,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",501,9/6/2006,...,,,Fantasy,,,,,,Romance,"{deeply seductive, part might, love story, ext..."


## Libraries

In [10]:
def create_synsets(event):
  """Collect the WordNet lemma names associated with a life-event term.

  Args:
    event: Word or phrase to look up, e.g. ``'Divorce'`` or ``'Coming of age'``.

  Returns:
    set: The name of every lemma of every synset found for ``event``, spelled
    exactly as WordNet returns it (multi-word lemmas use underscores, e.g.
    ``'split_up'``). Empty when WordNet has no entry for the term.
  """
  # WordNet keys multi-word entries with underscores rather than spaces, so
  # join the words of a phrase that way before the lookup. Single words are
  # passed through unchanged.
  lookup = "_".join(event.split())

  return {
      lemma.name()  # add all the synonyms available
      for synset in wordnet.synsets(lookup)
      for lemma in synset.lemmas()
  }

In [11]:
# Life events that make up the library, one row each.
data = [
    'College',
    'Relationships',
    'Divorce',
    'Death',
    'Wedding',
    'Stress',
    'Moving',
    'Coming of age',
    'Adulting',
]

# Single-column frame; the dict key supplies the column name.
df_lib = pd.DataFrame({'life_event': data})

# Display the frame as the cell output.
df_lib

Unnamed: 0,life_event
0,College
1,Relationships
2,Divorce
3,Death
4,Wedding
5,Stress
6,Moving
7,Coming of age
8,Adulting


In [12]:
# One set of WordNet lemma names per life event, in row order.
df_lib['words'] = [create_synsets(event) for event in df_lib['life_event']]

In [13]:
# Show the library together with its new 'words' column.
df_lib

Unnamed: 0,life_event,words
0,College,{college}
1,Relationships,"{family_relationship, relationship, human_rela..."
2,Divorce,"{dissociate, split_up, disunite, divorcement, ..."
3,Death,"{death, last, destruction, end, decease, Death..."
4,Wedding,"{wedding_ceremony, wedding_party, hook_up_with..."
5,Stress,"{punctuate, tenseness, accentuate, stress, ten..."
6,Moving,"{prompt, impress, motivate, move, moving, make..."
7,Coming of age,{}
8,Adulting,{}


## Match Tokens to Libraries

In [16]:
def _normalise_term(term):
  """Return `term` lowercased, with underscores and whitespace runs turned into single spaces."""
  return " ".join(str(term).replace("_", " ").lower().split())


def library_match(model_data, library_data):
  """Add one column per life event holding each book's matching RAKE phrases.

  Phrases and library words are compared on a normalised form (lowercase,
  underscores treated as spaces) because the two sides are spelled differently:
  the library holds WordNet lemma names such as ``'split_up'`` or ``'Death'``,
  which can never equal a lowercase, space-separated phrase byte-for-byte.

  Args:
    model_data: DataFrame with a ``'RAKE'`` column whose cells are iterables of
      keyword phrases (one per book). Modified in place.
    library_data: DataFrame with a ``'life_event'`` column (used as the name of
      the new column) and a ``'words'`` column of iterables of library terms.

  Returns:
    None. For every row of ``library_data`` a column named after its life
    event is written to ``model_data``; each cell is the set of that book's
    RAKE phrases (as originally spelled) found in the event's word list.
  """
  for _, row in library_data.iterrows():  # per life event
    normalised_words = {_normalise_term(word) for word in row['words']}
    model_data[row['life_event']] = [
        {phrase for phrase in tokens if _normalise_term(phrase) in normalised_words}
        for tokens in model_data['RAKE']  # for each book
    ]

In [17]:
# Adds one column per life event to model_data in place (nothing is returned).
library_match(model_data, df_lib)

In [18]:
# Inspect the result, including the per-life-event match columns.
model_data

Unnamed: 0.1,Unnamed: 0,bookId,title,series,author,description,isbn,genres,pages,publishDate,...,RAKE,College,Relationships,Divorce,Death,Wedding,Stress,Moving,Coming of age,Adulting
0,0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,9.78044E+12,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",374,9/14/2008,...,"{old katniss, without really, shining capitol,...",{},{},{},{},{},{},{},{},{}
1,1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",There is a door at the end of a silent corrido...,9.78044E+12,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",870,9/28/2004,...,"{unbearable sacrifice, silent corridor, haunti...",{},{},{},{},{},{},{},{},{}
2,2,2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,1E+13,"['Classics', 'Fiction', 'Historical Fiction', ...",324,5/23/2006,...,"{18 million, young alabama, winning film, slee...",{},{},{},{},{},{},{},{},{}
3,3,1885.Pride_and_Prejudice,Pride and Prejudice,,"Jane Austen, Anna Quindlen (Introduction)",Alternate cover edition of ISBN 9780679783268S...,1E+13,"['Classics', 'Fiction', 'Romance', 'Historical...",279,10/10/2000,...,"{romantic clash, regency england, radiant wit,...",{},{},{},{},{},{},{},{},{}
4,4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,About three things I was absolutely positive.\...,9.78032E+12,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",501,9/6/2006,...,"{deeply seductive, part might, love story, ext...",{},{},{},{},{},{},{},{},{}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29647,52473,11492014-fractured,Fractured,Fateful #2,Cheri Schmidt (Goodreads Author),The Fateful Trilogy continues with Fractured. ...,2.94001E+12,"['Vampires', 'Paranormal', 'Young Adult', 'Rom...",0,May 28th 2011,...,"{danielle break, sink deeper, unintentionally ...",{},{},{},{},{},{},{},{},{}
29648,52474,11836711-anasazi,Anasazi,Sense of Truth #2,Emma Michaels,"'Anasazi', sequel to 'The Thirteenth Chime' by...",1E+13,"['Mystery', 'Young Adult']",190,August 5th 2011,...,"{something happened, town claim, everyone want...",{},{},{},{},{},{},{},{},{}
29649,52475,10815662-marked,Marked,Soul Guardians #1,Kim Richardson (Goodreads Author),--READERS FAVORITE AWARDS WINNER 2011--Sixteen...,9.78146E+12,"['Fantasy', 'Young Adult', 'Paranormal', 'Ange...",280,March 18th 2011,...,"{ordinary life, old kara, guardian angel, -- r...",{},{},{},{},{},{},{},{},{}
29650,52476,11330278-wayward-son,Wayward Son,,"Tom Pollack (Goodreads Author), John Loftus (G...",A POWERFUL TREMOR UNEARTHS AN ANCIENT SECRETBu...,9.78145E+12,"['Fiction', 'Mystery', 'Historical Fiction', '...",507,September 1st 2011,...,"{original wayward, strange relic, notorious bi...",{},{},{},{},{},{},{},{},{}
