# Model #1 Synonym & Library Extraction - Exact Match

In [1]:
import s3fs
import boto3
import pandas as pd
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')

import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ec2-user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Pull in Data From S3 Bucket

In [2]:
# csv file
#df_books_filtered = pd.read_csv('s3://ec2-jupyter-notebook-us-west-2-8c94c42abbd5478ca9a1a477613965a7/books_filtered.csv')
#text_data = pd.read_csv('s3://book-data-ucb-capstone-s2022/books_filtered_colsreduced.csv')
#print('Total Text DF size:', len(text_data))

#model_data = pd.read_csv('s3://book-data-ucb-capstone-s2022/tokenized_data.csv')
#print('Tokenized Text DF Size:', len(model_data))

In [20]:
model_data = pd.read_csv('LDA_test.csv')
model_data = model_data.drop(columns = ['university', 'relationships', 'break ups', 'divorce', 'weddings', 
               'family', 'friendship', 'death'])
model_data

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,index,Unnamed: 0,bookId,title,series,author,rating,description,...,History,Dystopia,Poetry,Biography,Manga,Thriller,Graphic Novels,Romance,labeled?,Contains True?
0,0,0,0,39822,34838660-not-part-of-the-plan,Not Part of the Plan,Blue Moon #4,Lucy Score (Goodreads Author),4.46,From the Wall Street Journal and #1 Amazon bes...,...,,,,,,,,Romance,Yes,1.0
1,1,1,1,34235,20176552-dragon-age-volume-1,"Dragon Age, Volume 1",Dragon Age Graphic Novels #1-3,"David Gaider, Chad Hardin (Illustrator), Antho...",4.26,Helping set the stage for BioWare's hotly anti...,...,,,,,,,Graphic Novels,,Yes,0.0
2,2,2,2,27904,124110.Dangerous_to_Know,Dangerous to Know,,Barbara Taylor Bradford (Goodreads Author),3.73,"Sebastian Locke, the fifty-six-year-old patria...",...,,,,,,,,Romance,Yes,1.0
3,3,3,3,10515,1046450.The_Wheel_of_Fortune,The Wheel of Fortune,,Susan Howatch,4.11,"""Take me back to Oxmoon, the lost paradise of ...",...,,,,,,,,Romance,Yes,1.0
4,4,4,4,935,872333.Blue_Bloods,Blue Bloods,Blue Bloods #1,Melissa de la Cruz (Goodreads Author),3.69,"When the Mayflower set sail in 1620, it carrie...",...,,,,,,,,Romance,Yes,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,995,995,17361,588326.The_Blue_Helmet,The Blue Helmet,,William Bell,3.42,Lee wants to be a Tarantula – a member of the ...,...,,,,,,,,,,1.0
996,996,996,996,9029,93007.The_Merry_Adventures_of_Robin_Hood,The Merry Adventures of Robin Hood,,Howard Pyle,4.07,The Merry Adventures of Robin Hood of Great Re...,...,,,,,,,,,,0.0
997,997,997,997,32216,1085376.Before_You_Sleep,Before You Sleep,,"Linn Ullmann, Tiina Nunnally (Translator)",3.34,Moving from present-day Oslo to Brooklyn in th...,...,,,,,,,,,,1.0
998,998,998,998,1036,28195.Inkspell,Inkspell,Inkworld #2,"Cornelia Funke (Goodreads Author), Anthea Bell...",3.91,"The captivating sequel to INKHEART, the critic...",...,,,,,,,,,,0.0


## Preprocessing
Skip this section if using preprocessed dataset

In [21]:
def preprocess_text(text):

  #lowercase text
  text_preprocessed = text.lower()
  #remove punctuation
  text_preprocessed = re.sub(r'[^a-zA-Z ]+', '', text_preprocessed)
  #tokenize for stopword removal
  text_preprocessed = word_tokenize(text_preprocessed)
  #remove stopwords
  text_preprocessed = [word for word in text_preprocessed if word not in stopwords.words('english')]
  #join to make string again
  #text_preprocessed = (" ").join(text_preprocessed)

  return text_preprocessed

In [22]:
%%time
model_data['tokens'] = model_data['description'].apply(lambda x: preprocess_text(x))

CPU times: user 23.6 s, sys: 1.82 s, total: 25.4 s
Wall time: 25.4 s


In [23]:
model_data['tokens'].head()

0    [wall, street, journal, amazon, bestselling, a...
1    [helping, set, stage, biowares, hotly, anticip...
2    [sebastian, locke, fiftysixyearold, patriarch,...
3    [take, back, oxmoon, lost, paradise, childhood...
4    [mayflower, set, sail, carried, board, men, wo...
Name: tokens, dtype: object

In [13]:
#save dataset to csv
#model_data.to_csv('tokenized_data.csv')
#upload dataset to s3 & google drive

## Synsets

In [24]:
def create_synsets(event):
  
  synonym = [] 
    
  for synset in wordnet.synsets(event): 
      for i in synset.lemmas(): 
          synonym.append(i.name()) # add all the synonyms available 
    
  return synonym

In [25]:
#creating library dataframe


life_events = ['university', 'relationships', 'break ups', 'divorce', 'weddings', 
               'family', 'friendship', 'death']

#create synsets for select events where decent synsets exist
relationship_list = create_synsets('go_steady') + ['relationship', 'kinship']
wedding_list = create_synsets('wedding')
family_list = create_synsets('family')
friendship_list = create_synsets('friendship')

#replace underscore (_) with space
relationship_list = [i.replace("_", " ") for i in relationship_list]
wedding_list = [i.replace("_", " ") for i in wedding_list]
family_list = [i.replace("_", " ") for i in family_list]
friendship_list = [i.replace("_", " ") for i in friendship_list]

synsets = [['college', 'university', 'campus', 'academia'], 
           relationship_list, 
           ['breakup', 'break up', 'split', 'split up', 'broken up', 'dumped'], 
           ['divorce', 'divorced', 'separate', 'separated'], 
           wedding_list, family_list, friendship_list,
           ['death', 'decease', 'deceased', 'dying']]

# Create the pandas DataFrame with column name is provided explicitly
df_lib = pd.DataFrame(life_events, columns=['life_event'])
df_lib['synsets'] = synsets
 
# print dataframe.
df_lib

Unnamed: 0,life_event,synsets
0,university,"[college, university, campus, academia]"
1,relationships,"[go steady, go out, date, see, relationship, k..."
2,break ups,"[breakup, break up, split, split up, broken up..."
3,divorce,"[divorce, divorced, separate, separated]"
4,weddings,"[wedding, wedding ceremony, nuptials, hymeneal..."
5,family,"[family, household, house, home, menage, famil..."
6,friendship,"[friendship, friendly relationship]"
7,death,"[death, decease, deceased, dying]"


## Match Tokens to Libraries

In [26]:
def match_tokens_to_libraries(df_syn, life_event, df_model, token_col):
    
    #match list of synsets to events
    syn_list = df_syn['synsets'][df_syn.life_event==life_event].item()
    #print(syn_list)
    #pull out synset matches
    df_model[life_event] = df_model[token_col].apply(lambda x: [item for item in x if item in syn_list])
    #dedupe events
    #df_model[life_event] = df_model[life_event].apply(lambda x: list(set(x)))
    df_model[life_event] = df_model[life_event].apply(lambda x: bool(x))
    
    return df_model[life_event]
    

In [27]:
#test one example
model_data['university'] = match_tokens_to_libraries(df_lib, 'university', model_data, 'tokens')
model_data['university'].value_counts()

False    973
True      27
Name: university, dtype: int64

In [28]:
#apply function for all life events
life_events = ['university', 'relationships', 'break ups', 'divorce', 'weddings', 
               'family', 'friendship', 'death']

for event in life_events:
    print(event)
    model_data[event] = match_tokens_to_libraries(df_lib, event, model_data, 'tokens')

university
relationships
break ups
divorce
weddings
family
friendship
death


### Check Events

In [29]:
model_data

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,index,Unnamed: 0,bookId,title,series,author,rating,description,...,Contains True?,tokens,university,relationships,break ups,divorce,weddings,family,friendship,death
0,0,0,0,39822,34838660-not-part-of-the-plan,Not Part of the Plan,Blue Moon #4,Lucy Score (Goodreads Author),4.46,From the Wall Street Journal and #1 Amazon bes...,...,1.0,"[wall, street, journal, amazon, bestselling, a...",False,False,False,False,False,True,True,False
1,1,1,1,34235,20176552-dragon-age-volume-1,"Dragon Age, Volume 1",Dragon Age Graphic Novels #1-3,"David Gaider, Chad Hardin (Illustrator), Antho...",4.26,Helping set the stage for BioWare's hotly anti...,...,0.0,"[helping, set, stage, biowares, hotly, anticip...",False,False,False,False,False,False,False,False
2,2,2,2,27904,124110.Dangerous_to_Know,Dangerous to Know,,Barbara Taylor Bradford (Goodreads Author),3.73,"Sebastian Locke, the fifty-six-year-old patria...",...,1.0,"[sebastian, locke, fiftysixyearold, patriarch,...",False,False,False,True,False,True,False,True
3,3,3,3,10515,1046450.The_Wheel_of_Fortune,The Wheel of Fortune,,Susan Howatch,4.11,"""Take me back to Oxmoon, the lost paradise of ...",...,1.0,"[take, back, oxmoon, lost, paradise, childhood...",False,False,False,False,False,True,False,False
4,4,4,4,935,872333.Blue_Bloods,Blue Bloods,Blue Bloods #1,Melissa de la Cruz (Goodreads Author),3.69,"When the Mayflower set sail in 1620, it carrie...",...,0.0,"[mayflower, set, sail, carried, board, men, wo...",False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,995,995,17361,588326.The_Blue_Helmet,The Blue Helmet,,William Bell,3.42,Lee wants to be a Tarantula – a member of the ...,...,1.0,"[lee, wants, tarantula, member, biggest, power...",False,False,False,False,False,False,True,True
996,996,996,996,9029,93007.The_Merry_Adventures_of_Robin_Hood,The Merry Adventures of Robin Hood,,Howard Pyle,4.07,The Merry Adventures of Robin Hood of Great Re...,...,0.0,"[merry, adventures, robin, hood, great, renown...",False,False,False,False,True,False,False,False
997,997,997,997,32216,1085376.Before_You_Sleep,Before You Sleep,,"Linn Ullmann, Tiina Nunnally (Translator)",3.34,Moving from present-day Oslo to Brooklyn in th...,...,1.0,"[moving, presentday, oslo, brooklyn, sleep, te...",False,False,False,False,True,True,False,False
998,998,998,998,1036,28195.Inkspell,Inkspell,Inkworld #2,"Cornelia Funke (Goodreads Author), Anthea Bell...",3.91,"The captivating sequel to INKHEART, the critic...",...,0.0,"[captivating, sequel, inkheart, critically, ac...",False,False,False,False,False,False,False,False


## Next Steps - Model 1
* add bigrams & trigrams to tokens (nltk)
* revisit stopword removal
* remove coming of age category
* restructure moving category
* add family category (familial relationships, likely manual list)
* library optimization
* create programmatic evaluation set / test set
* similarity scoring

## Next Steps - Code-wise
* EDA (Amber)
* Model 2 discussion - Tuesday

## Next Steps - UI

In [30]:
#save results to csv
model_data.to_csv('1000results_model1v2.csv')