# Recommender System for Diversity Personalization

In [None]:
# Import libraries
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

### Data analysis

In [None]:
# Load book data from csv
books = pd.read_csv('../data/books.csv')
books

In [None]:
books.columns

In [None]:
# books = books.drop(columns=['isbn', 'isbn13', 'image_url', 'small_image_url'])
# books

In [None]:
# Load ratings data from csv
ratings = pd.read_csv('../data/ratings.csv')
ratings

In [None]:
# Load book_tags data from csv
book_tags = pd.read_csv('../data/book_tags.csv')
book_tags

In [None]:
# Load tag data from csv
tags = pd.read_csv('../data/tags.csv')
tags

In [None]:
# Merge book_tags and tags 
tags_join = pd.merge(book_tags, tags, left_on='tag_id', right_on='tag_id', how='inner')
tags_join

In [None]:
# Sort by book
tags_join.sort_values(by=['goodreads_book_id'])

In [None]:
# Load to_read data from csv. This shows which book a user has tagged as 'to read'.
to_read = pd.read_csv('../data/to_read.csv')
to_read

In [None]:
# Merge tags_join and books
books_with_tags = pd.merge(books, tags_join, left_on='book_id', right_on='goodreads_book_id', how='inner')
books_with_tags

In [None]:
# Display non-truncated columns.
# set_option with None removes the column-width limit; reset_option only restores
# the default width and does not accept a value argument.
pd.set_option('display.max_colwidth', None)

In [None]:
# Store tags into the same book id row
temp_df = books_with_tags.groupby('book_id')['tag_name'].apply(' '.join).reset_index()
temp_df.head(5)

In [None]:
# Merge tag_names back into books
books = pd.merge(books, temp_df, left_on='book_id', right_on='book_id', how='inner')

In [None]:
books

In [None]:
books.columns

### Feature engineering and machine learning algorithm

* Use TfidfVectorizer to transform text to feature vectors
* Use Cosine Similarity to calculate numeric values that denote the similarity between books

In [None]:
# Use TfidfVectorizer to transform each book's tag text into TF-IDF feature vectors
# (unigrams and bigrams, English stop words removed).
# min_df=1 keeps every term, exactly like the previous min_df=0 (a term always occurs
# in at least one document), but recent scikit-learn releases reject an integer
# min_df below 1 during parameter validation.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(books['tag_name'])

In [None]:
tfidf_matrix

In [None]:
# Use Cosine Similarity to calculate numeric values that denote the similarity between books.
# linear_kernel computes plain dot products between rows; this equals cosine similarity
# here on the assumption that the TF-IDF rows are L2-normalised (TfidfVectorizer's
# documented default, norm='l2' -- the cell above does not override it).
# The result is a dense (n_books x n_books) matrix.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

In [None]:
# Build 1-dimensional Series with the book titles and the joined tag strings
titles = books['title']
tag_name = books['tag_name']
# Lookup from book title to its label in books.index; tags_recommendations uses the
# looked-up value to select a row of cosine_sim
indices = pd.Series(books.index, index=books['title'])

# Function that gets book recommendations based on the cosine similarity score of book tags
def tags_recommendations(title, top_n=9):
    """Return the books whose tags are most similar to those of ``title``.

    Reads the module-level ``indices``, ``cosine_sim``, ``titles`` and
    ``tag_name`` objects built in the cells above.

    Parameters
    ----------
    title : str
        Title of the target book; must be a key of ``indices``.
    top_n : int, default 9
        How many recommendations to return (the original cell returned 9).

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``similarity`` and ``tag_name``, indexed by the
        position of each recommended book, sorted by descending similarity.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # Several books can share one title; the lookup then yields a Series.
    # Use the first match so that a single row of cosine_sim is selected.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Drop the target book by position instead of assuming it sorts first:
    # another book with identical tags ties at the top and may come before it.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    sim_scores = sim_scores[:top_n]  # How many results to display

    book_indices = [i for i, _ in sim_scores]
    title_df = pd.DataFrame({'title': titles.iloc[book_indices].tolist(),
                             'similarity': [score for _, score in sim_scores],
                             'tag_name': tag_name.iloc[book_indices].tolist()},
                            index=book_indices)
    return title_df

# Function that gets book tags and tag-overlap stats for the recommendations of a book
def recommend_stats(target_book_title):
    """Return the recommendations for a book together with tag-overlap statistics.

    Calls ``tags_recommendations`` and reads the module-level
    ``books_with_tags`` frame (one row per book/tag pair).

    Parameters
    ----------
    target_book_title : str
        Title of the book to get recommendations for.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``tags_recommendations`` merged on ``title``
        with two extra columns: ``total_tags`` (number of tag rows of the
        recommended book) and ``%_common_tags`` (share of those tags that
        the target book also has, in percent).
    """
    # Get recommended books
    rec_df = tags_recommendations(target_book_title)

    # Tags of the target book, turned into a set once instead of once per recommendation
    target_tags = set(
        books_with_tags.loc[books_with_tags['title'] == target_book_title, 'tag_name']
    )

    # Create dictionary of tag statistics by book title
    tags_stats = {}
    for title in rec_df['title'].tolist():
        rec_tags = books_with_tags.loc[books_with_tags['title'] == title, 'tag_name'].to_list()
        total = len(rec_tags)
        # Tags in recommended book that are also in target book
        same_tags = target_tags.intersection(rec_tags)
        tags_stats[title] = {
            'total_tags': total,
            # Guard against a title without any tag rows instead of dividing by zero
            '%_common_tags': (len(same_tags) / total) * 100 if total else 0.0,
        }

    # Convert dictionary to dataframe
    tags_stats_df = (
        pd.DataFrame.from_dict(tags_stats, orient='index')
        .reset_index()
        .rename(columns={'index': 'title'})
    )

    # Merge tag statistics dataframe to recommended books dataframe
    all_stats_df = pd.merge(rec_df, tags_stats_df, on='title')
    return all_stats_df

In [None]:
# Displays all rows without truncating
# pd.set_option('display.max_rows', None)

# Display all columns with/without truncating (use "set" or "reset")
pd.reset_option('display.max_colwidth')

In [None]:
lor_recs = recommend_stats('The Fellowship of the Ring (The Lord of the Rings, #1)')

In [None]:
lor_recs

In [None]:
lor_recs.describe()

In [None]:
lor_recs.boxplot(column=['similarity'])
plt.show()

In [None]:
lor_recs.boxplot(column=['%_common_tags'])
plt.show()

### Reverse engineer

In [None]:
# Select rows with similarity values between 0.035 to 0.045 and % common tags between 20 to 25
# Zero to One: similarity 0.040449, % common tags 23

# lor_recs_filter = lor_recs.loc[(lor_recs['similarity'] >= 0.010575) & (lor_recs['similarity'] <= 0.045) & (lor_recs['%_common_tags'] >= 4) & (lor_recs['%_common_tags'] <= 25)]
# lor_recs_filter

In [None]:
# Merge recommendations with the book metadata (inner join on title)
recs_merge = pd.merge(books, lor_recs, left_on='title', right_on='title', how='inner')
recs_merge

In [None]:
recs_merge = recs_merge.drop(columns=['best_book_id', 'work_id', 'isbn', 'isbn13', 'image_url', 'small_image_url', 'tag_name_y'])
recs_merge

In [None]:
# Keep only recommendations whose tags are unlike the target book's most frequent tags: the tag string must contain all of 'non-fiction', 'economics' and 'entrepreneurial'
lor_recs_filter = recs_merge[(recs_merge['tag_name_x'].str.contains('non-fiction')) & (recs_merge['tag_name_x'].str.contains('economics')) & (recs_merge['tag_name_x'].str.contains('entrepreneurial'))] 

In [None]:
# pd.reset_option('display.max_colwidth')
pd.set_option('display.max_rows', 400)
lor_recs_filter = lor_recs_filter.sort_values(by=['average_rating'], ascending=False)
lor_recs_filter

### Explore the book data

In [None]:
# Book tags for Lord of the Rings
lord_of_the_rings_tags = pd.DataFrame(books_with_tags[books_with_tags['title']=='The Fellowship of the Ring (The Lord of the Rings, #1)']['tag_name'])
lord_of_the_rings_tags

In [None]:
# Find most frequent word used in the Lord of the Rings tags
import nltk

top_N = 100
# Lower-case the tags, replace hyphens with spaces (literal match, independent of the
# pandas regex default) and join everything into one string for tokenising
txt = lord_of_the_rings_tags.tag_name.str.lower().str.replace('-', ' ', regex=False).str.cat(sep=' ')
words = nltk.tokenize.word_tokenize(txt)
word_dist = nltk.FreqDist(words)

# A set makes the membership test constant-time while filtering the tokens
stopwords = set(nltk.corpus.stopwords.words('english'))
words_except_stop_dist = nltk.FreqDist(w for w in words if w not in stopwords)
print('All frequencies, including STOPWORDS:')
print('=' * 60)
lor_rslt = pd.DataFrame(word_dist.most_common(top_N),
                    columns=['Word', 'Frequency'])
print(lor_rslt)
print('=' * 60)
# From here on lor_rslt holds the frequencies WITHOUT stop words, indexed by word
lor_rslt = pd.DataFrame(words_except_stop_dist.most_common(top_N),
                    columns=['Word', 'Frequency']).set_index('Word')
matplotlib.style.use('ggplot')

lor_rslt.plot.bar(rot=0)
plt.show()

In [None]:
# Show the Lord of the Rings word frequencies computed in the previous cell.
# (zero_rslt is only defined further down, so referencing it here fails on a
# top-to-bottom run.)
lor_rslt

In [None]:
# Most frequent words across the tags of all recommended books
top_N = 50

# Lower-case the tags, replace hyphens with spaces (literal match, independent of the
# pandas regex default) and join everything into one string for tokenising
txt = lor_recs.tag_name.str.lower().str.replace('-', ' ', regex=False).str.cat(sep=' ')
words = nltk.tokenize.word_tokenize(txt)
word_dist = nltk.FreqDist(words)

# A set makes the membership test constant-time while filtering the tokens
stopwords = set(nltk.corpus.stopwords.words('english'))
words_except_stop_dist = nltk.FreqDist(w for w in words if w not in stopwords)

print('All frequencies, including STOPWORDS:')
print('=' * 60)
rslt = pd.DataFrame(word_dist.most_common(top_N),
                    columns=['Word', 'Frequency'])
print(rslt)
print('=' * 60)

# From here on rslt holds the frequencies WITHOUT stop words, indexed by word
rslt = pd.DataFrame(words_except_stop_dist.most_common(top_N),
                    columns=['Word', 'Frequency']).set_index('Word')

matplotlib.style.use('ggplot')

rslt.plot.bar(rot=0)
plt.show()

In [None]:
lord_of_the_rings_tags.tag_name.mode()

In [None]:
# Book tags for Zero to One
zero_to_one_tags = pd.DataFrame(books_with_tags[books_with_tags['title']=='Zero to One: Notes on Startups, or How to Build the Future']['tag_name'])
zero_to_one_tags

In [None]:
# Find most frequent word used in the Zero to One tags
import nltk

top_N = 100
# Lower-case the tags, replace hyphens with spaces (literal match, independent of the
# pandas regex default) and join everything into one string for tokenising
txt = zero_to_one_tags.tag_name.str.lower().str.replace('-', ' ', regex=False).str.cat(sep=' ')
words = nltk.tokenize.word_tokenize(txt)
word_dist = nltk.FreqDist(words)

# A set makes the membership test constant-time while filtering the tokens
stopwords = set(nltk.corpus.stopwords.words('english'))
words_except_stop_dist = nltk.FreqDist(w for w in words if w not in stopwords)
print('All frequencies, including STOPWORDS:')
print('=' * 60)
zero_rslt = pd.DataFrame(word_dist.most_common(top_N),
                    columns=['Word', 'Frequency'])
print(zero_rslt)
print('=' * 60)
# From here on zero_rslt holds the frequencies WITHOUT stop words, indexed by word
zero_rslt = pd.DataFrame(words_except_stop_dist.most_common(top_N),
                    columns=['Word', 'Frequency']).set_index('Word')
matplotlib.style.use('ggplot')

zero_rslt.plot.bar(rot=0)
plt.show()

In [None]:
# Concat 

df2_concat = pd.concat([lor_rslt,zero_rslt])
df2_concat

In [None]:
# Book tags for another book ('Bossypants')
startup_tags = pd.DataFrame(books_with_tags[books_with_tags['title']=='Bossypants']['tag_name'])
startup_tags

In [None]:
# Check commonalities and differences within each
# False = different
# True = common
lord_of_the_rings_tags['tag_name'].isin(zero_to_one_tags['tag_name']).value_counts(normalize=True)

In [None]:
# Concat the two dataframes

df_concat = pd.concat([lord_of_the_rings_tags,zero_to_one_tags])
df_concat

In [None]:
df_concat[df_concat.duplicated(['tag_name'], keep=False)]

In [None]:
# Check commonalities and differences within each
# False = different
# True = common
lord_of_the_rings_tags['tag_name'].isin(startup_tags['tag_name']).value_counts(normalize=True)

In [None]:
# Concatenate the two dataframes
df_diff = pd.concat([lord_of_the_rings_tags,zero_to_one_tags]).drop_duplicates(keep=False)
pd.set_option('display.max_rows', 160)
df_diff

In [None]:
# Tags that belong to exactly one of the two books (symmetric difference on tag_name).
# Both frames are row slices of books_with_tags for different titles, so their row
# labels do not overlap and a symmetric difference of the indexes would keep every
# row; the comparison has to be made on the tag values themselves.
lor_only = lord_of_the_rings_tags[~lord_of_the_rings_tags['tag_name'].isin(zero_to_one_tags['tag_name'])]
zero_only = zero_to_one_tags[~zero_to_one_tags['tag_name'].isin(lord_of_the_rings_tags['tag_name'])]
df_diff = pd.concat([lor_only, zero_only])
df_diff.tail(60)

### Find opposite of a word

In [None]:
import nltk 
from nltk.corpus import wordnet 

In [None]:
# Gather every WordNet lemma of 'nice' once, then derive both lists from it:
# each lemma name counts as a synonym, and a lemma that has antonyms
# contributes its first antonym.
nice_lemmas = [lemma for synset in wordnet.synsets('nice') for lemma in synset.lemmas()]
synonyms = [lemma.name() for lemma in nice_lemmas]
antonyms = [lemma.antonyms()[0].name() for lemma in nice_lemmas if lemma.antonyms()]

print(set(antonyms))

In [None]:
# Checking antonyms for the word "book"
from nltk.corpus import wordnet
antonyms = []
# Walk every synset of the word and every lemma inside it
for syn in wordnet.synsets("book"):
    for lm in syn.lemmas():
        if lm.antonyms():
            antonyms.append(lm.antonyms()[0].name())  # keep only the first antonym of the lemma
# Print the distinct antonyms that were found
print(set(antonyms))

### Filter for opposite of most frequent tag

In [None]:
# Displays all rows without truncating
pd.set_option('display.max_rows', None)
# pd.reset_option('display.max_rows', None)

pd.DataFrame(books_with_tags[books_with_tags['title']=='The Complete Guide to Middle-Earth']['tag_name'])

In [None]:
lor_recs_filter_business = lor_recs[lor_recs['tag_name'].str.contains("nonfiction")]
lor_recs_filter_business

In [None]:
newdf = lor_recs[lor_recs['tag_name'] != 'fantasy'