<a href="https://colab.research.google.com/github/banned-books/project_banned_books/blob/main/appendix/nmf_rec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Banned Book Metadata (Goodreads & PEN/American Library Association)

### Import Libraries 

In [None]:
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt

In [None]:
from numpy.linalg import norm
from nltk.corpus import stopwords
from warnings import simplefilter
from IPython.core.display import HTML
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer

### Model and Parameters 
As mentioned earlier, this notebook builds an NMF (non-negative matrix factorization) recommender system from the Goodreads & PEN/American Library Association data. We use an NMF model with 8 topics.


In [None]:
# English stop words that the TF-IDF vectorizer will drop.
stop_words = stopwords.words('english')
# Suppress every FutureWarning for the rest of the session.
simplefilter(action='ignore', category=FutureWarning)
# Number of topic clusters (NMF components) to extract.
n_topics = 8
# Fixed seed for the NMF estimator: without it the factorisation, and
# therefore the topics and recommendations, may differ between runs.
RANDOM_STATE = 42
# Unfitted NMF instance; it is fitted on the TF-IDF matrix later on.
nmf = NMF(n_components=n_topics, random_state=RANDOM_STATE)

## Data Loading

In [None]:
# Read the banned-books metadata CSV from the current working directory.
df = pd.read_csv(filepath_or_buffer='banned_books.csv')

In [None]:
# Show the first three rows as a sanity check of the loaded columns.
df.head(n=3)

Unnamed: 0,goodreads_image_url,title,author,goodreads_published_date,goodreads_description,goodreads_tags,type_of_ban,state,district,ban_date,origin_of_challenge,goodreads_product_url,amazon_url,secondary_authors,illustrators,translators
0,https://images-na.ssl-images-amazon.com/images...,Ace of Spades,"Àbíké-Íyímídé, Faridah",2021-06-01,An incendiary and utterly compelling thriller ...,"dark, lgbtqia+, black, young adult, gay romanc...",Banned in Libraries and Classrooms,Florida,Indian River County School District,November 2021,Administrator,https://www.goodreads.com/book/show/42603984-a...,https://www.amazon.com/Ace-Spades-Faridah-Abik...,,,
1,https://images-na.ssl-images-amazon.com/images...,Clap When You Land,"Acevedo, Elizabeth",2020-05-05,In a novel in verse that brims with grief and ...,"young adult, poetry, contemporary, fiction, au...",Banned in Classrooms,Pennsylvania,Central York School District,September 2021,Administrator,https://www.goodreads.com/book/show/52516332-c...,https://www.amazon.com/Clap-When-Land-Elizabet...,,,
2,https://images-na.ssl-images-amazon.com/images...,The Poet X,"Acevedo, Elizabeth",2018-03-06,Fans of Jacqueline Woodson Meg Medina and Ja...,"poetry, young adult, contemporary, fiction, au...",Banned in Libraries,Florida,Indian River County School District,November 2021,Administrator,https://www.goodreads.com/book/show/33294200-t...,https://www.amazon.com/Poet-X-Elizabeth-Aceved...,,,


### Text Pre-Processing

In [None]:
# One text document per book: the Goodreads description followed by its tags.
text_df = df[['title']].copy()
# Join with an explicit space so the last word of the description is not
# fused with the first tag into a single token. Missing values are replaced by
# an empty string first, because NaN + str is NaN and a NaN document cannot be
# vectorized.
text_df['description'] = (
    df['goodreads_description'].fillna('') + ' ' + df['goodreads_tags'].fillna('')
)
# Drop rows whose title and combined text are both identical.
text_df = text_df.drop_duplicates()

### Vectorization 

In [None]:
%time v = TfidfVectorizer(stop_words=stop_words, ngram_range=(2, 2), use_idf=True)
%time V = v.fit_transform(text_df['description'])

df_tf = pd.DataFrame(V.toarray(),
                     columns=v.vocabulary_,
                     index=text_df['title'])
#display(df_tf.head())

CPU times: user 19 µs, sys: 8 µs, total: 27 µs
Wall time: 27.9 µs
CPU times: user 323 ms, sys: 14.5 ms, total: 338 ms
Wall time: 337 ms


In [None]:
# Display the document-by-bigram TF-IDF frame (titles as the row index).
df_tf

Unnamed: 0_level_0,incendiary utterly,utterly compelling,compelling thriller,thriller shocking,shocking twist,twist delves,delves deep,deep heart,heart institutionalized,institutionalized racism,...,ready eat,eat birthday,birthday present,present special,special surprise,surprise waiting,waiting front,door picture,food africa,cooking realistic
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ace of Spades,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Clap When You Land,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Poet X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Poet X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Call Me By Your Name,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
The Garden of My Imaan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Read the World: Rethinking Literacy for Empathy and Action in a Digital Age,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
American Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Black Enough: Stories of Being Young & Black in America,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# v.idf_ is ordered by column index, so it has to be labelled with
# get_feature_names_out(). Iterating `vocabulary_` yields the terms in a
# different order and would pair each weight with the wrong bigram.
idf_df = pd.DataFrame(
    v.idf_,
    index=v.get_feature_names_out(),
    columns=['idf_weigths']
    )

# The greater the IDF, the rarer (more document-specific) the bigram.
# With TfidfVectorizer's default smooth_idf=True:
#   idf = ln((1 + n_documents) / (1 + document_frequency)) + 1
# in 1 of 1 documents   -> ln(2/2) + 1   = 1.00
# in 1 of 2 documents   -> ln(3/2) + 1   = 1.41
# in 1 of 10 documents  -> ln(11/2) + 1  = 2.70
# in 1 of 100 documents -> ln(101/2) + 1 = 4.92

# Lowest IDF first, i.e. the bigrams found in the most documents.
idf_df.sort_values(by=['idf_weigths']).head(10)

Unnamed: 0,idf_weigths
ramadan wear,1.712079
price finally,1.931837
frustration graceful,2.558145
feyre endeavors,2.933884
expect ben,2.985397
rehearsals environmentally,2.994247
america days,3.200861
problem anderson,3.20634
professional passionate,3.257056
count every,3.354101


In [None]:
# Fit the model to the tf_idf
%time nmf_features = nmf.fit_transform(V)

# normalize the features
%time norm_features = normalize(nmf_features)

CPU times: user 1.54 s, sys: 2.91 s, total: 4.45 s
Wall time: 827 ms
CPU times: user 535 µs, sys: 1.89 ms, total: 2.42 ms
Wall time: 498 µs


In [None]:
# Side-by-side shapes: the TF-IDF frame versus the NMF component matrix.
for _label, _shape in (
    ('Original df: ', df_tf.shape),
    ('NMF Processed df: ', nmf.components_.shape),
):
    print(_label, _shape)

Original df:  (1652, 109103)
NMF Processed df:  (8, 109103)


### Utility Functions

In [None]:
def create_clusters(df, show=8):
    """Collect the highest-valued columns of every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame; each row is ranked independently.
    show : int, default 8
        Number of largest entries to keep per row.

    Returns
    -------
    dict
        ``{row position: [{'q': column label, 'sim_score': value}, ...]}``,
        each list ordered from the largest value down.
    """
    clusters = {}
    for i in range(len(df)):
        # Positional access, so a frame whose index is not 0..n-1 still works.
        top_entries = df.iloc[i].nlargest(show)
        clusters[i] = [
            # Labels of single-level MultiIndex columns arrive as 1-tuples and
            # are unwrapped; plain labels are kept whole (label[0] on a string
            # would keep only its first character).
            {'q': label[0] if isinstance(label, tuple) else label,
             'sim_score': weight}
            for label, weight in top_entries.items()
        ]
    return clusters

In [None]:
def show_queries(df):
    """Return a copy of ``df`` in which every cell is replaced by its ``'q'`` entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are dicts containing a ``'q'`` key.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns, holding only the ``'q'`` values.

    The input frame is left untouched, so the calling cell can be re-run
    safely; the previous in-place version failed on a second run because the
    cells had already been turned into strings.
    """
    return df.apply(lambda column: column.map(lambda cell: cell['q']))

In [None]:
def cosine_metric(A,B):
    """Cosine similarity between two 1-D vectors.

    Parameters
    ----------
    A, B : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(A, B) / (|A| * |B|)``, or ``0.0`` when either vector has zero
        length (instead of dividing by zero and returning NaN).
    """
    denominator = norm(A) * norm(B)
    if denominator == 0:
        # A zero vector has no direction; treat it as similar to nothing.
        return 0.0
    return np.dot(A, B) / denominator

In [None]:
def create_recommender(norm_df):
    """Build a lookup of pairwise cosine similarities between the rows of ``norm_df``.

    Parameters
    ----------
    norm_df : pandas.DataFrame
        One numeric feature vector per row; the index holds the row labels.

    Returns
    -------
    dict
        ``{label: [(other_label, cosine_similarity), ...]}`` with one tuple per
        row of ``norm_df`` (including the row itself), in row order.

    All similarities come from a single matrix product instead of n*n
    Python-level row lookups. Rows that are entirely zero get similarity 0.0
    to everything rather than NaN.
    """
    labels = list(norm_df.index)
    vectors = norm_df.to_numpy(dtype=float)
    lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
    # Avoid 0/0 for all-zero rows; they stay zero after the division.
    lengths[lengths == 0] = 1.0
    unit_vectors = vectors / lengths
    similarity = unit_vectors @ unit_vectors.T

    recomender = {}
    for label, scores in zip(labels, similarity):
        # A repeated label overwrites the earlier entry, as dict.update did.
        recomender[label] = list(zip(labels, scores.tolist()))
    return recomender

In [None]:
def get_top_N_recomended(book_name,recomender=dict, num_recomendations=3):
    """Return the titles most similar to ``book_name``.

    Parameters
    ----------
    book_name : str
        Title to look up in ``recomender``.
    recomender : dict
        Mapping ``{title: [(other_title, similarity), ...]}``. It has to be
        passed explicitly; the default value is only a placeholder.
    num_recomendations : int, default 3
        Maximum number of recommendations to return.

    Returns
    -------
    list of tuple
        ``(title, similarity)`` pairs, highest similarity first, never
        containing ``book_name`` itself.

    Raises
    ------
    TypeError
        If no ``recomender`` mapping is supplied.
    KeyError
        If ``book_name`` is not a key of ``recomender``.
    """
    if recomender is dict:
        raise TypeError("recomender must be the mapping built by create_recommender")
    scores = recomender.get(book_name)
    if scores is None:
        raise KeyError(f"{book_name!r} is not a known title")
    ranked = sorted(scores, key=lambda pair: pair[1])[::-1]
    # Filter into a new list: removing items from a list while iterating over
    # it skips the element that follows each removal.
    others = [pair for pair in ranked if pair[0] != book_name]
    return others[:num_recomendations]

### Clustering Data

In [None]:
# Topic-by-term weights learned by NMF: one row per topic, one column per bigram.
# NOTE(review): the column Index is passed wrapped in a list, which appears to
# give single-level MultiIndex columns; create_clusters unwraps the labels
# with k[0], so the wrapping is kept as is.
components = pd.DataFrame(data=nmf.components_, columns=[df_tf.columns])
components

Unnamed: 0,incendiary utterly,utterly compelling,compelling thriller,thriller shocking,shocking twist,twist delves,delves deep,deep heart,heart institutionalized,institutionalized racism,...,ready eat,eat birthday,birthday present,present special,special surprise,surprise waiting,waiting front,door picture,food africa,cooking realistic
0,9.1e-05,0.000172,8.1e-05,9.1e-05,0.0,9.2e-05,1.3e-05,0.00018,2.8e-05,9.1e-05,...,0.001658,0.001658,0.001658,0.001658,0.001361,0.000143,0.000287,0.000143,0.001439,0.001361
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000107,0.000447,0.001714,0.000107,0.008867,0.000126,0.000104,0.0,4.9e-05,0.000107,...,0.0,0.0,0.0,0.0,0.0,0.003922,0.007843,0.003922,4.6e-05,0.0
3,1e-05,0.0,0.0,1e-05,0.0,1.4e-05,0.0,0.0,0.0,1e-05,...,4.7e-05,4.7e-05,4.7e-05,4.7e-05,0.0,0.00054,0.001079,0.00054,0.000563,0.0
4,6.6e-05,0.0,0.0,6.6e-05,0.0,0.0,0.0,0.0,0.0,6.6e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,5e-06,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000614,0.001228,0.000614,0.0,0.0
6,1.1e-05,0.000272,0.0,1.1e-05,0.0,4e-06,7.3e-05,0.002665,0.0,1.1e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,1.2e-05,0.0,1e-05,0.0,0.0,...,0.000296,0.000296,0.000296,0.000296,0.0,4.3e-05,8.7e-05,4.3e-05,7.2e-05,0.0


In [None]:
# For every topic keep its 8 highest-weighted terms (the function's default).
clusters = create_clusters(df=components, show=8)
clusters

{0: [{'q': 'ramadan wear', 'sim_score': 0.7298361975396181},
  {'q': 'rehearsals environmentally', 'sim_score': 0.522999063426831},
  {'q': 'price finally', 'sim_score': 0.3906209950631869},
  {'q': 'problem anderson', 'sim_score': 0.24310019336440544},
  {'q': 'professional passionate', 'sim_score': 0.22691454747548961},
  {'q': 'question colorful', 'sim_score': 0.21368979488844908},
  {'q': 'grade fiction', 'sim_score': 0.20762070877374222},
  {'q': 'together love', 'sim_score': 0.20679559999943925}],
 1: [{'q': 'life city', 'sim_score': 0.6295589197363509},
  {'q': 'weathers move', 'sim_score': 0.5078407076042809},
  {'q': 'talk home', 'sim_score': 0.46885796349986353},
  {'q': 'miss tillie', 'sim_score': 0.32915440725277934},
  {'q': 'whether growing', 'sim_score': 0.2776032817755634},
  {'q': 'lit erotic', 'sim_score': 0.22753326657194264},
  {'q': 'judge asks', 'sim_score': 0.22753326657194264},
  {'q': 'traces someone', 'sim_score': 0.22753326657194264}],
 2: [{'q': 'frustration

In [None]:
# One row per topic, named after its top-ranked term; the remaining columns
# hold the lower-ranked {'q', 'sim_score'} entries.
grouping = (
    pd.DataFrame(clusters).T
    .assign(topic=lambda frame: frame[0].apply(lambda x: x['q']))
    .drop(columns=0)
    .set_index('topic')
)
grouping.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ramadan wear,"{'q': 'rehearsals environmentally', 'sim_score...","{'q': 'price finally', 'sim_score': 0.39062099...","{'q': 'problem anderson', 'sim_score': 0.24310...","{'q': 'professional passionate', 'sim_score': ...","{'q': 'question colorful', 'sim_score': 0.2136...","{'q': 'grade fiction', 'sim_score': 0.20762070...","{'q': 'together love', 'sim_score': 0.20679559..."
life city,"{'q': 'weathers move', 'sim_score': 0.50784070...","{'q': 'talk home', 'sim_score': 0.468857963499...","{'q': 'miss tillie', 'sim_score': 0.3291544072...","{'q': 'whether growing', 'sim_score': 0.277603...","{'q': 'lit erotic', 'sim_score': 0.22753326657...","{'q': 'judge asks', 'sim_score': 0.22753326657...","{'q': 'traces someone', 'sim_score': 0.2275332..."
frustration graceful,"{'q': 'expect ben', 'sim_score': 0.47210794366...","{'q': 'ceiling defied', 'sim_score': 0.3543152...","{'q': 'price finally', 'sim_score': 0.21718416...","{'q': 'destroy life', 'sim_score': 0.184126283...","{'q': 'hold frustration', 'sim_score': 0.16317...","{'q': 'viewpoints many', 'sim_score': 0.146521...","{'q': 'historical historical', 'sim_score': 0...."
stardust dead,"{'q': 'author recent', 'sim_score': 0.26645846...","{'q': 'carla espana', 'sim_score': 0.266458462...","{'q': 'locks open', 'sim_score': 0.26645846205...","{'q': 'expectancy effects', 'sim_score': 0.266...","{'q': 'australia high', 'sim_score': 0.2604519...","{'q': 'amy diary', 'sim_score': 0.245688539527...","{'q': 'adam rapp', 'sim_score': 0.150680498711..."
characters reveal,"{'q': 'distracts mother', 'sim_score': 0.29316...","{'q': 'daruma doll', 'sim_score': 0.2655485951...","{'q': 'feyre endeavors', 'sim_score': 0.183991...","{'q': 'students never', 'sim_score': 0.1465802...","{'q': 'lgbtqia gay', 'sim_score': 0.1465802584...","{'q': 'gay fiction', 'sim_score': 0.1465802584...","{'q': 'discovery alek', 'sim_score': 0.1465802..."


In [None]:
# Keep only the query string of every {'q', 'sim_score'} cell.
clustered_queries = show_queries(df=grouping)
clustered_queries.head(n=5)

Unnamed: 0_level_0,1,2,3,4,5,6,7
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ramadan wear,rehearsals environmentally,price finally,problem anderson,professional passionate,question colorful,grade fiction,together love
life city,weathers move,talk home,miss tillie,whether growing,lit erotic,judge asks,traces someone
frustration graceful,expect ben,ceiling defied,price finally,destroy life,hold frustration,viewpoints many,historical historical
stardust dead,author recent,carla espana,locks open,expectancy effects,australia high,amy diary,adam rapp
characters reveal,distracts mother,daruma doll,feyre endeavors,students never,lgbtqia gay,gay fiction,discovery alek


In [None]:
# Index the normalized topic vectors by title and keep only the first row of
# any title that occurs more than once.
pages = text_df['title']
norm_df = (
    pd.DataFrame(data=norm_features, index=pages)
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
)
norm_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Ace of Spades,0.98014,0.0,0.0,0.0,0.0,0.088844,0.177289,0.0
Clap When You Land,0.926485,0.0,0.137706,0.347262,0.04534,0.0,0.0,0.004027
The Poet X,0.997919,0.0,0.0,0.064478,0.0,0.0,0.0,0.0
Call Me By Your Name,0.980933,0.01476,0.192846,0.0,0.0,0.0,0.01904,0.0
"How I Paid for College: A Novel of Sex, Theft, Friendship & Musical Theater",0.987106,0.0,0.010796,0.0,0.0,0.0,0.159702,0.0


### Recommender 

In [None]:
# Pairwise title-to-title similarity lookup built from the topic vectors.
recomender = create_recommender(norm_df=norm_df)

In [None]:
# Three closest titles to "The Poet X".
get_top_N_recomended('The Poet X', recomender=recomender, num_recomendations=3)

[("What If It's Us ", 0.9998103993037227),
 ('The Grief Keeper', 0.9995477184832957),
 ('Putting Makeup on the Fat Boy', 0.9995270966161882)]

In [None]:
# Three closest titles to "Call Me By Your Name".
get_top_N_recomended('Call Me By Your Name', recomender=recomender, num_recomendations=3)

[('Gravity', 0.9998317576470865),
 ('7 Days at the Hot Corner', 0.9997877045840184),
 ('October Mourning: A Song for Matthew Shepard', 0.9996650455537834)]

In [None]:
# Three closest titles to "How I Paid for College".
get_top_N_recomended(
    'How I Paid for College: A Novel of Sex, Theft, Friendship & Musical Theater',
    recomender=recomender,
    num_recomendations=3,
)

[('Love Drugged', 0.9999974802663153),
 ('The Art of Being Normal', 0.9999416543533003),
 ('The Extraordinaries ', 0.9999396138049264)]

In [None]:
# Description text of one specific title (the stored title ends with a space).
df.loc[
    df['title'] == "The Lady's Guide to Petticoats and Piracy ",
    "goodreads_description",
].to_string(index=False)

'A year after an accidentally whirlwind grand to...'

In [None]:
# Full description of the row labelled 0 (the first row as loaded).
df.loc[0, "goodreads_description"]

'An incendiary and utterly compelling thriller with a shocking twist that delves deep into the heart of institutionalized racism  from an exceptional new YA voice   Welcome to Niveus Private Academy  where money paves the hallways  and the students are never less than perfect  Until now  Because anonymous texter  Aces  is bringing two students  dark secrets to light   Talented musician Devon buries himself in rehearsals  but he can t escape the spotlight when his private photos go public  Head girl Chiamaka isn t afraid to get what she wants  but soon everyone will know the price she has paid for power   Someone is out to get them both  Someone who holds all the aces  And they re planning much more than a high school game   '