In [27]:
import pandas as pd
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [3]:
# LIMITS TO FIRST 1000 ROWS FOR DEMONSTRATION PURPOSES
df = pd.read_csv('betterreads.csv', delimiter='\t', nrows=1000)


In [6]:
# LOAD SPACY
nlp = spacy.load('en_core_web_md')

In [7]:
# ADDING CUSTOM STOP WORDS
STOP_WORDS = ['book']
STOP_WORDS = nlp.Defaults.stop_words.union(STOP_WORDS)

In [9]:
def tokenize(text):
    '''
    Input: String
    Output: list of tokens
    '''

    doc = nlp(text)
    tokens = []
    for token in doc:
        if ((token.text.lower() not in STOP_WORDS) & 
            (token.is_punct == False) & 
            (token.pos_ != 'PRON') & 
            (token.is_alpha == True)):
            tokens.append(token.lemma_.lower())
    return tokens

In [10]:
# THIS SHOWS OFF OUR `TOKENIZE` FUNCTION
# BUT PROBABLY BETTER TO CALL IT WITHIN TFIDF BELOW
df['tokens'] = df['description'].apply(tokenize)

In [11]:
df.head()

Unnamed: 0,title,authors_name,description,average_rating,tokens
0,On the Jellicoe Road,Melina Marchetta,I'm dreaming of the boy in the tree. I tell hi...,4.14,"[dream, boy, tree, tell, story, jellicoe, scho..."
1,"I've Got You Under My Skin (Under Suspicion, #1)",Alafair Burke,When Laurie Moran's husband was brutally murde...,3.75,"[laurie, moran, husband, brutally, murder, yea..."
2,The Orange Girl,James Anderson,'My father died eleven years ago. I was only f...,3.91,"[father, die, year, ago, think, hear, write, g..."
3,The Informers,Bret Easton Ellis,"Set in Los Angeles, in the recent past. The bi...",3.39,"[set, los, angeles, recent, past, birthplace, ..."
4,"A Year Down Yonder (A Long Way from Chicago, #2)",Richard Peck,Mary Alice remembers childhood summers packed ...,4.1,"[mary, alice, remember, childhood, summer, pac..."


In [12]:
# NOT NEEDED AS TfidfVectorizer can apply our custom tokenize function
# df['tokens_string'] = df['tokens'].apply(lambda x: ' '.join(x))

In [16]:
tfidf = TfidfVectorizer(tokenizer=tokenize)

In [19]:
dtm = tfidf.fit_transform(df['description'])

In [23]:
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

In [26]:
dtm.head()

Unnamed: 0,-pron-,aaa,aaron,ab,abandon,abandonment,abandonne,abbey,abby,abduct,...,ﻭﺫﻛﺮﻳﺎﺕ,ﻭﻟﻜﻨﻬﺎ,ﻭﻫﻨﺎ,ﻭﻳﺬﻫﺐ,ﻳﺄﺳﻬﺎ,ﻳﺘﺒﺨﺮ,ﻳﺼﻒ,ﻳﺼﻴﺐ,ﻳﻜﻮﻥ,ﻻ
0,0.0,0.0,0.0,0.0,0.075305,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
nn = NearestNeighbors(n_neighbors=10, algorithm='auto')
nn.fit(dtm)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                 radius=1.0)

In [57]:
# OUR SAMPLE TEXT NEEDS TO BE INSIDE A LIST
nineteen_eighty_four = ['''
Ender Wiggin, the third in a family of child geniuses, is selected by international military forces to save the world from destruction. Before being chosen Ender wears a unique monitor that allows the heads of the military to see things as Ender does. Ender's brother Peter and his sister Valentine also wore this monitor, although neither was selected, nor did they have it for as long as Ender, and Peter will never forgive Ender for this. Peter hates Ender, and even when the monitor is taken out it does nothing to decrease Peter's anger. The same is true of Ender's schoolmates, and he is forced into brutally beating the leader of a gang of bullies in order to protect himself. Although Valentine tries to protect Ender from Peter, he is only saved from his brother when Colonel Graff of the International Fleet comes to take Ender away to Battle School. Ender leaves behind Valentine, who loves him, in order to help save the world from the buggers.
''']

In [58]:
vectors = tfidf.transform(nineteen_eighty_four)

In [59]:
# WHAT OUR SAMPLE TEXT LOOKS LIKE AFTER THE VECTORIZER
vectors.todense().shape

(1, 14782)

In [60]:
nearest = nn.kneighbors(vectors.todense())

In [61]:
# SIMPLE RECOMMENDATIONS: THE TEN "NEAREST" NEIGHBORS
nearest

(array([[1.2172248 , 1.29650031, 1.31193041, 1.33254164, 1.3368356 ,
         1.34584546, 1.35588396, 1.35592409, 1.36972344, 1.37932265]]),
 array([[763, 144,   7, 134, 348, 255,  54, 195, 736, 524]]))

In [62]:
# THIS IS THE INDEX IN THE DATAFRAME, SO IT IS WHAT WE ITERATE OVER
nearest[1][0]

array([763, 144,   7, 134, 348, 255,  54, 195, 736, 524])

In [63]:
for book_idx in nearest[1][0]:
    print(f"{df['title'][book_idx]}\n{df['description'][book_idx]}\n\n\n")

Ender's Shadow (Ender's Shadow, #1)
Welcome to Battleschool.Growing up is never easy. But try living on the mean streets as a child begging for food and fighting like a dog with ruthless gangs of starving kids who wouldn't hesitate to pound your skull into pulp for a scrap of apple. If Bean has learned anything on the streets, it's how to survive. And not with fists—He is way too small for that—But with brains.Bean is a genius with a magician's ability to zero in on his enemy and exploit his weakness.What better quality for a future general to lead the Earth in a final climactic battle against a hostile alien race, known as Buggers. At Battleschool Bean meets and befriends another future commander—Ender Wiggins—perhaps his only true rival.Only one problem: for Bean and Ender, the future is now.



Peter and the Shadow Thieves (Peter and the Starcatchers, #2)
In this riveting and adventure-packed follow-up to the award-winning New York Times bestseller Peter and the Starcatchers, Peter 