In [175]:
import requests
import re
from bs4 import BeautifulSoup
import random
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans

<h4>Scrape 80 Amazon Recommended Books to Read:</h4>

In [130]:
link = "https://www.amazon.com/b?ie=UTF8&node=8192263011"
html = requests.get(link).text
bs = BeautifulSoup(html,"html.parser")
books = []
for link in bs.find_all("span",{"class":"a-size-small"}):
    books.append(link.text)

In [158]:
print(books[0:10])

['1984 (Signet Classics)', 'A Brief History of Time', 'A Heartbreaking Work of Staggering...', 'A Long Way Gone: Memoirs of a Boy...', 'The Bad Beginning: Or, Orphans!', 'A Wrinkle in Time', 'Selected Stories, 1968-1994', "Alice's Adventures in Wonderland...", "All the President's Men", "Angela's Ashes: A Memoir"]


<h4>Generate 5 Modified Books for each Book:</h4>

In [147]:
modified_books = []
for i in books: 
    token_book = i.split()
    word_len = len(token_book)
    if len(token_book) != 1:
        for j in range(5): 
            random_start = random.randint(0,word_len-1)
            random_word_len = random.randint(1,word_len-1)
            modified_books.append(' '.join(token_book[random_start:random_start+random_word_len])) #tokenized to string

<h4>Put Together Our Clustering Dataset:</h4>

In [157]:
all_titles = books+modified_books
print(all_titles[0:100:20])

['1984 (Signet Classics)', 'Diary of a Wimpy Kid, Book 1', 'Me Talk Pretty One Day', 'The Corrections: A Novel', 'The Poisonwood Bible: A Novel']


<h4>Pre-process Our Data:</h4>

In [162]:
def clean_data(raw_text): 
    letters = re.sub("[^a-zA-Z]", " ",raw_text)                   #Keeps only letters
    lower_case = letters.lower()                                  #Keeps all words lowercase
    tok = lower_case.split()                                      #Tokenizes
    stop_words = set(stopwords.words("english"))                  #Create set of stopwords
    wrds = [w for w in tok if w not in stop_words]                #Removes stopwords
    return(" ".join(wrds))

In [242]:
df_data = pd.DataFrame(all_titles,columns=['Book_Title'])
print(df_data.head(10))

#Apply cleaning function on dataframe 
df_data["cleaned_data"] = df_data["Book_Title"].apply(lambda i: clean_data(i))

#numpy array of cleaned data
cleaned_data = df_data["cleaned_data"].values
print("\n",cleaned_data[0:5])

                              Book_Title
0                 1984 (Signet Classics)
1                A Brief History of Time
2  A Heartbreaking Work of Staggering...
3   A Long Way Gone: Memoirs of a Boy...
4        The Bad Beginning: Or, Orphans!
5                      A Wrinkle in Time
6            Selected Stories, 1968-1994
7    Alice's Adventures in Wonderland...
8                All the President's Men
9               Angela's Ashes: A Memoir

 ['signet classics' 'brief history time' 'heartbreaking work staggering'
 'long way gone memoirs boy' 'bad beginning orphans']


<h4>Generate tf-idf Features:</h4>

In [180]:
vectorizer = CountVectorizer(analyzer="word",tokenizer=None,preprocessor=None,stop_words=None,max_features=200)
vec_features = vectorizer.fit_transform(cleaned_data)  #The counts

In [181]:
X_counts = vec_features.toarray()
transformer = TfidfTransformer(smooth_idf=False)   #Create transformer
X_tfidf = transformer.fit_transform(X_counts)     #Fit transformer on the counts 
X_tfidf = X_tfidf.toarray()                       #numpy array of the tf-idf features

<h4>Create K-Means Object:</h4>

In [186]:
k = 80 #1 cluster for each book 
km = KMeans(n_clusters=k, init='k-means++', max_iter=100)
km.fit(X_tfidf)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=80, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [255]:
cluster_map = pd.DataFrame()
cluster_map['data_index'] = list(cleaned_data)
cluster_map['cluster'] = km.labels_

In [256]:
print(cluster_map)

                             data_index  cluster
0                       signet classics       72
1                    brief history time       45
2         heartbreaking work staggering        0
3             long way gone memoirs boy       74
4                 bad beginning orphans       38
5                          wrinkle time       11
6                      selected stories       27
7           alice adventures wonderland       78
8                         president men       28
9                   angela ashes memoir        0
10                         god margaret        0
11                          bel canto p        0
12                              beloved        0
13                born run hidden tribe       51
14   breath eyes memory oprah book club        2
15         catch th anniversary edition       73
16            charlie chocolate factory        0
17                        charlotte web       71
18                        cutting stone       39
19               dar

<h4>Some Sample Cluster Values:</h4>

In [272]:
print("Cluster 1: \n",cluster_map[cluster_map.cluster == 1],"\n")
print("Cluster 3: \n",cluster_map[cluster_map.cluster == 3],"\n")
print("Cluster 5: \n",cluster_map[cluster_map.cluster == 5],"\n")
print("Cluster 39: \n",cluster_map[cluster_map.cluster == 39],"\n")
print("Cluster 51: \n",cluster_map[cluster_map.cluster == 51],"\n")

Cluster 1: 
        data_index  cluster
66   great gatsby        1
226         great        1
227         great        1
425         great        1
427         great        1
428         great        1
429  great gatsby        1 

Cluster 3: 
                         data_index  cluster
44   human bondage bantam classics        3
315           human bondage bantam        3
316                  human bondage        3
317  human bondage bantam classics        3
318           human bondage bantam        3
319                  human bondage        3 

Cluster 5: 
                                data_index  cluster
33   kitchen confidential updated edition        5
260                       updated edition        5
261                       updated edition        5
262          confidential updated edition        5 

Cluster 39: 
                       data_index  cluster
18                 cutting stone       39
28   harry potter sorcerer stone       39
187                        stone    