In [207]:
import requests
import re
from bs4 import BeautifulSoup
import random
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

<h4>Scrape 100 Amazon Recommended Books to Read:</h4>

In [230]:
link = "https://www.amazon.com/b?ie=UTF8&node=8192263011"
html = requests.get(link).text
bs = BeautifulSoup(html,"html.parser")
books = []
for link in bs.find_all("span",{"class":"a-size-small"}):
    books.append(link.text)

In [231]:
print(len(books))

100


<h4>Generate 5 Modified Books for each Book:</h4>

In [277]:
book_labels = []
modified_labels = []
modified_books = []
lbl = 0
for i in books: 
    token_book = i.split()
    word_len = len(token_book)
    book_labels.append(lbl)
    #Filter out books of length <= 2
    if len(token_book) > 2:
        for j in range(5): 
            random_start = random.randint(0,word_len-1)
            random_word_len = random.randint(2,word_len-1)
            if (random_start+random_word_len) > len(token_book[random_start:]): 
                b_start = token_book[random_start:]
                b_end = token_book[0:random_word_len - len(b_start)]
                b = b_start + b_end
            else: 
                b = token_book[random_start:(random_start+random_word_len)]
            modified_labels.append(lbl)
            modified_books.append(' '.join(b))
    lbl = lbl+1

In [281]:
print(len(modified_books))

435


<h4>Put Together Our Clustering Dataset:</h4>

In [282]:
all_titles = books+modified_books
all_labels = book_labels+modified_labels
print(len(all_titles),"\n")
print(all_titles[0:100:20])

535 

['1984 (Signet Classics)', 'Diary of a Wimpy Kid, Book 1', 'Me Talk Pretty One Day', 'The Corrections: A Novel', 'The Poisonwood Bible: A Novel']


<h4>Pre-process Our Data:</h4>

In [251]:
def clean_data(raw_text): 
    letters = re.sub("[^a-zA-Z0-9]", " ",raw_text)                #Keeps only letters and numbers
    lower_case = letters.lower()                                  #Keeps all words lowercase
    tok = lower_case.split()                                      #Tokenizes
    stop_words = set(stopwords.words("english"))                  #Create set of stopwords
    wrds = [w for w in tok if w not in stop_words]                #Removes stopwords
    return(" ".join(wrds))

In [252]:
df_data = pd.DataFrame(all_titles,columns=['Book_Title'])
print(df_data.head(10))

#Apply cleaning function on dataframe 
df_data["cleaned_data"] = df_data["Book_Title"].apply(lambda i: clean_data(i))

#numpy array of cleaned data
cleaned_data = df_data["cleaned_data"].values
print("\n",cleaned_data[0:5])

                              Book_Title
0                 1984 (Signet Classics)
1                A Brief History of Time
2  A Heartbreaking Work of Staggering...
3   A Long Way Gone: Memoirs of a Boy...
4        The Bad Beginning: Or, Orphans!
5                      A Wrinkle in Time
6            Selected Stories, 1968-1994
7    Alice's Adventures in Wonderland...
8                All the President's Men
9               Angela's Ashes: A Memoir

 ['1984 signet classics' 'brief history time'
 'heartbreaking work staggering' 'long way gone memoirs boy'
 'bad beginning orphans']


<h4>Generate tf-idf Features:</h4>

In [253]:
vectorizer = CountVectorizer(analyzer="word",tokenizer=None,preprocessor=None,stop_words=None)
vec_features = vectorizer.fit_transform(cleaned_data)  #The counts

In [254]:
X_counts = vec_features.toarray()
transformer = TfidfTransformer(smooth_idf=False)   #Create transformer
X_tfidf = transformer.fit_transform(X_counts)     #Fit transformer on the counts 
X_tfidf = X_tfidf.toarray()                       #numpy array of the tf-idf features

<h4>Optional - Dimensionality Reduction: PCA</h4>
<p>If there is a very large  number of documents then the dimensionality of the features could be huge. To combat this, PCA can be used. With our data, the dimensionality is manageable so we do not need to use PCA but I leave this here in case the dataset is swapped out for a larger one. Please note that the X_tfidf in the following km.fit will need to be changed to red_dim_X_tfidf if PCA is used.</p>

In [255]:
components = 233
pca = PCA(n_components=components)
pca.fit(X_tfidf)
red_dim_X_tfidf = pca.transform(X_tfidf)

In [256]:
print(red_dim_X_tfidf.shape)
print(X_tfidf.shape)

(535, 233)
(535, 238)


<h4>Create K-Means Object:</h4>

In [261]:
k = 100     #1 cluster for each book 
km = KMeans(n_clusters=k, init='k-means++', max_iter=100)
km.fit(X_tfidf)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=100, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [267]:
cluster_map = pd.DataFrame()
cluster_map['data_index'] = list(df_data["Book_Title"].values)
cluster_map['cluster'] = km.labels_

In [270]:
print(cluster_map.head(10))

                              data_index  cluster
0                 1984 (Signet Classics)       34
1                A Brief History of Time        2
2  A Heartbreaking Work of Staggering...       74
3   A Long Way Gone: Memoirs of a Boy...       66
4        The Bad Beginning: Or, Orphans!       11
5                      A Wrinkle in Time       51
6            Selected Stories, 1968-1994       79
7    Alice's Adventures in Wonderland...       26
8                All the President's Men       68
9               Angela's Ashes: A Memoir       21


<h4>Some Sample Cluster Values:</h4>

In [269]:
for i in range(100):
    print("Cluster", i,": \n",cluster_map[cluster_map.cluster == i],"\n")

Cluster 0 : 
            data_index  cluster
74   The Long Goodbye        0
420          The Long        0
421          The Long        0
422          The Long        0
423      Long Goodbye        0
424          The Long        0 

Cluster 1 : 
          data_index  cluster
56   The Book Thief        1
335        The Book        1
336      Book Thief        1
337      Book Thief        1
338       Thief The        1
339      Book Thief        1
398        (Book 1)        1
399    (Book 1) The        1 

Cluster 2 : 
                   data_index  cluster
1    A Brief History of Time        2
105          of Time A Brief        2
106        History of Time A        2
107    Brief History of Time        2
108          History of Time        2
109        History of Time A        2
129                  in Time        2 

Cluster 3 : 
            data_index  cluster
275  Novel Midnight's        3
276           A Novel        3
311           A Novel        3
313           A Novel        3
3

<h4>Performance - Purity:</h4>

In [288]:
def purity_score(clusters, classes):
    
    A = np.c_[(clusters,classes)]

    n_accurate = 0.

    for j in np.unique(A[:,0]):
        z = A[A[:,0] == j, 1]
        x = np.argmax(np.bincount(z))
        n_accurate += len(z[z == x])

    return n_accurate / A.shape[0]

In [289]:
print("Purity: ",purity_score(km.labels_,all_labels))

Purity:  0.9102803738317757
