In [1]:
#Importing packages
import numpy as np
import pandas as pd
from random import sample
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# 1. Explore and clean the data

In [2]:
#Importing data
data = pd.read_csv('amazon-reviews.csv.bz2', sep = '\t')
data

Unnamed: 0,date,summary,review,rating
0,2013-07-16,Awesine,Perfect for new parents. We were able to keep ...,5
1,2013-06-29,Should be required for all new parents!,This book is such a life saver. It has been s...,5
2,2014-03-19,Grandmother watching baby,Helps me know exactly how my babies day has go...,5
3,2013-08-17,repeat buyer,I bought this a few times for my older son and...,5
4,2014-04-01,Great,I wanted an alternative to printing out daily ...,4
...,...,...,...,...
205326,2014-07-20,Five Stars,"Great, just as expected. Thank to all.",5
205327,2014-07-02,"Long life, and for some players, a good econom...",I've been thinking about trying the Nanoweb st...,5
205328,2014-07-22,Good for coated.,I have tried coated strings in the past ( incl...,4
205329,2014-07-01,Taylor Made,"Well, MADE by Elixir and DEVELOPED with Taylor...",4


In [3]:
#Checking for missing values
data.isna().sum()

date        0
summary    15
review     80
rating      0
dtype: int64

In [4]:
#Dropping missing values
data.dropna(inplace=True)

In [5]:
#Taking a subset of the dataset
np.random.seed(42)
subset = data.sample(1000)

In [6]:
subset

Unnamed: 0,date,summary,review,rating
136391,2014-05-20,Strong,Strong and safe this car seat is still very co...,5
37818,2011-07-06,Very nice for the price,Got this chair today and it was very easy to p...,5
121138,2013-02-21,Love this bowl!,This bowl is awesome. It sticks right to the t...,5
171043,2014-04-21,"Natural, long lasting, easy application - best...",After trying other bronzers from every known c...,5
100477,2014-05-12,Easy to use and looks great!,I love our new stroller and car seat it was ve...,5
...,...,...,...,...
143953,2013-04-17,Simple Sturdy,Does the job. The ribbon is good quality (alm...,5
150372,2013-02-19,Cute for baby's room!,I was mainly just looking for a hamper that wo...,5
80406,2013-04-10,"Does the job. But also shrinks, fades.",We've been using this Planet Wise wetbag for o...,3
133628,2013-03-06,didn't work for me,"Unfortunately, this did not work for me. I fol...",2


In [7]:
subset.loc[136391, 'review']

'Strong and safe this car seat is still very comfortable and beatiful. Easy to be managed and to be adjusted.'

In [8]:
subset.loc[205189, 'review']

'I needed an inexpensive music stand for my guitar lessons. This worked well for that.'

In [9]:
subset.loc[171043, 'review']

'After trying other bronzers from every known cosmetic company, I have finally found THE one. I wanted a bronzer without glimmer, this is it. I wanted a bronzer that would look natural, this is it. I wanted a bronzer that would stay on all day, this is it. I wanted a natural color, this is it. I wanted a small case for traveling, this is it. Done and done, this us a must have for my cosmetic bag.'

As we can see, these reviews can be categorized as `baby`, `musical instruments` and `beauty` respectively.

# 2. Implement TF-IDF transform

### TF = (Number of time the word occurs in the text) / (Total number of words in text) 

### IDF = (Total number of documents / Number of documents with word t in it)

### Source: https://iyzico.engineering/how-to-calculate-tf-idf-term-frequency-inverse-document-frequency-from-the-beatles-biography-in-c4c3cd968296

In [10]:
#Implementing bag of words
vectorizer = CountVectorizer(binary=False)
X = vectorizer.fit_transform(subset.review.values).toarray()
words = vectorizer.get_feature_names()

In [11]:
X = pd.DataFrame(X)
X.columns = words

In [12]:
#Looking at the dimensions of bag of words
X.shape

(1000, 7051)

In [13]:
#Calculating term frequency
cols = X.columns
tf = X[cols].div(X[cols].sum(axis=1), axis=0)

In [14]:
#Calculating inverse document frequency
temp = pd.DataFrame(np.where(X > 0, 1, 0))
temp.columns = words
idf = np.log(temp.sum(axis = 0))

In [15]:
#Multiplying tf and idf to get tf-idf
test = tf.multiply(idf, axis='columns')

In [16]:
test

Unnamed: 0,00,02,09,0px,10,100,1000,100th,10s,11,...,ziplock,zipped,zipper,zippers,zips,zones,zoo,zoom,zooms,zoya
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 3. Implementing k-Means Clustering

In [17]:
#K-means clustering function
def km(k, tf_idf): 
    #Setting random seed for reproducibility
    np.random.seed(42)
    #Setting k random cluster centres
    centres = tf_idf.sample(n=k)
    #Creating a list of indexes of cluster centres
    centre_ind = centres.index.tolist()
    #Centroids for the first iteration
    centroids = []
    for i in range(len(centre_ind)):
        cluster = []
        cluster.append(centre_ind[i])
        centroids.append(tf_idf.iloc[cluster].mean(axis=0).tolist())
    #Creating a dataframe of these centroids 
    centroid_df = pd.DataFrame(centroids)
    #Calculating the cos similarities between all reviews and cluster centroids
    cos = []
    for i in range(k): # Iterating over each cluster
        cos_in = []
        for j in range(tf_idf.shape[0]): #Iterating over every review
            dot = np.dot(tf_idf.iloc[j],centroid_df.iloc[i]) #Calculating dot product between review and centroid
            anorm = np.linalg.norm(tf_idf.iloc[j]) #Calculating norm of review
            bnorm = np.linalg.norm(centroid_df.iloc[i]) #Calculating norm of centroid
            cos_sim = dot / (anorm * bnorm) #Cosine similarity between review and centroid
            cos_in.append(cos_sim)
        cos.append(cos_in)
    cos_df = (pd.DataFrame(cos)).transpose()
    
    #Assigning review to cluster which has the least cosine similarity value
    cos_df['cluster'] = cos_df.idxmin(axis=1)+1
    cos_df.head()
    
    listclust = cos_df.cluster.unique()
    
    # Further iterations of k-means clustering

    # Initializing a dataframe to hold new centroids
    newcentroid_df = centroid_df.copy()

    # Initializing an empty dataframe to store older centroids
    prevcentroid_df = pd.DataFrame(columns=centroid_df.columns)
    prevcentroid_df = prevcentroid_df.fillna(0)

    # Intializing a dataframe for holding new cosine similarity values for all rows and cluster centres
    newcos_df = cos_df.copy()
    
    # Counter for number of iterations
    iteration_no = 2
    
    # Continuing to iterate till previous centroids match new centroids and number of iterations is less than 30
    while ((prevcentroid_df.equals(newcentroid_df) == False) & (iteration_no < 35)):
        prevcentroid_df = newcentroid_df.copy()  # centroids from previous iteration
        newcentroidlist = []
        for i in range(len(listclust)):
            sublist = newcos_df[newcos_df.cluster == listclust[i]].index.tolist()
            newcentroidlist.append(sublist)
        newcentroids = []
        # Computing new centroids for each cluster by averaging the tf_idf columns
        for cluster in newcentroidlist:
            centroid_list = tf_idf.iloc[cluster].mean(axis=0).tolist()
            newcentroids.append(centroid_list)
        # New Centroids
        newcentroid_df = pd.DataFrame(newcentroids) # centroids from current iteration
        newcos = []
        # Calculating the cos similarities between all reviews and cluster centroids
        for i in range(k):
            cos_in = []
            for j in range(tf_idf.shape[0]):
                dot = np.dot(tf_idf.iloc[j],newcentroid_df.iloc[i])
                anorm = np.linalg.norm(tf_idf.iloc[j])
                bnorm = np.linalg.norm(newcentroid_df.iloc[i])
                cos_sim = dot / (anorm * bnorm)
                cos_in.append(cos_sim)
            newcos.append(cos_in)
        newcos_df = (pd.DataFrame(newcos)).transpose()
        newcos_df['cluster'] = newcos_df.idxmin(axis=1)+1
        iteration_no = iteration_no + 1
    return newcos_df

In [18]:
#Finding clusters for k = 3
final = km(3,test)

In [19]:
#Looking at each cluster size
final.cluster.value_counts()

3    657
2    236
1    107
Name: cluster, dtype: int64

In [20]:
listclust = final.cluster.unique().tolist()
listclust

[1, 3, 2]

In [21]:
finalclusters = []
for i in range(len(listclust)):
    sublist = final[final.cluster == listclust[i]].index.tolist()
    finalclusters.append(sublist)

It looks like the algorithm is looking at words used in the reviews to classify them into clusters. For example, the review "I really like this makeup. It blends in well, it's nice and airy, not heavy or greasy. This color compliments my summer skin tone nicely, a light tan with neutral undertones. I can wear this and be confident that I already have sunscreen protection during the day. It is matte, but I can add shimmer powder or blush on top if I want. I like the coverage and the tint, so I would order more." has been classified correctly in newcluster3 along with other makeup reviews. 

Similar reviews are clustered based on the cosine similarity which is how similar two reviews are in terms of word count of "key" words. For instance, "Good polish" and "Go to polish" are both classified as reviews from the beauty section. This is probably because both reviews have the word "polish". 

In [22]:
# Resetting index of 1000 observation sample
reset_sample = subset.reset_index()
reset_sample.head()

Unnamed: 0,index,date,summary,review,rating
0,136391,2014-05-20,Strong,Strong and safe this car seat is still very co...,5
1,37818,2011-07-06,Very nice for the price,Got this chair today and it was very easy to p...,5
2,121138,2013-02-21,Love this bowl!,This bowl is awesome. It sticks right to the t...,5
3,171043,2014-04-21,"Natural, long lasting, easy application - best...",After trying other bronzers from every known c...,5
4,100477,2014-05-12,Easy to use and looks great!,I love our new stroller and car seat it was ve...,5


In [23]:
reset_sample['review'].iloc[finalclusters[0]]

0      Strong and safe this car seat is still very co...
15     This was a life saver!  My daughter had lost 1...
21     I don't think this was any better than dish so...
53     I would not suggest this until babies get olde...
70     I purchased this as a gift for my high mainten...
                             ...                        
979    meh....... Sorry for my bad reviews on any oth...
980    Love these sheets! Not only are they a great p...
982                       gorgeous - perfect coral color
994    I got this system to replace my internet-based...
999    I needed an inexpensive music stand for my gui...
Name: review, Length: 107, dtype: object

In [24]:
reset_sample['review'].iloc[finalclusters[1]]

1      Got this chair today and it was very easy to p...
4      I love our new stroller and car seat it was ve...
5      I have tried different brands and weights of s...
7      The Playtex Diaper Genie Elite is actually jus...
8      i did something silly, i thought this was supp...
                             ...                        
993    There are a lot of positives to these snack cu...
995    Does the job.  The ribbon is good quality (alm...
996    I was mainly just looking for a hamper that wo...
997    We've been using this Planet Wise wetbag for o...
998    Unfortunately, this did not work for me. I fol...
Name: review, Length: 657, dtype: object

In [25]:
reset_sample['review'].iloc[finalclusters[2]]

2      This bowl is awesome. It sticks right to the t...
3      After trying other bronzers from every known c...
6      This eyeliner is great, a very nice dark navy....
16     This is a delightful scent that adds just a da...
17     My wife says this is easy and comfortable to u...
                             ...                        
971    This definitely does not work as described.  I...
975    Wish you could use it without music.  My littl...
984    I've tried a few carriers and my 2 month old w...
986    This is a great product that will ensure that ...
989    This is my sons favorite toy and mine. It's be...
Name: review, Length: 236, dtype: object

In [26]:
final4 = km(4,test)
final4.cluster.value_counts()

3    999
1      1
Name: cluster, dtype: int64

In [27]:
listclust4 = final4.cluster.unique().tolist()

In [28]:
finalclusters = []
for i in range(len(listclust4)):
    sublist = final4[final4.cluster == listclust4[i]].index.tolist()
    finalclusters.append(sublist)

In [29]:
final5 = km(5,test)
final5.cluster.value_counts()

5    678
3    314
1      4
4      2
2      2
Name: cluster, dtype: int64

In [30]:
final2 = km(2,test)
final2.cluster.value_counts()

2    604
1    396
Name: cluster, dtype: int64