In [1]:
import pandas as pd
import numpy as np

## I. Load and clean the dataset

In [9]:
# Read the tab-separated review dump; pandas infers the bz2 compression
# from the file extension.
amreviews = pd.read_csv(
    "amazon-reviews.csv.bz2",
    sep="\t",
)
# Preview ten randomly chosen rows.
amreviews.sample(10)

Unnamed: 0,date,summary,review,rating
109955,2013-12-31,Love this,This is great for teething baby! On days she ...,4
194271,2015-09-30,Quality and craftmanship,I have used and own many JA Henckels products ...,5
204898,2014-04-30,Does ONE thing well... Just not well enough fo...,"This is a very cheap little practice amp, and ...",3
92962,2012-02-25,cute but...,i really wanted to say this is a 5 star produc...,4
79445,2012-01-28,"Large, but thin","Haven't used this yet, but it is alot larger i...",3
64918,2012-06-16,Good quallity product,I received this as a baby shower gift and I hi...,5
189828,2015-11-29,So fresh and so clean,I've always been very pleased with Bliss produ...,4
113313,2011-05-02,Very Happy,I love the bassinet so far!!! It was very easy...,5
196040,2013-02-16,hydration station,it does what it says it does. only complaint ...,4
205009,2014-04-26,"Spend some time, and you'll love it.",It does take a little getting used to the prog...,5


### Remove missing and empty reviews

In [10]:
# Count how many rows have a missing (NaN) review text
amreviews["review"].isna().sum()

80

In [11]:
# Positions of reviews that are exactly the empty string (NaN compares unequal)
np.where(amreviews.review == '')

(array([], dtype=int64),)

Based on the above, the only problematic entries are NaN values (there are no empty-string reviews). Let's go ahead and drop them.

In [12]:
# Discard rows whose review is NaN; reset_index() keeps the old row labels
# in a new "index" column and renumbers the rows from 0.
amreviews_mod = amreviews.dropna(subset=["review"]).reset_index()

Taking a subset of the data

In [14]:
# Take a random sample of 5000 reviews from the cleaned frame.
# The global NumPy RNG is seeded first so the same rows are drawn on every
# run; DataFrame.sample uses that global RNG when no random_state is given.
np.random.seed(1867)
amreviews_sample = amreviews_mod.sample(5000)

In [15]:
# Preview ten random rows of the 5000-review sample
amreviews_sample.sample(10)

Unnamed: 0,index,date,summary,review,rating
78644,78674,2011-05-16,love it now that i've gotten used to it!,I had been searching everywhere for a car seat...,4
93715,93752,2012-12-25,Kids love,My toddler loves this toy. She can't turn the...,5
61100,61125,2010-09-04,"Absorbent, but they shrink.",WARNING: Do not use an enzyme spray on these p...,3
120860,120904,2012-10-10,Cheap and works everytime!,This is such a great and amazing buy for mommi...,5
162512,162572,2016-03-15,Perfect,Wanted a natural nail polish. And this is the ...,5
35418,35434,2011-01-11,Decent liners for protecting your diaper,I purchased these because I wanted a liner tha...,4
132678,132729,2013-02-19,Nice gate,This thing must have gone to heck and back bef...,4
51895,51916,2013-02-21,Good product that only requires one hand to op...,"This Diaper Genie really works well, although ...",4
82814,82847,2013-11-18,Goth Mom approved!,"I got this, being a 'goth mom', I was so exci...",5
75834,75862,2014-05-09,Worth the money,I was originally being ghetto and used a sport...,5


## II. TF-IDF Transform

### Transform reviews into Bag-Of-Words

In [16]:
# import
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words

In [21]:
# Convert the sampled reviews into a bag-of-words count matrix.
# The built-in English stop-word list is requested by name: it is the same
# ENGLISH_STOP_WORDS set, but this form does not depend on the
# `sklearn.feature_extraction.stop_words` module, which newer scikit-learn
# releases no longer expose publicly.
vectorizer = CountVectorizer(stop_words="english")
# X is a sparse (n_reviews, n_vocabulary_terms) matrix of raw term counts.
X = vectorizer.fit_transform(amreviews_sample.review.values)

### Write a function that performs TF-IDF

In [50]:
def get_tfidf_matrix(X):
    """Compute a dense TF-IDF matrix from a document-term count matrix.

    The weighting used is

        tf(d, t)  = log(1 + count(d, t))
        idf(t)    = log(N / (1 + df(t)))
        tfidf     = tf * idf

    where ``N`` is the number of documents (rows) and ``df(t)`` is the
    number of documents in which term ``t`` occurs at least once.

    Parameters
    ----------
    X : sparse matrix or array-like, shape (n_documents, n_terms)
        Raw term counts, one row per document. Objects exposing
        ``.toarray()`` (e.g. the output of ``CountVectorizer``) are
        densified once; anything else is passed through ``np.asarray``.

    Returns
    -------
    numpy.ndarray, shape (n_documents, n_terms)
        Dense TF-IDF weights. Because of the ``1 +`` smoothing in the
        denominator, a term present in every document gets a slightly
        negative idf.
    """
    # Densify a single time and reuse the result.
    counts = X.toarray() if hasattr(X, "toarray") else np.asarray(X)

    # Term frequency: elementwise log(1 + count); no per-row loop needed.
    tf_result = np.log1p(counts)

    # Total number of documents/reviews.
    N = counts.shape[0]

    # Document frequency: in how many documents does each term appear?
    # Counting non-zero entries (rather than summing raw counts) keeps a term
    # that is repeated many times inside one review from being treated as if
    # it were spread across many reviews.
    doc_freq = np.count_nonzero(counts, axis=0)

    # Inverse document frequency per term, broadcast across all rows below.
    idf_result = np.log(N / (1 + doc_freq))

    return tf_result * idf_result

In [52]:
# Build the TF-IDF matrix from the bag-of-words count matrix X
# (one row per sampled review, one column per vocabulary term).
tf_idf = get_tfidf_matrix(X)

## III. Implement k-means clustering using cosine similarity

**Let's make 5 clusters!**

In [59]:
# Number of clusters to build
n = 5
# Pick n distinct row indices of the TF-IDF matrix at random
# (replace=False guarantees no row is chosen twice). The global NumPy RNG is
# seeded first so the same rows are chosen on every run.
np.random.seed(45678)
indices = np.random.choice(tf_idf.shape[0], n, replace=False)
# Display the chosen row indices
indices

array([3745,  212, 2756,  411, 1805])

In [62]:
# Use the TF-IDF rows of the randomly chosen reviews as the initial cluster
# centres (integer-array indexing returns a copy, one row per index).
cluster_centres = tf_idf[indices, :]