In [1]:
import pandas as pd
import numpy as np

## I. Load and clean the dataset

In [2]:
# Load the tab-separated review dump; pandas infers the bz2 compression from the file extension.
amreviews = pd.read_csv("amazon-reviews.csv.bz2", sep="\t")
# Show ten random rows as a quick sanity check (unseeded, so the rows differ between runs).
amreviews.sample(10)

Unnamed: 0,date,summary,review,rating
92392,2012-12-09,Perfect fit,We use prefolds and covers for our primary dia...,5
62576,2012-07-19,No way this thing actually works,"Honestly, the manufacturer says that this ster...",1
195121,2014-03-05,Cable works as described,Bought this for my daughter along with her new...,5
177073,2015-11-29,Love it!,Color by itself looks really delicate. Works p...,5
166082,2014-10-25,Works great for my fine hair,"I have always had baby fine hair, so have trie...",5
68283,2011-05-04,Couldnt stay sane without it,This is hands-down my #1 preggo product - it g...,5
49168,2012-12-10,best bottles out there,love these. the only downside is that there ar...,5
8058,2011-12-20,Apparently I have the old version,When I clicked on the link to provide a review...,5
61701,2011-11-01,Very cute,"Easy to get on and take off, zipper does not s...",5
55802,2013-04-29,Great value and product,I have loved the Medela brand products. They ...,5


### Remove missing and empty reviews

In [3]:
# Count the reviews whose text is missing (NaN)
amreviews["review"].isna().sum()

80

In [4]:
# Locate the positions of reviews that are empty strings (NaN compares as not-equal)
np.where(amreviews["review"].eq(""))

(array([], dtype=int64),)

Based on the checks above, the only missing reviews are NaN values; there are no empty strings. Let's go ahead and drop those rows.

In [5]:
# Discard rows without review text; reset_index() keeps the old row labels in an "index" column
amreviews_mod = amreviews.dropna(subset=["review"]).reset_index()

Taking a subset of the data

In [6]:
# Take a random sample of 5000 reviews.
# The global NumPy RNG is seeded first so that the same rows are drawn on every run.
np.random.seed(1867)
amreviews_sample = amreviews_mod.sample(5000)

In [7]:
# Preview ten random rows of the 5000-review sample
amreviews_sample.sample(10)

Unnamed: 0,index,date,summary,review,rating
78644,78674,2011-05-16,love it now that i've gotten used to it!,I had been searching everywhere for a car seat...,4
93715,93752,2012-12-25,Kids love,My toddler loves this toy. She can't turn the...,5
61100,61125,2010-09-04,"Absorbent, but they shrink.",WARNING: Do not use an enzyme spray on these p...,3
120860,120904,2012-10-10,Cheap and works everytime!,This is such a great and amazing buy for mommi...,5
162512,162572,2016-03-15,Perfect,Wanted a natural nail polish. And this is the ...,5
35418,35434,2011-01-11,Decent liners for protecting your diaper,I purchased these because I wanted a liner tha...,4
132678,132729,2013-02-19,Nice gate,This thing must have gone to heck and back bef...,4
51895,51916,2013-02-21,Good product that only requires one hand to op...,"This Diaper Genie really works well, although ...",4
82814,82847,2013-11-18,Goth Mom approved!,"I got this, being a 'goth mom', I was so exci...",5
75834,75862,2014-05-09,Worth the money,I was originally being ghetto and used a sport...,5


## II. TF-IDF Transform

### Transform reviews into Bag-Of-Words

In [8]:
# import
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words

In [9]:
# Convert the sampled reviews into a bag-of-words count matrix.
# stop_words="english" selects scikit-learn's built-in English stop-word list
# (the same ENGLISH_STOP_WORDS set), so this cell no longer depends on the
# sklearn.feature_extraction.stop_words module, which newer scikit-learn
# releases have removed.
vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(amreviews_sample["review"].values)

### Write a function that performs TF-IDF

In [10]:
def get_tfidf_matrix(X):
    """Compute a dense TF-IDF matrix from a document-term count matrix.

    Parameters
    ----------
    X : scipy sparse matrix or 2-D array-like, shape (n_documents, n_terms)
        Raw term counts, one row per document. Anything exposing
        ``.toarray()`` is densified; other inputs go through ``np.asarray``.

    Returns
    -------
    numpy.ndarray, shape (n_documents, n_terms)
        ``log(1 + count) * log(N / (1 + total_count_of_term))`` where ``N`` is
        the number of documents.
    """
    # Densify exactly once; the conversion is the expensive part for a large
    # vocabulary, and every step below reuses the same array.
    counts = X.toarray() if hasattr(X, "toarray") else np.asarray(X)

    # Term frequency: dampened count, applied element-wise to the whole matrix.
    tf_result = np.log(1 + counts)

    # Total number of documents/reviews.
    N = counts.shape[0]

    # Total count of each term summed over all documents.
    # NOTE(review): this is the corpus-wide term count, not the number of
    # documents containing the term, so the idf below turns negative for any
    # term occurring N or more times in total - confirm this is the intended
    # definition.
    overall_freq = counts.sum(axis=0)
    idf_result = np.log(N / (1 + overall_freq))

    # Broadcast the per-term idf across every document row.
    return tf_result * idf_result

In [11]:
# Build the TF-IDF matrix for the sampled reviews from the bag-of-words counts in X
tf_idf = get_tfidf_matrix(X)

## III. Implement k-means clustering using cosine similarity

**Let's make 5 clusters!**

In [12]:
# Number of clusters to build
n = 5
# Seed the global NumPy RNG, then draw n distinct row positions of the TF-IDF matrix
np.random.seed(45678)
indices = np.random.choice(a=tf_idf.shape[0], size=n, replace=False)
indices

array([3745,  212, 2756,  411, 1805])

In [13]:
# Use the TF-IDF rows of the randomly chosen reviews as the initial cluster centres
cluster_centres = tf_idf[indices, :]
# Inspect the second centre
cluster_centres[1]

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
def get_norm(X):
    """Return the Euclidean (L2) norm of every row of ``X``.

    Parameters
    ----------
    X : 2-D array-like, shape (n_rows, n_features)

    Returns
    -------
    numpy.ndarray, shape (n_rows,)
        ``sqrt(sum(row ** 2))`` for each row; an all-zero row has norm 0.
    """
    return np.linalg.norm(np.asarray(X, dtype=float), axis=1)

In [15]:
# Map each cluster id to its members: cluster 0 starts out holding a copy of
# every TF-IDF row, and the remaining clusters start out as empty lists.
cluster_dict = {}
for index in range(n):
    cluster_dict[index] = tf_idf.copy() if index == 0 else []

In [52]:
def get_cossim(reviews, centers):
    """Cosine similarity between every review vector and every centre vector.

    Parameters
    ----------
    reviews : 2-D array-like, shape (n_reviews, n_features)
    centers : 2-D array-like, shape (n_centers, n_features)

    Returns
    -------
    numpy.ndarray, shape (n_reviews, n_centers)
        Entry ``[i, j]`` is ``reviews[i] . centers[j] / (|reviews[i]| * |centers[j]|)``.
        When either vector has zero norm the similarity is defined as 0
        instead of producing NaN from a 0/0 division.
    """
    reviews = np.asarray(reviews, dtype=float)
    centers = np.asarray(centers, dtype=float)

    dot_product = np.dot(reviews, centers.T)

    # Pairwise product of the row norms, shape (n_reviews, n_centers).
    review_norm = np.linalg.norm(reviews, axis=1)
    center_norm = np.linalg.norm(centers, axis=1)
    norm_product = review_norm[:, np.newaxis] * center_norm[np.newaxis, :]

    # Divide only where the denominator is non-zero; an all-zero vector
    # (e.g. a review with no retained terms) keeps the pre-filled 0.
    similarity = np.zeros_like(dot_product, dtype=float)
    np.divide(dot_product, norm_product, out=similarity, where=norm_product != 0)
    return similarity

# Cosine similarity of every review (rows) to each of the cluster centres (columns)
print(get_cossim(tf_idf, cluster_centres))

[[0.03473985 0.01613451 0.         0.         0.        ]
 [0.02378835 0.00377097 0.04658059 0.         0.        ]
 [0.14375591 0.         0.         0.         0.03508982]
 ...
 [0.02127519 0.01978556 0.02517125 0.         0.00385946]
 [0.         0.01863297 0.         0.         0.12962742]
 [0.         0.01753468 0.02914705 0.         0.01378514]]
