# Experiment 5 - Text Clustering
#### Data Preprocessing - Unique words retrieval - Removing stop words
#### Identifying top 500 words 
#### Embedding / Vectorization
#### K-Means clustering to find the words with the highest centroid feature values

In [1]:
import pandas as pd
from collections import defaultdict
import nltk
nltk.download('punkt')
from nltk import word_tokenize,sent_tokenize
import itertools
import operator
from sklearn.cluster import KMeans
import numpy as np
import json
import copy
from prettytable import PrettyTable

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aswathsabarri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Maps each field name (the text before the first ':' on a line of the data file)
# to the list of values collected for that field; filled by the file-parsing cell.
dict_words = defaultdict(list)

In [3]:
# Parse the raw reviews file into dict_words so that each field (e.g. "review/text")
# can be retrieved later as a list of values.
# Every non-blank line is expected to look like "<key>:<value>"; only the first ':'
# separates key from value, so the value itself may contain colons.
dict_words.clear()  # start empty so re-running this cell does not append duplicate records
with open("finefoods.txt.nosync.txt", encoding="utf-8", errors="replace") as f:
    count = 0  # number of malformed lines reported so far
    for line in f:
        if line == "\n":
            continue  # skip blank lines
        try:
            key, val = line.split(':', 1)
        except ValueError:
            # The line contains no ':' and cannot be unpacked into (key, val).
            # Only this specific failure is reported; anything else should surface.
            print("(Key,val) error", count + 1)
            count += 1
        else:
            dict_words[key].append(val.replace('\n', ''))

(Key,val) error 1
(Key,val) error 2
(Key,val) error 3
(Key,val) error 4
(Key,val) error 5
(Key,val) error 6
(Key,val) error 7


In [4]:
# Create a DataFrame holding the collected "review/text" values, one row per value.
df_rev_txt = pd.DataFrame(dict_words["review/text"])


In [5]:
# Count how often every unique word occurs across all reviews.
# L maps each lower-cased token to its total number of occurrences in the data.
# NOTE(review): the text is tokenized as-is, and the top-words table printed further
# down lists 'br' first, which looks like leftover HTML line-break markup. Consider
# stripping markup before tokenizing (the vectorization cell would need the same change).
L = dict()
tknzr = nltk.RegexpTokenizer(r"\w+")  # tokens are runs of word characters

# itertuples avoids building a pandas Series for every row (which iterrows does);
# row[0] is the first field of the row, i.e. the review text.
for row in df_rev_txt.itertuples(index=False):
    words = tknzr.tokenize(row[0].lower())
    for word in words:
        # Accumulate directly; no per-review frequency table is needed.
        L[word] = L.get(word, 0) + 1

In [6]:
# Load the stop-word list: the file content is split on newlines, one entry per line.
# The with-block guarantees the file handle is closed even if reading fails.
with open("stop_.txt", "r") as in_file:
    stopping_words = in_file.read()
stop_words = stopping_words.split("\n")

In [7]:
# Remove stop words from the word-count dictionary L (in place).
# Membership is tested against a set built once instead of scanning the
# stop_words list for every key of L.
stop_word_set = set(stop_words)
keys = list(L.keys())  # snapshot of the keys, because L is modified while looping
for key in keys:
    if key in stop_word_set:
        del L[key]

In [8]:
# Rank the remaining words by count (highest first) and keep the 500 most common.
# sorted() builds a new list, so L itself is left untouched without a deep copy.
# The sort is stable, so words with equal counts keep their relative order from L.
new_L = dict(sorted(L.items(), key=operator.itemgetter(1), reverse=True))
top_500_common = dict(itertools.islice(new_L.items(), 500))

In [9]:
# Print the 500 most common words (top_500_common) with their counts as a text table.

top_500_table = PrettyTable(["Name","count"])
for word in top_500_common:
    top_500_table.add_row([word, top_500_common[word]])

print(top_500_table)


+--------------+--------+
|     Name     | count  |
+--------------+--------+
|      br      | 647112 |
|      t       | 301913 |
|     good     | 200638 |
|    taste     | 172856 |
|   product    | 167739 |
|    great     | 167175 |
|    coffee    | 166782 |
|    flavor    | 148023 |
|     tea      | 138198 |
|     food     | 128513 |
|     love     | 127520 |
|     will     | 127169 |
|    amazon    | 106374 |
|     don      | 91873  |
|      ve      | 85410  |
|     time     | 84767  |
|     buy      | 76916  |
|     best     | 76836  |
|    price     | 75984  |
|     find     | 73520  |
|     well     | 73408  |
|    better    | 70767  |
|     dog      | 69852  |
|     eat      | 67787  |
|     cup      | 62910  |
|    water     | 61618  |
|      2       | 61394  |
|  chocolate   | 61147  |
|     bag      | 58257  |
|    sugar     | 57397  |
|    sweet     | 52708  |
|    drink     | 51176  |
|     box      | 50826  |
|     free     | 50706  |
|      1       | 50457  |
|    bought 

In [10]:
# Ordered list of the top words; a word's position in this list is used as its
# column index when the reviews are vectorized.
words = [*top_500_common]
words

['br',
 't',
 'good',
 'taste',
 'product',
 'great',
 'coffee',
 'flavor',
 'tea',
 'food',
 'love',
 'will',
 'amazon',
 'don',
 've',
 'time',
 'buy',
 'best',
 'price',
 'find',
 'well',
 'better',
 'dog',
 'eat',
 'cup',
 'water',
 '2',
 'chocolate',
 'bag',
 'sugar',
 'sweet',
 'drink',
 'box',
 'free',
 '1',
 'bought',
 'day',
 '3',
 'store',
 'tastes',
 'order',
 '5',
 'bit',
 'recommend',
 'nice',
 'delicious',
 'favorite',
 'flavors',
 'mix',
 'hot',
 'brand',
 'cat',
 'dogs',
 'stuff',
 '4',
 'years',
 'treats',
 'loves',
 'lot',
 'healthy',
 'add',
 'quality',
 'll',
 'didn',
 'organic',
 'chips',
 'ingredients',
 'milk',
 'small',
 'doesn',
 'snack',
 'perfect',
 'pack',
 'ordered',
 'strong',
 'products',
 'keep',
 'bad',
 'eating',
 'sure',
 'easy',
 'treat',
 'salt',
 'long',
 'green',
 'hard',
 'fresh',
 'high',
 'enjoy',
 'bags',
 'oil',
 'definitely',
 'buying',
 'thing',
 'regular',
 'cookies',
 'natural',
 'chicken',
 'thought',
 'size',
 'work',
 'pretty',
 'cups'

In [11]:
# Vectorization: turn every review into a bag-of-words count vector over `words`.
# vector[k] is the number of times words[k] occurs in that review.
vectorized = []
tokenizer = nltk.RegexpTokenizer(r"\w+")
# Map each word to its position once, so every token costs one dictionary lookup
# instead of two scans of the words list (`in` followed by `.index`).
word_to_index = {word: position for position, word in enumerate(words)}
# itertuples avoids building a pandas Series for every row (which iterrows does).
for row in df_rev_txt.itertuples(index=False):
    # One slot per word, sized from the list instead of a hard-coded 500 so the
    # vector length always matches the vocabulary.
    vector = [0] * len(words)

    for token in tokenizer.tokenize(row[0].lower()):
        position = word_to_index.get(token)
        if position is not None:  # tokens outside the vocabulary are ignored
            vector[position] += 1
    vectorized.append(vector)

In [12]:
# Use K-Means to cluster the review vectors into 10 clusters and obtain the centroids.
numpy_vector = np.array(vectorized)  # one row per review vector
# K-Means starts from randomly chosen centroids; fixing random_state makes the
# clustering, and everything derived from it, reproducible across runs.
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(numpy_vector)

KMeans(n_clusters=10)

In [13]:
# For every cluster centroid, collect the n words with the largest centroid values.
knn_centroids = kmeans.cluster_centers_

n = 5
# Negating a centroid before argsort orders its indices from largest to smallest value.
parent_list = [
    [words[j] for j in np.argsort(-centroid)[:n]]
    for centroid in knn_centroids
]

In [14]:
# parent_list holds, for each cluster centroid, its 5 most significant words
# (the words with the largest centroid values).
parent_list

[['coffee', 'cup', 't', 'good', 'taste'],
 ['t', 'good', 'great', 'product', 'taste'],
 ['tea', 'br', 't', 'green', 'flavor'],
 ['br', 't', 'good', 'taste', 'product'],
 ['br', 'water', 'tea', '1', 't'],
 ['br', 't', 'good', 'taste', 'product'],
 ['food', 'br', 'dog', 'cat', 't'],
 ['br', 't', 'food', 'product', 'good'],
 ['coffee', 'br', 'cup', 't', 'taste'],
 ['br', 't', '1', '2', 'good']]