In [1]:
# import all the libraries used in this notebook
import numpy as np
import math
from sklearn.cluster import KMeans
import gensim
from gensim.models import Word2Vec

from random import shuffle
from copy import deepcopy

import nltk
from nltk.probability import FreqDist
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Malk
[nltk_data]     al\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Read the review corpus from disk.
corpus_root = 'product_reviews/product_reviews'
# Load every .txt file except README.txt.  The pattern is a raw string so that
# `\.` reaches the regex engine as written instead of being parsed as an
# (invalid) string escape sequence; the resulting pattern is unchanged.
wordlists = PlaintextCorpusReader(corpus_root, r'(?!README.txt$)(.+\.txt)')
allFiles = wordlists.fileids()

# Print the names of the files that were picked up.
print(allFiles)

['Canon_PowerShot_SD500.txt', 'Canon_S100.txt', 'Diaper_Champ.txt', 'Hitachi_router.txt', 'Linksys_Router.txt', 'MicroMP3.txt', 'Nokia_6600.txt', 'ipod.txt', 'norton.txt']


In [3]:
# returns a list of tokenized words
def tokenize_words(document: list) -> list:
    """Pre-process a sequence of raw tokens and return the cleaned words.

    Every item of ``document`` is lower-cased and matched against
    ``[a-zA-Z]{2,}`` (runs of two or more letters; digits and punctuation
    are dropped).  Only the first match of each item is considered.  The
    item is skipped when there is no match or when that first match is an
    English stop word; otherwise its lemma is appended to the result.

    Parameters
    ----------
    document : iterable of str
        Raw tokens, e.g. the output of ``wordlists.words(file)``.

    Returns
    -------
    list of str
        Lemmatized, stop-word-free tokens in their original order.

    Notes
    -----
    Errors raised by the lemmatizer are no longer swallowed; only the
    "no alphabetic match" case is skipped on purpose.
    """
    # Keep alphabetic runs of at least two letters.
    tokenizer = nltk.RegexpTokenizer(r'[a-zA-Z]{2,}')
    lemmatizer = WordNetLemmatizer()
    # A set gives constant-time membership tests inside the loop.
    all_stopwords = set(stopwords.words('english'))

    wordsList = []
    for w in document:
        filtired = tokenizer.tokenize(w.lower())
        if not filtired:
            # Nothing alphabetic in this token (punctuation, digits, single letter).
            continue
        first = filtired[0]
        if first not in all_stopwords:
            # Add the word to the list after lemmatization.
            wordsList.append(lemmatizer.lemmatize(first))
    return wordsList
# returns a list of tokenized sentences
def tokenize_sentence(document: list, min_length = 1) -> list:
    """Pre-process a sequence of sentences and return them as word arrays.

    Each sentence is an iterable of raw tokens.  Every token is lower-cased
    and matched against ``[a-zA-Z]{2,}``; only the first match is used.  A
    token is skipped when it has no match or when that match is an English
    stop word; otherwise its lemma is kept.  Sentences that end up with
    fewer than ``min_length`` words are discarded.

    Parameters
    ----------
    document : iterable of iterable of str
        Sentences, e.g. the output of ``wordlists.sents(file)``.
    min_length : int, optional
        Minimum number of kept words a sentence needs to be returned
        (default 1).

    Returns
    -------
    list of numpy.ndarray
        One string array per retained sentence, in input order.

    Notes
    -----
    Errors raised by the lemmatizer are no longer swallowed; only the
    "no alphabetic match" case is skipped on purpose.
    """
    # These helpers do not depend on the sentence, so build them once.
    # Keep alphabetic runs of at least two letters.
    tokenizer = nltk.RegexpTokenizer(r'[a-zA-Z]{2,}')
    lemmatizer = WordNetLemmatizer()
    # A set gives constant-time membership tests inside the loops.
    all_stopwords = set(stopwords.words('english'))

    sents = []
    for sent in document:
        words = []
        for w in sent:
            filtired = tokenizer.tokenize(w.lower())
            if not filtired:
                # Nothing alphabetic in this token.
                continue
            first = filtired[0]
            if first not in all_stopwords:
                # Add the word to the list after lemmatization.
                words.append(lemmatizer.lemmatize(first))
        if len(words) >= min_length:
            sents.append(np.array(words))

    return sents
    
# Sanity check on one file: show the first raw sentence, then the number of
# tokenized sentences and the first two of them.  The file is tokenized once
# and the result reused for both prints.
norton_raw_sents = wordlists.sents("norton.txt")
norton_tokenized = tokenize_sentence(norton_raw_sents)

print(norton_raw_sents[0],"\n")
print(len(norton_tokenized))
print(norton_tokenized[:2])

['[', 't', ']', 'software', '[-', '3', ']##', 'I', 'have', 'read', 'the', 'installation', 'instructions', 'for', 'both', 'NIS', '2004', 'and', 'NAV', '2004', 'prior', 'to', 'installation', ',', 'but', 'still', 'ended', 'up', 'with', 'the', 'same', 'result', '...', 'junk', 'software', '.'] 

379
[array(['software', 'read', 'installation', 'instruction', 'ni', 'nav',
       'prior', 'installation', 'still', 'ended', 'result', 'junk',
       'software'], dtype='<U12'), array(['install', 'type', 'software', 'installs', 'work', 'properly'],
      dtype='<U8')]


In [4]:
# Flat list of every cleaned word across all corpus files.
# It is only used to calculate the most frequent words.
tokenized_words = []

for file in wordlists.fileids():
    tokenized_words.extend(tokenize_words(wordlists.words(file)))

print(len(tokenized_words))

38334


In [5]:
# Calculate the 50 most frequent words as (word, count) pairs.
most_frequent = FreqDist(tokenized_words).most_common(50)
print(most_frequent)

# Create the pseudowords: each one is a frequent word spelled backwards.
pseudowords = [pair[0][::-1] for pair in most_frequent]

# target_words are the 100 words in the matrix:
# the 50 frequent words first, followed by their 50 pseudowords.
target_words = [pair[0] for pair in most_frequent] + pseudowords
print(len(target_words))

[('use', 353), ('phone', 351), ('one', 337), ('router', 337), ('ipod', 329), ('camera', 322), ('player', 313), ('get', 274), ('battery', 264), ('diaper', 231), ('product', 214), ('work', 211), ('like', 196), ('great', 192), ('time', 186), ('feature', 183), ('problem', 179), ('quality', 176), ('good', 176), ('zen', 175), ('would', 158), ('also', 156), ('sound', 153), ('computer', 150), ('software', 148), ('picture', 141), ('well', 138), ('really', 136), ('micro', 136), ('take', 128), ('easy', 125), ('even', 123), ('thing', 123), ('first', 121), ('used', 120), ('need', 119), ('creative', 118), ('bag', 116), ('want', 115), ('much', 115), ('better', 114), ('mp', 113), ('champ', 113), ('look', 110), ('go', 106), ('size', 106), ('music', 105), ('norton', 104), ('little', 101), ('price', 99)]
100


In [6]:
# replace most frequent words
def replace_words(indxes, target_words, sents=None):
    """Return a copy of the sentences with half of each word's occurrences
    replaced by its pseudoword.

    Parameters
    ----------
    indxes : list of list of [int, int]
        ``indxes[i]`` lists the ``[sentence_index, word_index]`` positions
        of the i-th target word.
    target_words : list of str
        The real words followed by their pseudowords; the pseudoword of
        ``target_words[i]`` is ``target_words[i + len(target_words) // 2]``.
    sents : list, optional
        Sentences to copy and modify, indexable as ``sents[x][y]``.  When
        omitted, the notebook-level ``tokenized_sents`` is used, which must
        already be defined at call time.

    Returns
    -------
    list
        A deep copy of ``sents`` with the replacements applied; the input
        is left untouched.

    Notes
    -----
    For each word, ``len(positions) // 2`` positions are picked at random
    with ``random.shuffle``, so results vary between calls unless the
    ``random`` module is seeded.
    """
    if sents is None:
        # Backward-compatible default: fall back to the notebook global.
        sents = tokenized_sents
    new_sents = deepcopy(sents)

    # Pseudowords occupy the second half of target_words.
    offset = len(target_words) // 2

    # Loop over the position lists of all most frequent words.
    for i, positions in enumerate(indxes):
        # Shuffle a copy and keep the first half as the positions to replace.
        middel = len(positions) // 2
        temp_indxes = list(positions)
        shuffle(temp_indxes)
        for x, y in temp_indxes[:middel]:
            # NOTE(review): if the sentences are fixed-width numpy string
            # arrays, a replacement longer than the array's width would
            # presumably be truncated - confirm if pseudowords ever differ
            # in length from the word they replace.
            new_sents[x][y] = target_words[i + offset]

    return new_sents
        

In [7]:
# creates a matrix from word2vec weights
def create_matrixX(target_words, model):
    """Look up the vector of every target word in a trained model.

    Parameters
    ----------
    target_words : iterable of str
        Words to look up, in the order the rows should appear.
    model : object
        Anything exposing a ``wv`` mapping indexable by word.

    Returns
    -------
    list
        ``model.wv[word]`` for each word, in the order of ``target_words``.
    """
    return [model.wv[word] for word in target_words]

In [8]:
# Tokenized sentences from all documents, gathered into one list.
tokenized_sents = []

for file in wordlists.fileids():
    tokenized_sents.extend(tokenize_sentence(wordlists.sents(file), 1))

print(len(tokenized_sents))

#
# Get the positions of the most frequent words:
# indxes[k] holds every [sentence_index, word_index] pair at which the
# k-th of the first 50 target words occurs.
#
indxes = []
for word in target_words[:50]:
    positions = [
        [sent_idx, word_idx]
        for sent_idx, sent in enumerate(tokenized_sents)
        for word_idx in np.where(sent == word)[0]
    ]
    indxes.append(positions)

print(len(indxes))

# checks if indxes is correct
# for i in range(50):
#     if len(indxes[i]) != most_frequent[i][1]:
#         print(i)

4244
50


In [9]:
# Replace half of each target word's occurrences with its pseudoword, then
# convert every sentence from a numpy array to a normal python list.
new_sents = [sent.tolist() for sent in replace_words(indxes, target_words)]

print(new_sents[:3])

[['sd', 'yllaer', 'enjoyed', 'shooting', 'canon', 'powershot', 'sd'], ['design', 'exterior', 'design', 'combine', 'form', 'function', 'elegantly', 'point', 'shoot', 'ever', 'tested'], ['image', 'processing', 'system', 'digic', 'ii', 'powered', 'image', 'processing', 'system', 'enables', 'sd', 'snap', 'limitless', 'stream', 'megapixel', 'photo', 'respectable', 'clip', 'start', 'emit', 'top', 'class', 'delivers', 'decent', 'photo', 'compared', 'competition']]


In [10]:
# Repeat the experiment 10 times.
N = 10
# Number of (word, pseudoword) pairs; target_words holds the words first and
# their pseudowords second.
n_pairs = len(target_words) // 2
pList = []
for run in range(N):
    # Replace target_words and convert each sentence to a normal python list.
    new_sents = [sent.tolist() for sent in replace_words(indxes, target_words)]

    # Create the word2vec model.
    model = Word2Vec(new_sents, min_count = 11, vector_size = 500, window = 7, sg=1)

    # Create a matrix from the word2vec weights.
    matrixX = create_matrixX(target_words, model)

    # Cluster the matrix, one cluster per pair.
    kmeans = KMeans(n_clusters=n_pairs, random_state=0).fit(matrixX)
    pred = kmeans.labels_

    # Check for empty clusters.
    empty_clusters = [c for c in range(n_pairs) if c not in pred]
    print("\nempty clusters: ", len(empty_clusters))

    # Count correct clustering: a pair is correct when the word and its
    # pseudoword received the same cluster label.
    count = int(np.sum(pred[:n_pairs] == pred[n_pairs:2 * n_pairs]))

    print("correct pairs: ",count)
    p = (count/n_pairs)*100
    print("p: ", p, "%")

    pList.append(p)


print()
mean = np.mean(pList)
std = np.std(pList)
print("mean: ", mean)
print("standard deviation: ", std)


empty clusters:  0
correct pairs:  28
p:  56.00000000000001 %

empty clusters:  0
correct pairs:  23
p:  46.0 %

empty clusters:  0
correct pairs:  26
p:  52.0 %

empty clusters:  0
correct pairs:  29
p:  57.99999999999999 %

empty clusters:  0
correct pairs:  27
p:  54.0 %

empty clusters:  0
correct pairs:  28
p:  56.00000000000001 %

empty clusters:  0
correct pairs:  30
p:  60.0 %

empty clusters:  0
correct pairs:  25
p:  50.0 %

empty clusters:  0
correct pairs:  22
p:  44.0 %

empty clusters:  0
correct pairs:  29
p:  57.99999999999999 %

mean:  53.4
standard deviation:  5.0635955604688645


# Testing window size

In [11]:
# Testing the best window size, from 1 to 12.
# Number of (word, pseudoword) pairs; target_words holds the words first and
# their pseudowords second.
n_pairs = len(target_words) // 2
for window_size in range(1,13):
    # Repeat the experiment 10 times.
    N = 10
    pList = []
    for run in range(N):
        # Replace target_words and convert each sentence to a normal python list.
        new_sents = [sent.tolist() for sent in replace_words(indxes, target_words)]

        model = Word2Vec(new_sents, vector_size = 500, window = window_size, sg=1)

        matrixX = create_matrixX(target_words, model)
        kmeans = KMeans(n_clusters=n_pairs, random_state=0).fit(matrixX)
        pred = kmeans.labels_

        # A pair is correct when the word and its pseudoword share a label.
        count = int(np.sum(pred[:n_pairs] == pred[n_pairs:2 * n_pairs]))

        p = (count/n_pairs)*100
        pList.append(p)


    print()
    print("size: ",window_size)
    mean = np.mean(pList)
    std = np.std(pList)
    print("mean: ", mean)
    print("standard deviation: ", std)


size:  1
mean:  21.4
standard deviation:  4.8207883172775805

size:  2
mean:  42.8
standard deviation:  3.9191835884530843

size:  3
mean:  52.2
standard deviation:  3.1559467676119017

size:  4
mean:  56.0
standard deviation:  4.289522117905443

size:  5
mean:  56.0
standard deviation:  4.6475800154488995

size:  6
mean:  59.8
standard deviation:  6.029925372672534

size:  7
mean:  57.8
standard deviation:  6.660330322138685

size:  8
mean:  52.2
standard deviation:  3.3999999999999995

size:  9
mean:  50.6
standard deviation:  4.004996878900159

size:  10
mean:  50.6
standard deviation:  4.651881339845203

size:  11
mean:  49.6
standard deviation:  4.363484845854286

size:  12
mean:  46.6
standard deviation:  6.069596362197408


# Testing vector size

In [12]:
# Testing the best vector size, from 10 to 120 in steps of 10.
# Number of (word, pseudoword) pairs; target_words holds the words first and
# their pseudowords second.
n_pairs = len(target_words) // 2
for vector_size in range(10, 130, 10):
    # Repeat the experiment 10 times.
    N = 10
    pList = []
    for run in range(N):
        # Replace target_words and convert each sentence to a normal python list.
        new_sents = [sent.tolist() for sent in replace_words(indxes, target_words)]

        model = Word2Vec(new_sents, vector_size = vector_size, window = 7, sg=1)

        matrixX = create_matrixX(target_words, model)
        kmeans = KMeans(n_clusters=n_pairs, random_state=0).fit(matrixX)
        pred = kmeans.labels_

        # A pair is correct when the word and its pseudoword share a label.
        count = int(np.sum(pred[:n_pairs] == pred[n_pairs:2 * n_pairs]))

        p = (count/n_pairs)*100
        pList.append(p)


    print()
    print("size: ",vector_size)
    mean = np.mean(pList)
    std = np.std(pList)
    print("mean: ", mean)
    print("standard deviation: ", std)


size:  10
mean:  36.8
standard deviation:  6.939740629158989

size:  20
mean:  47.0
standard deviation:  5.0

size:  30
mean:  50.4
standard deviation:  5.043808085167396

size:  40
mean:  57.0
standard deviation:  7.113367697511496

size:  50
mean:  51.8
standard deviation:  4.237924020083421

size:  60
mean:  54.6
standard deviation:  4.737087712930805

size:  70
mean:  57.4
standard deviation:  4.386342439892262

size:  80
mean:  52.8
standard deviation:  4.4

size:  90
mean:  55.2
standard deviation:  5.306599664568639

size:  100
mean:  57.0
standard deviation:  5.0

size:  110
mean:  57.2
standard deviation:  4.1182520563948

size:  120
mean:  57.4
standard deviation:  4.386342439892262


In [13]:
# Testing the best vector size, from 100 to 1200 in steps of 100.
# Number of (word, pseudoword) pairs; target_words holds the words first and
# their pseudowords second.
n_pairs = len(target_words) // 2
for vector_size in range(100, 1300, 100):
    # Repeat the experiment 10 times.
    N = 10
    pList = []
    for run in range(N):
        # Replace target_words and convert each sentence to a normal python list.
        new_sents = [sent.tolist() for sent in replace_words(indxes, target_words)]

        model = Word2Vec(new_sents, vector_size = vector_size, window = 7, sg=1)

        matrixX = create_matrixX(target_words, model)
        kmeans = KMeans(n_clusters=n_pairs, random_state=0).fit(matrixX)
        pred = kmeans.labels_

        # A pair is correct when the word and its pseudoword share a label.
        count = int(np.sum(pred[:n_pairs] == pred[n_pairs:2 * n_pairs]))

        p = (count/n_pairs)*100
        pList.append(p)


    print()
    print("size: ",vector_size)
    mean = np.mean(pList)
    std = np.std(pList)
    print("mean: ", mean)
    print("standard deviation: ", std)


size:  100
mean:  56.2
standard deviation:  4.044749683231337

size:  200
mean:  59.0
standard deviation:  4.02492235949962

size:  300
mean:  56.6
standard deviation:  8.296987405076615

size:  400
mean:  54.6
standard deviation:  5.8

size:  500
mean:  54.6
standard deviation:  5.219195340279955

size:  600
mean:  55.2
standard deviation:  2.9933259094191507

size:  700
mean:  53.4
standard deviation:  4.983974317750845

size:  800
mean:  54.6
standard deviation:  5.370288632839021

size:  900
mean:  53.6
standard deviation:  4.630334761116088

size:  1000
mean:  55.4
standard deviation:  7.432361670424818

size:  1100
mean:  50.8
standard deviation:  3.2496153618543855

size:  1200
mean:  54.0
standard deviation:  6.752777206453653


# Testing minimum word count (`min_count`)

In [14]:
# Testing the minimum word count, from 1 to 24.
# Number of (word, pseudoword) pairs; target_words holds the words first and
# their pseudowords second.
n_pairs = len(target_words) // 2
for min_count in range(1,25):
    # Repeat the experiment 5 times.
    N = 5
    pList = []
    for run in range(N):
        # Replace target_words and convert each sentence to a normal python list.
        new_sents = [sent.tolist() for sent in replace_words(indxes, target_words)]

        model = Word2Vec(new_sents, min_count = min_count, vector_size = 600, window = 7, sg=1)

        matrixX = create_matrixX(target_words, model)
        kmeans = KMeans(n_clusters=n_pairs, random_state=0).fit(matrixX)
        pred = kmeans.labels_

        # A pair is correct when the word and its pseudoword share a label.
        count = int(np.sum(pred[:n_pairs] == pred[n_pairs:2 * n_pairs]))

        p = (count/n_pairs)*100
        pList.append(p)


    print()
    print("size: ",min_count)
    mean = np.mean(pList)
    std = np.std(pList)
    print("mean: ", mean)
    print("standard deviation: ", std)


size:  1
mean:  53.2
standard deviation:  2.7129319932501086

size:  2
mean:  55.6
standard deviation:  5.571355310873647

size:  3
mean:  54.0
standard deviation:  5.215361924162118

size:  4
mean:  55.2
standard deviation:  4.664761515876241

size:  5
mean:  53.6
standard deviation:  3.4409301068170497

size:  6
mean:  53.6
standard deviation:  1.9595917942265442

size:  7
mean:  53.2
standard deviation:  4.118252056394799

size:  8
mean:  55.2
standard deviation:  2.9933259094191538

size:  9
mean:  56.0
standard deviation:  4.381780460041329

size:  10
mean:  57.2
standard deviation:  2.712931993250107

size:  11
mean:  54.8
standard deviation:  4.118252056394796

size:  12
mean:  52.4
standard deviation:  5.276362383309165

size:  13
mean:  53.2
standard deviation:  4.833218389437829

size:  14
mean:  54.4
standard deviation:  1.4966629547095796

size:  15
mean:  55.2
standard deviation:  6.764613810115105

size:  16
mean:  54.4
standard deviation:  2.653299832284319

size:  17
m