In [1]:
# import all the used libraries
import math
import random
from copy import deepcopy
from random import shuffle

import nltk
import numpy as np
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
# Fetch the NLTK resources used below (no-ops when already installed).
nltk.download('stopwords')
# WordNetLemmatizer reads the WordNet corpus; without it lemmatize() raises LookupError.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Malk
[nltk_data]     al\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Read the review corpus from disk.
corpus_root = 'product_reviews/product_reviews'
# Load every .txt file except README.txt. The pattern is a raw string so the
# regex escape `\.` is not parsed as an (invalid) Python string escape.
wordlists = PlaintextCorpusReader(corpus_root, r'(?!README.txt$)(.+\.txt)')
allFiles = wordlists.fileids()
print(allFiles)

['Canon_PowerShot_SD500.txt', 'Canon_S100.txt', 'Diaper_Champ.txt', 'Hitachi_router.txt', 'Linksys_Router.txt', 'MicroMP3.txt', 'Nokia_6600.txt', 'ipod.txt', 'norton.txt']


In [3]:
# Show the first 20 raw tokens of one review file as a sanity check.
norton_preview = wordlists.words("norton.txt")[:20]
print(norton_preview)

['[', 't', ']', 'software', '[-', '3', ']##', 'I', 'have', 'read', 'the', 'installation', 'instructions', 'for', 'both', 'NIS', '2004', 'and', 'NAV', '2004']


# Cleaning the data

In [4]:
def process_document(document: list) -> list:
    """
    Pre-process a document and return a list of cleaned tokens.

    Each input token is lower-cased and reduced to its first run of ASCII
    letters (so 'mp3' becomes 'mp'). Tokens containing no letters at all
    (punctuation, pure numbers) are dropped, as are English stop words.
    Every surviving token is lemmatized before being added to the result.

    Parameters:
        document: iterable of string tokens.

    Returns:
        list of lemmatized, lower-case, alphabetic tokens in input order.

    Errors raised by the tokenizer, stop-word lookup or lemmatizer are
    propagated to the caller instead of being silently discarded.

    list->list
    """
    lemmatizer = WordNetLemmatizer()
    # keep alphabetic runs only (digits and punctuation are discarded)
    tokenizer = nltk.RegexpTokenizer(r'[a-zA-Z]{1,}')
    # a set gives constant-time membership tests inside the loop
    all_stopwords = set(stopwords.words('english'))

    wordsList = []
    for w in document:
        filtered = tokenizer.tokenize(w.lower())
        if not filtered:
            # token had no letters at all, e.g. '[' or '2004'
            continue
        first = filtered[0]
        # skip stop words; lemmatize everything else
        if first not in all_stopwords:
            wordsList.append(lemmatizer.lemmatize(first))
    return wordsList

# print the token count and a sample of the tokenized words
# (the document is processed once and the result reused for both prints)
norton_tokens = process_document(wordlists.words("norton.txt"))
print(len(norton_tokens))
print(norton_tokens[:10])
 
# replace most frequent words and return a new list
def replace_words(indxes, target_words, tokenized_docs, pseudo_offset=None):
    """
    Return a copy of the documents in which half of the occurrences of each
    frequent word are replaced by that word's pseudoword.

    Parameters:
        indxes: one entry per frequent word; entry i is a list of
            [doc_index, token_index] positions where word i occurs.
        target_words: the frequent words followed by their pseudowords, so
            the pseudoword of word i sits at target_words[i + pseudo_offset].
        tokenized_docs: sequence of mutable token sequences; it is deep-copied
            and never modified.
        pseudo_offset: distance from a word to its pseudoword inside
            target_words. Defaults to half the length of target_words.

    Returns:
        the modified deep copy of tokenized_docs.

    The occurrences to replace are chosen with random.shuffle, so results
    depend on the state of the `random` module's global generator.
    """
    if pseudo_offset is None:
        # target_words is laid out as [originals..., pseudowords...]
        pseudo_offset = len(target_words) // 2

    new_docs = deepcopy(tokenized_docs)
    for i, positions in enumerate(indxes):
        # shuffle a copy so the caller's index lists keep their order
        shuffled = list(positions)
        shuffle(shuffled)

        # replace (the floor of) half of the word's occurrences
        half = len(positions) // 2
        replacement = str(target_words[i + pseudo_offset])
        for x, y in shuffled[:half]:
            new_docs[x][y] = replacement

    return new_docs

3358
['software', 'read', 'installation', 'instruction', 'ni', 'nav', 'prior', 'installation', 'still', 'ended']


In [5]:
tokenized_words = []  # flat token list, only used to calculate the most frequent words
tokenized_docs = []   # one numpy array of tokens per file

for file in wordlists.fileids():
    # pre-process each file once and reuse the result for both collections
    doc_tokens = process_document(wordlists.words(file))
    tokenized_words += doc_tokens
    tokenized_docs.append(np.array(doc_tokens))

print(len(tokenized_words))
print(len(tokenized_docs))

38937
9


In [6]:
# Number of most frequent words to track; each one gets a pseudoword twin.
N_FREQUENT = 50

# calculate the most frequent words as (word, count) pairs
most_frequent = FreqDist(tokenized_words).most_common(N_FREQUENT)

# the frequent words themselves, most common first
frequent_words = [word for word, _count in most_frequent]

# create pseudowords: each frequent word spelled backwards
pseudowords = [word[::-1] for word in frequent_words]

# target_words are the rows of the matrix: frequent words followed by their pseudowords
target_words = frequent_words + pseudowords
print("target_words ",len(target_words))

# Collect, for every frequent word, the [document index, token index] of each
# occurrence. Iterating over frequent_words (rather than a fixed-size slice of
# target_words) stays correct if fewer than N_FREQUENT distinct words exist.
indxes = []
for word in frequent_words:
    positions = []
    for doc_idx, doc in enumerate(tokenized_docs):
        for token_idx in np.where(doc == word)[0]:
            positions.append([doc_idx, token_idx])
    indxes.append(positions)

print(len(indxes))

target_words  100
50


In [7]:
def create_matrixX(target_words, tokenized_docs):
    """
    Build the TF-IDF matrix of target words (rows) against documents (columns).

    Entry [i][j] is tf * idf, where
        tf  = occurrences of target_words[i] in document j / length of document j
        idf = log(number of documents / (documents containing the word + 1)) + 1

    Parameters:
        target_words: sequence of words, one matrix row each.
        tokenized_docs: sequence of token sequences (numpy arrays or lists),
            one matrix column each.

    Returns:
        numpy array of shape (len(target_words), len(tokenized_docs)).
        The shape depends only on the arguments, not on notebook globals.
        Columns of empty documents are all zero.
    """
    n_docs = len(tokenized_docs)
    matrixX = np.zeros((len(target_words), n_docs))
    if n_docs == 0:
        return matrixX

    # arrays allow the element-wise `doc == word` comparison used for counting
    docs = [np.asarray(doc) for doc in tokenized_docs]

    for i, word in enumerate(target_words):
        # occurrences of the word in every document (empty documents count 0)
        counts = [np.count_nonzero(doc == word) if doc.size else 0 for doc in docs]

        # idf depends only on the word, so compute it once per row
        # Total number of documents / (Number of documents that contain the word + 1)
        docs_with_word = sum(1 for c in counts if c)
        idf = math.log(n_docs / (docs_with_word + 1)) + 1

        for j, c in enumerate(counts):
            if c:
                tf = c / docs[j].size
                matrixX[i][j] = tf * idf

    return matrixX


In [8]:
# repeat the experiment 10 times
N = 10     # number of repetitions of the experiment
SEED = 0   # seeds the `random` module, whose shuffle picks the replaced occurrences

# target_words holds the frequent words followed by their pseudowords, so the
# number of (word, pseudoword) pairs is half its length.
n_pairs = len(target_words) // 2

# Seed once so the whole series of runs is reproducible; the individual runs
# still differ from each other because the generator state keeps advancing.
random.seed(SEED)

pList = []
for run in range(N):
    # returns a new list with randomly replaced words
    new_docs = replace_words(indxes, target_words, tokenized_docs)

    # create the matrix
    matrixX = create_matrixX(target_words, new_docs)

    # normalize the matrix: axis=0 yields one L2 norm per column, and each
    # column is divided by its own norm
    feature_norms = np.linalg.norm(matrixX, axis=0)
    normalized = matrixX / feature_norms

    # cluster the rows of the matrix, one cluster per expected pair
    kmeans = KMeans(n_clusters=n_pairs, random_state=0).fit(normalized)
    pred = kmeans.labels_

    # checks for empty clusters (cluster ids that no row was assigned to)
    empty_clusters = [cluster for cluster in range(n_pairs) if cluster not in pred]
    print("\nempty clusters: ", len(empty_clusters))

    # count correct clustering: a word and its pseudoword share a cluster
    count = sum(1 for k in range(n_pairs) if pred[k] == pred[k + n_pairs])

    print("correct pairs: ",count)
    p = (count / n_pairs) * 100
    print("p: ", p, "%")

    pList.append(p)

print()
# calculate the mean and standard deviation for P
mean = np.mean(pList)
std = np.std(pList)
print("mean: ", mean)
print("standard deviation: ", std)



empty clusters:  0
correct pairs:  25
p:  50.0 %

empty clusters:  0
correct pairs:  25
p:  50.0 %

empty clusters:  0
correct pairs:  28
p:  56.00000000000001 %

empty clusters:  0
correct pairs:  27
p:  54.0 %

empty clusters:  0
correct pairs:  26
p:  52.0 %

empty clusters:  0
correct pairs:  27
p:  54.0 %

empty clusters:  0
correct pairs:  28
p:  56.00000000000001 %

empty clusters:  0
correct pairs:  22
p:  44.0 %

empty clusters:  0
correct pairs:  32
p:  64.0 %

empty clusters:  0
correct pairs:  28
p:  56.00000000000001 %

mean:  53.6
standard deviation:  4.963869458396343
