In [1]:
import os
import ast
import json
import numpy as np
from pymongo import MongoClient
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from processing import cleaning,tokenize,removeStopwords

import warnings
warnings.filterwarnings('ignore')



In [2]:
def processDocument(document):
    
    contentofInterest = document['description'] + document['short_desc']  # COI
    
    cleanedCOI = cleaning(contentofInterest)
    tokenized_COI = tokenize(cleanedCOI)
    COI_without_sw = removeStopwords(tokenized_COI)
    
    return COI_without_sw

In [3]:
# To generate a single vector for a document by combining vectors of the words of the document
def mean_vector(model,wordsTokens:list):
    if len(wordsTokens) > 0:
        return np.mean(model.wv[wordsTokens],axis = 0)

In [4]:
client = MongoClient('localhost',27017)
db = client['eclipse']
collection = db['initial']

In [5]:
CPU_COUNT = os.cpu_count()
print(CPU_COUNT)

4


In [11]:
with open('sentences1.txt','r') as file:
    check_corpus = file.readlines()

In [12]:
len(check_corpus)

13000

In [43]:
# To convert string dictornary to Python dictionary
corpus = []
for i in range(len(check_corpus)):
    doc = check_corpus[i].strip()
    doc = ast.literal_eval(doc)
    corpus.append(list(doc.values())[0])

## Skip gram model

In [45]:
model = Word2Vec(sentences = corpus,
                 window = 20,
                 min_count = 1,
                 sg = 1,
                 sorted_vocab = 1,
                 workers = CPU_COUNT)

In [46]:
model.train(corpus_iterable = corpus,
            total_words = model.corpus_total_words,
            total_examples = len(corpus),
            start_alpha = 0.025,
            epochs = 140)



(91968742, 100932020)

In [47]:
# Saved the skip-gram model
model.save('./trained_sg.model')

In [51]:
bug_doc1 = collection.find_one({'bug_id':'20'})
bug1 = processDocument(bug_doc1)

In [52]:
bug_doc2 = collection.find_one({'bug_id':'40'})
bug2 = processDocument(bug_doc2)

In [59]:
vector1 = mean_vector(model,bug1)
vector2 = mean_vector(model,bug2)

Word Tokens: ['thought', 'would', 'useful', 'set', 'repo', 'connections', 'could', 'stored', 'somewhere', 'external', 'file', 'system', 'instead', 'everyone', 'list', 'cashed', 'workspace', 'thus', 'get', 'new', 'workspace', 'dont', 'need', 'reconstruct', 'information', 'configure', 'workspace', 'point', 'list', 'also', 'means', 'team', 'could', 'share', 'set', 'repo', 'connections', 'pointing', 'file', 'workspace', 'files']
Word Tokens: ['i', 'would', 'like', 'able', 'connect', 'team', 'stream', 'directly', 'add', 'content', 'workspace', 'this', 'could', 'added', 'new', 'repository', 'wizard', 'currently', 'i', 'need', 'least', 'steps', 'populate', 'new', 'workspace', 'team', 'stream', 'add', 'repository', 'open', 'repository', 'browser', 'expand', 'head', 'select', 'children', 'head', 'add', 'workspace', 'moving', 'future', 'at', 'least', 'team', 'stream', 'cannot', 'used', 'container', 'projects', 'operation', 'would', 'meaningful', 'repository', 'add', 'workspace', 'frequent', 'ope

In [54]:
cosine_similarity([vector1],[vector2])

array([[0.871929]], dtype=float32)