In [2]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
from gensim.corpora import Dictionary
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import math
import numpy as np
import re
import json

In [3]:
model = Word2Vec.load("idwiki_word2vec_200.model")

In [4]:
dataset = []
with open('dataset.json') as json_file:
    dataset = json.load(json_file)

In [5]:
sw_remover = StopWordRemoverFactory().create_stop_word_remover()
stemmer = StemmerFactory().create_stemmer()

In [6]:
def preprocess(document):
    document = sw_remover.remove(document)
    document_stem = stemmer.stem(document).split(" ")
    document_token = [w for w in document_stem if w.isalpha()]
    return document_token

In [7]:
def predict_decease(input_document, docsim_index, dictionary):
    query = preprocess(input_document)
    sims = docsim_index[dictionary.doc2bow(query)]
    predict_result = sims[0]
    
    data = dataset[predict_result[0]]
    weight = predict_result[1]
    
    result_dict = {'data' : data, 'weight': weight}
                   
    return result_dict

In [8]:
s = "Aku ingin mencoba makanan yang sedang dihidangkan mama"
s = sw_remover.remove(s)
tokens = stemmer.stem(s).split(" ")

In [9]:
def doc_similarity(s1, s2, model):
    
    s1 = sw_remover.remove(s1)
    tokens1 = stemmer.stem(s1).split(" ")

    s2 = sw_remover.remove(s2)
    tokens2 = stemmer.stem(s2).split(" ")

    tokens1 = [token for token in tokens1 if token in model.wv]
    tokens2 = [token for token in tokens2 if token in model.wv]
    
#     print(tokens1)
#     print(tokens2)
#     print("------")
    
    if len(tokens1) == 0 or len(tokens2) == 0:
        return 0

    tokfreqs1 = Counter(tokens1)
    tokfreqs2 = Counter(tokens2)
    
#     print("Document 1\n")
#     print(tokfreqs1)
#     print("Document 2\n")
#     print(tokfreqs2)
    
    weights1 = [model.wv[token] / model.wv.vocab[token].count for token in tokfreqs1]
    weights2 = [model.wv[token] / model.wv.vocab[token].count for token in tokfreqs2]
        
    embedding1 = np.average([model.wv[token] for token in tokfreqs1], axis=0, weights=weights1).reshape(1, -1)
    embedding2 = np.average([model.wv[token] for token in tokfreqs2], axis=0, weights=weights2).reshape(1, -1)
    
    sim = cosine_similarity(embedding1, embedding2)[0][0]
        
    return sim

In [171]:
with open('doc1.txt', 'r') as file:
    doc1 = file.read().replace('\n', '')

with open('doc2.txt', 'r') as file:
    doc2 = file.read().replace('\n', '')
    
with open('doc3.txt', 'r') as file:
    doc3 = file.read().replace('\n', '')
    
print(doc_similarity(doc1, doc2, model))
# print(doc_similarity(doc2, doc3, model))
print(doc_similarity(doc1, doc3, model))

0.03939446822973372
0.0266333867430677


In [None]:
for docs in dataset:
    str_gejala = ""
    for gejala in docs['gejala']:
        str_gejala += "." + gejala
    
    predict = []
    max_sim = 0
    for d in dataset:
        str_gejala_test = ""
        for gejala in d['gejala']:
            str_gejala_test += "." + gejala
        
        sim = doc_similarity(str_gejala, str_gejala_test, model)
        if(max_sim < sim) :
            max_sim = sim
            predict.append(d['judul'])

cor = 0
for i in range(len(predict)):
    if(predict[i] == dataset[i]['judul']):
        cor += 1

print(cor / len(predict) * 100)

In [8]:
# Create Term Similarity Index from Word2Vec model

termsim_index = WordEmbeddingSimilarityIndex(model.wv)

# Create Corpus List
corpus_list = []
for data in dataset:
    docs = ""
    for sentence in data['gejala']:
        docs += " " + sentence
    corpus_list.append(docs)

# Create token list for all document corpus
corpus_list_token = [preprocess(doc) for doc in corpus_list]
# 
dictionary = Dictionary(corpus_list_token)
bow_corpus = [dictionary.doc2bow(document) for document in corpus_list_token]

# Create Term similarity matrix
similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)

# Compute Soft Cosine Similarity
docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)

In [27]:
text = "Diare (10 hingga 12 kali per hari) Diare disertai darah.Kram pada perut.Buang air besar yang kental.Gas dalam perut.Gejala yang umum seperti demam, sakit punggung, dan lelah." 
predict = predict_decease(text, docsim_index, dictionary)
predict

  Y = np.multiply(Y, 1 / np.sqrt(Y_norm))
  Y = np.multiply(Y, 1 / np.sqrt(Y_norm))


{'data': {'judul': 'Sepsis Neonatorum',
  'definisi': 'Sepsis Neonatorum adalah suatu infeksi bakteri berat yang menyebar ke seluruh tubuh bayi baru lahir.\nSepsis terjadi pada kurang dari 1% bayi baru lahir tetapi merupakan penyebab dari 30% kematian pada bayi baru lahir.\nInfeksi bakteri 5 kali lebih sering terjadi pada bayi baru lahir yang berat badannya kurang dari 2,75 kg dan 2 kali lebih sering menyerang bayi laki-laki.\nPada lebih dari 50% kasus, sepsis mulai timbul dalam waktu 6 jam setelah bayi lahir, tetapi kebanyakan muncul dalamw aktu 72 jam setelah lahir.\nSepsis yang baru timbul dalam waktu 4 hari atau lebih kemungkinan disebabkan oleh infeksi nasokomial (infeksi yang didapat di rumah sakit).\n',
  'gejala': ['Gangguan pernafasan',
   'Kejang',
   'Jaundice (sakit kuning)',
   'Muntah',
   'Diare',
   'Perut kembung.']},
 'weight': 0.8293212056159973}

In [50]:
sentence_end = re.compile(r'''[.!?]['"]?\s{1,2}(?=)''')
input_sentences = re.split(r"\.|\?|\!",text)

print(docs_similarity(input_sentences, [sentence for sentence in predict['data']['gejala']], model))

['diare', 'hari', 'diare', 'serta', 'darah']
['ganggu', 'nafas']
------
['kram', 'perut']
['kejang']
------
['buang', 'air', 'kental']
['jaundice', 'sakit', 'kuning']
------
['gas', 'perut']
['muntah']
------
['gejala', 'umum', 'demam', 'sakit', 'punggung', 'lelah']
['diare']
------
[]
['perut', 'kembung']
------
[-0.06878397, 0.09454537, -0.04392944, 0.004184945, -0.10668429, 0]


['Gangguan pernafasan',
 'Kejang',
 'Jaundice (sakit kuning)',
 'Muntah',
 'Diare',
 'Perut kembung.']