In [37]:
# %pip install -r requirements.txt

In [38]:
# case folding
import json
import re

# Load the raw documents. JSON text is UTF-8, so decode it explicitly instead
# of relying on the platform's default locale encoding.
with open('data/documents-id.json', encoding='utf-8') as abstracts_json:
  abstracts = json.load(abstracts_json)

# Case folding: lower-case each abstract, then collapse every run of
# characters outside a-z (digits, punctuation, whitespace, non-ASCII letters)
# into a single space.
for abstract in abstracts:
  abstract['abstrak'] = re.sub('[^a-z]+', ' ', abstract['abstrak'].casefold())

# json.dumps escapes non-ASCII by default, so the output file is plain ASCII.
with open('data/case_folded.json', 'w') as outfile:
  outfile.write(json.dumps(abstracts, sort_keys=True, indent=4))

print('Case folding done')

Case folding done


In [39]:
# stemming
import json
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from mpstemmer import MPStemmer

# Read the case-folded abstracts written by the previous step.
with open('data/case_folded.json') as source_file:
  abstracts = json.load(source_file)

# Sastrawi stemmer. MPStemmer (imported above) is an alternative that is not
# used here; its equivalent call would be stemmer.stem_kalimat(text).
stemmer = StemmerFactory().create_stemmer()

# Overwrite each abstract with its stemmed text.
for document in abstracts:
  document['abstrak'] = stemmer.stem(document['abstrak'])

with open('data/stemmed_abstracts.json', 'w') as target_file:
  json.dump(abstracts, target_file, sort_keys=True, indent=4)



KeyboardInterrupt: 

In [None]:
# filtering
import json
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

with open('data/stemmed_abstracts.json') as json_file:
  abstracts = json.load(json_file)

# Extra stop words to remove on top of Sastrawi's built-in list.
additional_stopwords = []

stopwords = StopWordRemoverFactory().get_stop_words() + additional_stopwords
# Membership is tested once per token, so look words up in a set instead of
# scanning the list every time.
stopword_set = set(stopwords)

for abstract in abstracts:
  kept_words = [word for word in abstract['abstrak'].split()
                if word not in stopword_set]
  # Every kept word is followed by one space, so a non-empty result ends with
  # a trailing space.
  abstract['abstrak'] = ''.join(word + ' ' for word in kept_words)

with open('data/filtered.json', 'w') as outfile:
  outfile.write(json.dumps(abstracts, sort_keys=True, indent=4))

In [None]:
import json
import nltk

# Make sure the 'punkt' tokenizer data used by nltk.word_tokenize is available.
nltk.download('punkt')

with open('data/filtered.json') as source_file:
  abstracts = json.load(source_file)

# Attach a token -> occurrence-count table to every document under the
# 'frequency' key.
for document in abstracts:
  tokens = nltk.word_tokenize(document['abstrak'])
  document['frequency'] = nltk.FreqDist(tokens)

with open('data/tokenized.json', 'w') as target_file:
  target_file.write(json.dumps(abstracts, sort_keys=True, indent=4))

[nltk_data] Downloading package punkt to /Users/brilyyy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# term frequency
import json

with open('data/tokenized.json') as source_file:
  abstracts = json.load(source_file)

# Turn each document's raw counts into relative frequencies:
# count / number of whitespace-separated words in the abstract.
for document in abstracts:
  total_words = float(len(document['abstrak'].split()))
  relative = {}
  for term, occurrences in document['frequency'].items():
    share = occurrences / total_words
    # Only non-zero ratios are stored.
    if share:
      relative[term] = share
  document['frequency'] = relative

with open('data/termfrequency.json', 'w') as target_file:
  target_file.write(json.dumps(abstracts, sort_keys=True, indent=4))


In [None]:
# inverse document frequency
import json
import math


from collections import Counter

# A term is kept only if it appears in more than this many documents.
threshold = 10

with open('data/tokenized.json') as json_file:
  abstracts = json.load(json_file)

abstract_length = len(abstracts)

# Document frequency: in how many documents each term occurs. It is counted
# once, up front, from the untouched input, so the loop below can overwrite
# 'frequency' without affecting the counts.
document_frequency = Counter()
for abstract in abstracts:
  document_frequency.update(abstract['frequency'].keys())

# Replace each document's term table with the IDF weight of its terms,
# log10(N / df + 1), dropping terms at or below the threshold.
for abstract in abstracts:
  abstract['frequency'] = {
    word: math.log10(abstract_length / float(document_frequency[word]) + 1)
    for word in abstract['frequency']
    if document_frequency[word] > threshold
  }

with open('data/idf.json', 'w') as outfile:
  outfile.write(json.dumps(abstracts, sort_keys=True, indent=4))


In [None]:
# tfidf
import json

with open('data/idf.json') as idf_json:
  idfs = json.load(idf_json)

with open('data/termfrequency.json') as tf_json:
  tfs = json.load(tf_json)

# Both files are read as parallel lists: entry i of `tfs` is used with entry i
# of `idfs`.
for i, idf in enumerate(idfs):
  term_frequencies = tfs[i]['frequency']
  tfidf = {}
  maxtfidf = 0
  # Reset for every document. Otherwise a document with no positive score
  # would inherit the previous document's word, or raise NameError if it is
  # the first one. Such documents are written with a null 'maxtfidfword'.
  maxtfidfword = None
  for word, val in idf['frequency'].items():
    tf_value = term_frequencies.get(word)
    if tf_value is not None:
      tfidf[word] = val * float(tf_value)
      if maxtfidf < tfidf[word]:
        maxtfidf = tfidf[word]
        maxtfidfword = word
  idfs[i]['frequency'] = tfidf
  idfs[i]['maxtfidfword'] = maxtfidfword

with open('data/tfidf.json', 'w') as json_file:
  json_file.write(json.dumps(idfs, sort_keys=True, indent=4))



In [None]:
# creating dictionary
import json

with open('data/tfidf.json') as json_file:
  documents = json.load(json_file)

# Collect every distinct term in first-seen order. Dict keys give ordered
# de-duplication with constant-time membership, instead of scanning a growing
# list for every term.
unique_words = {}
for document in documents:
  unique_words.update(dict.fromkeys(document['frequency']))

words = list(unique_words)

# Map a running integer id to each term; JSON serialisation turns the ids
# into string keys.
words_dict = dict(enumerate(words))

with open('data/dictionary.json', 'w') as outfile:
  outfile.write(json.dumps(words_dict, sort_keys=True, indent=4))




In [None]:
# replace word with dictionary id

import json

with open('data/dictionary.json') as json_file:
  words = json.load(json_file)
with open('data/tfidf.json') as json_file:
  documents = json.load(json_file)

# Reverse index (word -> id), built once so each lookup is a single hash probe
# instead of a scan over the whole dictionary. setdefault keeps the first id
# if a word were ever listed twice.
word_ids = {}
for key, value in words.items():
  word_ids.setdefault(value, key)


def get_key(val):
  """Return the dictionary id stored for the word `val`, or None if absent."""
  return word_ids.get(val)


# Re-key every document's term table by dictionary id; words missing from the
# dictionary are dropped.
for document in documents:
  document['frequency'] = {
    word_ids[word]: val
    for word, val in document['frequency'].items()
    if word in word_ids
  }

with open('data/dictionarized.json', 'w') as outfile:
  outfile.write(json.dumps(documents, sort_keys=True, indent=4))


In [43]:
# hierarchical clustering
import json
import numpy as np
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
import scipy.cluster.vq as scv
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from pandas import DataFrame
from sklearn.neighbors import NearestCentroid
from sklearn.manifold import MDS
from sklearn_extra.cluster import KMedoids

# Number of clusters requested from both algorithms.
N_CLUSTERS = 23

with open('data/dictionary.json') as dictionary_file:
    dictionary = json.load(dictionary_file)

with open('data/dictionarized.json') as documents_file:
    documents = json.load(documents_file)

print('Calculating Clusters')

# Dense document-term matrix: one row per document, one column per dictionary
# id, holding the stored weight or 0 when the document lacks that id.
term_ids = list(dictionary.keys())
array = []
for document in documents:
    weights = document['frequency']
    row = []
    for term_id in term_ids:
        weight = weights.get(term_id)
        row.append(0 if weight is None else weight)
    array.append(row)

X = np.array(array)

titles = [document['judul'] for document in documents]

# Ward-linkage agglomerative clustering and k-medoids on the same matrix.
agglomerative = AgglomerativeClustering(n_clusters=N_CLUSTERS, linkage='ward')
hierachical = agglomerative.fit_predict(X)
kmedoids = KMedoids(n_clusters=N_CLUSTERS, max_iter=1000).fit(X)

dataframe = DataFrame({'Cluster': hierachical, 'Title': titles})
kmeds_dataframe = DataFrame({'Cluster': kmedoids.labels_.tolist(), 'Title': titles})

# Export each assignment sorted by cluster label, one spreadsheet per algorithm.
sorted_data_frame = dataframe.sort_values(by=['Cluster'])
sorted_data_frame.to_excel('hierarchical_clustering.xlsx', index=False)

sorted_data_frame = kmeds_dataframe.sort_values(by=['Cluster'])
sorted_data_frame.to_excel('kmeds_clustering.xlsx', index=False)


Calculating Clusters
