In [None]:
# Install the packages listed in requirements.txt (path is relative to the
# working directory) before any of the cells below import them.
%pip install -r requirements.txt

In [None]:
import progressbar


def progress_bar(maxval=78):
    """Build the console progress bar used by the pipeline cells.

    Args:
        maxval: Value at which the bar counts as complete. Defaults to 78,
            the value that used to be hard-coded, so existing
            ``progress_bar()`` calls behave exactly as before.

    Returns:
        A ``progressbar.ProgressBar`` that has not been started yet. It shows
        an elapsed-time timer, a bar drawn with ``0`` characters and an ETA.
    """
    widgets = [
        ' [', progressbar.Timer(), '] ',
        progressbar.Bar(marker='0', left='[', right=']'),
        ' (', progressbar.ETA(), ') ',
    ]
    return progressbar.ProgressBar(maxval=maxval, widgets=widgets)


In [4]:
# case folding
import json
import re
import time
import sys
import concurrent.futures

# Load the raw documents; every entry must carry an 'abstract' string.
with open('data/documents.json') as source_file:
  abstracts = json.load(source_file)

# The bar is never advanced inside the loop: it only reports the elapsed
# time once finish() is called.
bar = progress_bar()
bar.start()

# Lower-case each abstract, then collapse every run of characters outside
# a-z (digits, punctuation, non-ASCII letters, whitespace) into one space.
non_letters = re.compile('[^a-z]+')
for entry in abstracts:
  entry['abstract'] = non_letters.sub(' ', entry['abstract'].casefold())

with open('data/case_folded.json', 'w') as target_file:
  json.dump(abstracts, target_file, sort_keys=True, indent=4)

bar.finish()
print('Case folding done')

 [Elapsed Time: 0:00:00] [00000000000000000000000000000000000] (Time: 0:00:00) 

Case folding done





In [6]:
# translate if english
import json
from langdetect import detect
import translators as ts

from langdetect import DetectorFactory, LangDetectException

# langdetect is non-deterministic on short or ambiguous text unless seeded.
DetectorFactory.seed = 0

with open('data/case_folded.json') as case_folded_json:
  abstracts = json.load(case_folded_json)

# Create this cell's own bar instead of finishing one left over from an
# earlier cell, so the cell also works on a fresh kernel. The bar is not
# advanced in the loop; it only reports elapsed time on finish().
bar = progress_bar()
bar.start()

for entry in abstracts:
  text = entry['abstract']
  try:
    language = detect(text)
  except LangDetectException:
    # Raised when the text has no usable features (e.g. an abstract that is
    # empty after case folding); leave such entries untouched.
    continue
  if language == 'en':
    # English abstracts are translated to Indonesian ('id').
    entry['abstract'] = ts.google(text, from_language='en', to_language='id')

with open('data/translated.json', 'w') as outfile:
  outfile.write(json.dumps(abstracts, sort_keys=True, indent=4))

bar.finish()

 [Elapsed Time: 0:00:01] [00000000000000000000000000000000000] (Time: 0:00:01) 


In [7]:
# case folding again
import json
import re
import time
import sys
import concurrent.futures


# Second case-folding pass: translated abstracts can contain upper-case
# letters and punctuation again, so they are normalised a second time.
with open('data/translated.json') as abstracts_json:
  abstracts = json.load(abstracts_json)

# Create this cell's own bar instead of finishing one left over from an
# earlier cell, so the cell also works on a fresh kernel. The bar is not
# advanced in the loop; it only reports elapsed time on finish().
bar = progress_bar()
bar.start()

for entry in abstracts:
  # Lower-case, then collapse every run of non a-z characters into one space.
  entry['abstract'] = re.sub('[^a-z]+', ' ', entry['abstract'].casefold())

# NOTE: this overwrites the case_folded.json written by the first pass.
with open('data/case_folded.json', 'w') as outfile:
  outfile.write(json.dumps(abstracts, sort_keys=True, indent=4))

bar.finish()
print('Case folding done')


 [Elapsed Time: 0:00:00] [00000000000000000000000000000000000] (Time: 0:00:00) 

Case folding done





In [8]:
# stemming
import json
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

with open('data/case_folded.json') as source_file:
  abstracts = json.load(source_file)

# One Sastrawi stemmer instance is reused for every abstract.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# The bar is not advanced in the loop; it only reports elapsed time on
# finish() (about two minutes in the recorded run).
bar = progress_bar()
bar.start()

# Replace each abstract with its stemmed form.
for entry in abstracts:
  entry['abstract'] = stemmer.stem(entry['abstract'])

with open('data/stemmed_abstracts.json', 'w') as target_file:
  json.dump(abstracts, target_file, sort_keys=True, indent=4)

bar.finish()

 [Elapsed Time: 0:01:59] [00000000000000000000000000000000000] (Time: 0:01:59) 


In [9]:
# filtering
import json
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

with open('data/stemmed_abstracts.json') as json_file:
  abstracts = json.load(json_file)

# Extra stop words to remove on top of the ones Sastrawi provides.
additional_stopwords = []

stopwords = StopWordRemoverFactory().get_stop_words() + additional_stopwords
# A set gives constant-time membership tests; the list above is kept under
# its original name.
stopword_set = set(stopwords)

for entry in abstracts:
  # Keep the remaining words in order, joined by single spaces. Unlike the
  # previous string concatenation this leaves no trailing space.
  entry['abstract'] = ' '.join(
    word for word in entry['abstract'].split() if word not in stopword_set
  )

with open('data/filtered.json', 'w') as outfile:
  outfile.write(json.dumps(abstracts, sort_keys=True, indent=4))

 [Elapsed Time: 0:00:00] [                                  ] (ETA:  --:--:--) 

In [10]:
import json
import re
import time
import sys
import nltk

# tokenizing
# word_tokenize needs the 'punkt' tokenizer data.
nltk.download('punkt')

with open('data/filtered.json') as source_file:
  abstracts = json.load(source_file)

# Attach a token -> occurrence-count table to every document under the
# 'frequency' key; the filtered text itself stays under 'abstract'.
for entry in abstracts:
  tokens = nltk.word_tokenize(entry['abstract'])
  entry['frequency'] = nltk.FreqDist(tokens)

with open('data/tokenized.json', 'w') as target_file:
  json.dump(abstracts, target_file, sort_keys=True, indent=4)

[nltk_data] Downloading package punkt to /Users/brilyyy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
 [Elapsed Time: 0:00:00] [                                  ] (ETA:  --:--:--) 

In [19]:
# term frequency
import json
import nltk

with open('data/tokenized.json') as source_file:
  abstracts = json.load(source_file)

# Turn each document's raw counts into relative frequencies:
# occurrences of the term / number of whitespace-separated words.
for entry in abstracts:
  total_words = len(entry['abstract'].split())
  term_frequency = {}
  for term, occurrences in entry['frequency'].items():
    ratio = occurrences / float(total_words)
    # Only non-zero ratios are stored.
    if ratio:
      term_frequency[term] = ratio
  entry['frequency'] = term_frequency

with open('data/termfrequency.json', 'w') as target_file:
  json.dump(abstracts, target_file, sort_keys=True, indent=4)


In [21]:
# inverse document frequency
import json
import nltk
import math


from collections import Counter

# A word must occur in MORE than this many documents to get an IDF weight;
# rarer words are dropped from the output.
threshold = 4

with open('data/tokenized.json') as json_file:
  abstracts = json.load(json_file)

abstract_length = len(abstracts)

# Document frequency: the number of documents that contain each word.
# It is counted once, before any 'frequency' entry is overwritten, instead
# of rescanning every document for every word of every document.
document_frequency = Counter()
for entry in abstracts:
  document_frequency.update(entry['frequency'].keys())

for entry in abstracts:
  # idf(word) = log10(N / df(word) + 1), with N the number of documents.
  entry['frequency'] = {
    word: math.log10(abstract_length / float(document_frequency[word]) + 1)
    for word in entry['frequency']
    if document_frequency[word] > threshold
  }

with open('data/idf.json', 'w') as outfile:
  outfile.write(json.dumps(abstracts, sort_keys=True, indent=4))


In [22]:
# tfidf
import json

with open('data/idf.json') as idf_json:
  idfs = json.load(idf_json)

with open('data/termfrequency.json') as tf_json:
  tfs = json.load(tf_json)

# idfs and tfs are paired by position, so document i in one file must be
# document i in the other.
for i, idf in enumerate(idfs):
  term_frequencies = tfs[i]['frequency']
  tfidf = {}
  # Reset the running maximum for every document. Without this, a document
  # with no scored word would raise NameError (first document) or inherit
  # the previous document's word.
  maxtfidf = 0
  maxtfidfword = None
  for word, val in idf['frequency'].items():
    tf = term_frequencies.get(word)
    if tf is None:
      continue
    tfidf[word] = val * float(tf)
    if maxtfidf < tfidf[word]:
      maxtfidf = tfidf[word]
      maxtfidfword = word
  idfs[i]['frequency'] = tfidf
  # None (JSON null) when the document has no scored word.
  idfs[i]['maxtfidfword'] = maxtfidfword

with open('data/tfidf.json', 'w') as json_file:
  json_file.write(json.dumps(idfs, sort_keys=True, indent=4))



In [23]:
# creating dictionary
import json

with open('data/tfidf.json') as json_file:
  documents = json.load(json_file)

# Unique words in first-seen order. dict.fromkeys removes duplicates with
# constant-time lookups instead of scanning a growing list for every word.
words = list(dict.fromkeys(
  word for document in documents for word in document['frequency']
))

# Integer id -> word; ids follow the first-seen order above.
words_dict = dict(enumerate(words))

with open('data/dictionary.json','w') as outfile:
  outfile.write(json.dumps(words_dict, sort_keys=True, indent=4))




In [24]:
# replace word with dictionary id

import json

with open('data/dictionary.json') as json_file:
  words = json.load(json_file)
with open('data/tfidf.json') as json_file:
  documents = json.load(json_file)

# Reverse index (word -> dictionary id), built once so each lookup is
# constant time instead of a scan over the whole dictionary. setdefault
# keeps the first id if a word were ever listed twice.
word_to_key = {}
for key, value in words.items():
  word_to_key.setdefault(value, key)

# define function to get key from dictionary
def get_key(val):
  """Return the dictionary id whose word equals ``val``, or None if absent."""
  return word_to_key.get(val)

# Re-key every document's TF-IDF weights by dictionary id; words that are
# not in the dictionary are dropped.
for document in documents:
  document['frequency'] = {
    word_to_key[word]: val
    for word, val in document['frequency'].items()
    if word in word_to_key
  }

with open('data/dictionarized.json','w') as outfile:
  outfile.write(json.dumps(documents, sort_keys=True, indent=4))


In [27]:
# hierarchical clustering
import json
import numpy as np
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
import scipy.cluster.vq as scv
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from pandas import DataFrame
from sklearn.neighbors import NearestCentroid
from sklearn.manifold import MDS

# Number of clusters requested from both algorithms.
N_CLUSTERS = 10

with open('data/dictionary.json') as json_file:
  dictionary = json.load(json_file)

with open('data/dictionarized.json') as json_file:
  documents = json.load(json_file)

print('Calculating Clusters')

# Dense document-term matrix: one row per document, one column per
# dictionary id, holding the TF-IDF weight or 0 when the word is absent.
array = [
  [document['frequency'].get(key, 0) for key in dictionary]
  for document in documents
]

X = np.array(array)

titles = [document['title'] for document in documents]

# Ward linkage only works with euclidean distances, which is the default.
# The `affinity` keyword was deprecated in scikit-learn 1.2 and later
# removed, so it is left out to run on both old and new versions.
hierachical = AgglomerativeClustering(n_clusters=N_CLUSTERS, linkage='ward')
cluster = hierachical.fit_predict(X)

# n_init is pinned to 10, the long-standing default, so results do not change
# on scikit-learn releases where the default became 'auto'.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0, n_init=10).fit(X)


# Optional inspection / dendrogram code, kept disabled:
# for i in range(len(cluster)):
# 	print(f'Cluster: {cluster[i]}, Title: {titles[i]} \n')

# fig, dendogram = plt.subplots(figsize=(30,40))

# dendogram = sch.dendrogram(sch.linkage(array, method='ward'), orientation='top')
# plt.title('Hierarchical Clustering Dendrogram')
# plt.ylabel('Article')
# plt.xlabel('Euclidean Distance')
# plt.tight_layout()
# plt.savefig('t'+str(4)+'sw'
#             +'dendogram.png', dpi=200)

dataframe = DataFrame({
  'Cluster': cluster,
  'Title': titles,
})

kmeans_dataframe = DataFrame({
  'Cluster': kmeans.labels_.tolist(),
  'Title': titles,
})

# One spreadsheet per algorithm, rows ordered by cluster label.
hierarchical_sorted = dataframe.sort_values(by=['Cluster'])
hierarchical_sorted.to_excel('hierarchical_clustering.xlsx', index=False)
kmeans_sorted = kmeans_dataframe.sort_values(by=['Cluster'])
kmeans_sorted.to_excel('kmeans_clustering.xlsx', index=False)


Calculating Clusters
