<a href="https://colab.research.google.com/github/andrybrew/text-mining/blob/master/06_Text_Clustering_Putting_It_All_Together.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab 06 - DOCUMENT CLUSTERING 

In [None]:
#Release: 1.1909.0901

<br>
 
***If you use Google Colab, install the Sastrawi package***

In [None]:
!pip install sastrawi

<br>

#### Import required libraries

In [None]:

import nltk
from bs4 import BeautifulSoup
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import re 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
#from sklearn.externals import joblib
from sklearn.manifold import MDS
import matplotlib.pyplot as plt

<br>
 
***If you use Google Colab, download the NLTK stopwords and punkt packages***

In [None]:
# Resources used below: the stopword lists and the Punkt sentence tokenizer.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the sentence tokenizer from 'punkt_tab' rather than
# 'punkt'; fetching both keeps nltk.sent_tokenize working on old and new versions.
nltk.download('punkt_tab')

<br>

#### Download the dataset from GitHub

In [None]:
# Clone the lab's sample data into ./dataset.
# NOTE: git refuses to clone into an existing non-empty directory, so re-running
# this cell after a successful clone only prints an error and changes nothing.
!git clone https://github.com/project303/dataset.git

In [None]:
# List the cloned directory to confirm the data files are present.
!ls dataset

<br>

### Step 01 - Read dataset

In [None]:
# Load the titles: the file is split on newlines, one title per line.
# The context manager closes the file handle; the encoding is stated explicitly
# so the result does not depend on the platform default (the article file in the
# same dataset is read as utf8 as well).
with open('dataset/Judul Berita.txt', encoding="utf8") as title_file:
    titles = title_file.read().split('\n')
len(titles)

In [None]:
# Preview the first ten titles.
titles[:10]

In [None]:
# Print the complete list of titles.
print(titles)

In [None]:
# The corpus is a single text file; the literal marker 'BERHENTI DISINI'
# separates one article from the next. The context manager closes the handle.
with open('dataset/Berita.txt', encoding="utf8") as article_file:
    article = article_file.read().split('BERHENTI DISINI')
len(article)

In [None]:
# Keep only the first 31 chunks produced by the split above.
# NOTE(review): presumably 31 is the number of titles and whatever follows is a
# leftover chunk after the last 'BERHENTI DISINI' marker — confirm against the dataset.
article = article[:31]
print(article)

In [None]:
# Strip HTML markup: parse each article with BeautifulSoup and keep only its text.
article_clean = [BeautifulSoup(raw, 'html.parser').getText() for raw in article]
article = article_clean
print(article)

In [None]:
# Report how many titles were loaded.
print(f'{len(titles)} titles')

In [None]:
# Report how many articles were loaded.
print(f'{len(article)} article')

<br>

### Step 02 - Tokenization

In [None]:
def tokenize_only(text):
    """Split ``text`` into lower-cased word tokens.

    The text is tokenized by sentence first and then by word, so that
    punctuation comes out as its own token. Tokens that contain no ASCII
    letter (numbers, bare punctuation) are then dropped.

    Returns a list of lower-cased tokens in their original order.
    """
    lowered_words = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    # keep only tokens with at least one letter
    return [word for word in lowered_words if re.search('[a-zA-Z]', word)]

In [None]:
# Flat list of the lower-cased tokens of every article, in document order.
totalvocab_tokenized = [token for doc in article for token in tokenize_only(doc)]

In [None]:
# Total number of tokens collected across all articles.
print(len(totalvocab_tokenized))

In [None]:
# Print every collected token (long output).
print(totalvocab_tokenized)

<br>

### Step 03 - Stemming

In [None]:
# Build a stemmer through Sastrawi's StemmerFactory. tokenize_and_stem reads it
# from the module-level name `stemmer`, so this cell must run before that function is called.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stem of every word token.

    The text is tokenized by sentence first and then by word, so that
    punctuation comes out as its own token. Tokens that contain no ASCII
    letter (numbers, bare punctuation) are dropped, and each remaining token
    is passed to the module-level ``stemmer``.

    Returns a list of stems in the original token order.
    """
    words = (
        word
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    # stem only tokens with at least one letter
    return [stemmer.stem(word) for word in words if re.search('[a-zA-Z]', word)]

In [None]:
# Flat list of the stems of every article, in document order.
totalvocab_stemmed = [stem for doc in article for stem in tokenize_and_stem(doc)]

In [None]:
# Total number of stems collected across all articles.
print(len(totalvocab_stemmed))

In [None]:
# Print every collected stem (long output).
print(totalvocab_stemmed)

In [None]:
# Lookup table: the index holds the stems, the 'words' column the lower-cased
# tokens; both lists come from the same articles filtered the same way.
vocab_frame = pd.DataFrame(data={'words': totalvocab_tokenized}, index=totalvocab_stemmed)
print(f'ada {vocab_frame.shape[0]} kata di vocab_frame')
print(vocab_frame.head())

<br>

### Step 04 - TF-IDF

In [None]:
# 1-based position of every title in the titles list.
ranks = list(range(1, len(titles) + 1))

ranks

In [None]:
# Indonesian stopword list shipped with the NLTK 'stopwords' corpus.
stopwords = nltk.corpus.stopwords.words('indonesian')

print(f'number of stopwords: {len(stopwords)}')

In [None]:
# TF-IDF configuration, one option per line.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize_and_stem,  # tokenize and stem with the helper defined above
    stop_words=stopwords,         # Indonesian stopword list
    ngram_range=(1, 3),           # unigrams, bigrams and trigrams
    min_df=0.2,                   # drop terms found in fewer than 20% of the documents
    max_df=0.8,                   # drop terms found in more than 80% of the documents
    max_features=200000,          # upper bound on the vocabulary size
    use_idf=True,
)

In [None]:
# Fit the vectorizer to the articles and build the TF-IDF matrix, timing the call.
# run_line_magic replaces the deprecated InteractiveShell.magic(); the statement
# still executes in the notebook namespace, so `tfidf_matrix` is defined afterwards.
get_ipython().run_line_magic('time', 'tfidf_matrix = tfidf_vectorizer.fit_transform(article)')

In [None]:
# Shape of the TF-IDF matrix: (number of documents, number of terms kept).
print(tfidf_matrix.shape)

In [None]:
# Print the stored entries of the TF-IDF matrix.
print(tfidf_matrix)

In [None]:
# Vocabulary terms, positioned like the columns of tfidf_matrix.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
# .tolist() keeps `terms` a plain list of str, as before.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()
len(terms)

In [None]:
# Pairwise cosine similarity between the TF-IDF vectors of all articles.
similarity = cosine_similarity(tfidf_matrix)

In [None]:
# Display the similarity matrix.
similarity

<br>

### Step 05 - K-Means Modelling

In [None]:
num_clusters = 3
# n_init is pinned to 10 (the long-standing default) so the clustering does not
# change with the default of the installed scikit-learn release.
km = KMeans(n_clusters=num_clusters, n_init=10, random_state=1000)
# run_line_magic replaces the deprecated InteractiveShell.magic().
get_ipython().run_line_magic('time', 'km.fit(tfidf_matrix)')
# cluster label of every article, in article order
clusters = km.labels_.tolist()

In [None]:
# Per-article data. 'article' is part of the dict but is not carried into the
# frame, because `columns` selects only rank, title and cluster.
news = { 'title': titles, 'rank': ranks, 'article': article, 'cluster': clusters }
# Rows are indexed by cluster label; a later cell selects all rows of one
# cluster with frame.loc[cluster_id].
frame = pd.DataFrame(news, index = [clusters] , columns = ['rank', 'title', 'cluster'])
print(frame) 
# number of articles assigned to each cluster
frame['cluster'].value_counts() 

In [None]:
# Mean rank of the articles in each cluster.
grouped = frame.groupby('cluster')['rank']
grouped.mean()

In [None]:
print("Top terms per cluster:")
# For every centroid, order the feature indices by descending centroid weight,
# so the leading columns are the terms that weigh most in that cluster.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')

    for ind in order_centroids[i, :6]:  # replace 6 with n words per cluster
        # terms[ind] may be an n-gram of stems: look the stems up in vocab_frame
        # and show the first original word stored for the first stem.
        # The word is printed as str; encoding it to bytes (a Python 2 leftover)
        # would display it as b'...'.
        print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end=',')
    print()  # add whitespace
    print()  # add whitespace

    print("Cluster %d titles:" % i, end='')
    # frame is indexed by cluster label, so .loc[i] selects this cluster's rows
    for title in frame.loc[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print()  # add whitespace
    print()

<br>

### Step 06 - Visualization

In [None]:
# Turn similarity into a dissimilarity (1 - cosine similarity): the more alike
# two articles are, the closer the value is to 0.
similarity_distance = 1 - cosine_similarity(tfidf_matrix)
print(type(similarity_distance))
print(similarity_distance.shape)

In [None]:
# Project the precomputed distance matrix onto two dimensions for plotting.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
# run_line_magic replaces the deprecated InteractiveShell.magic().
# pos has shape (n_samples, n_components): one 2-D point per article.
get_ipython().run_line_magic('time', 'pos = mds.fit_transform(similarity_distance)')
print(pos.shape)
print(pos)
# first column -> x coordinates, second column -> y coordinates
xs, ys = pos[:, 0], pos[:, 1]
print(type(xs))
xs

In [None]:
# Plot colour for each cluster label (keys are the labels 0..2).
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3'}

In [None]:
# Legend name for each cluster label.
# NOTE(review): the label-to-topic mapping is hard-coded, presumably from reading
# the "top terms per cluster" output; re-check it whenever the clustering changes.
cluster_names = {0: 'Olahraga', 
                 1: 'Ekonomi', 
                 2: 'Kriminal'}

In [None]:
matplotlib inline

In [None]:
# Show matplotlib plots inline in the notebook output.
# run_line_magic replaces the deprecated InteractiveShell.magic().
get_ipython().run_line_magic('matplotlib', 'inline')

In [None]:
# Data frame combining the MDS coordinates with each article's cluster label and title.
df = pd.DataFrame({'x': xs, 'y': ys, 'label': clusters, 'title': titles})

# show rows 1 through 9
print(df.iloc[1:10])

In [None]:
# Group the rows by cluster label; iterating over `groups` yields
# (label, sub-DataFrame) pairs, and .groups maps each label to its row indices.
groups = df.groupby('label')
print(groups.groups)

In [None]:
# set up plot
fig, ax = plt.subplots(figsize=(17, 9))  # set size
# ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

# Layer the plot one cluster at a time; the cluster label selects the legend
# name and the colour from the cluster_names / cluster_colors dicts.
# ms: marker size
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=20,
            label=cluster_names[name], color=cluster_colors[name],
            mec='none')

# Axis styling does not depend on the cluster, so it is applied once.
ax.set_aspect('auto')
# The tick switches must be real booleans: the strings 'off'/'on' are truthy in
# current matplotlib and would leave the ticks and labels visible.
ax.tick_params(
    axis='x',           # changes apply to the x-axis
    which='both',       # both major and minor ticks are affected
    bottom=False,       # ticks along the bottom edge are off
    top=False,          # ticks along the top edge are off
    labelbottom=False)  # tick labels along the bottom edge are off
ax.tick_params(
    axis='y',           # changes apply to the y-axis
    which='both',       # both major and minor ticks are affected
    left=False,         # ticks along the left edge are off
    right=False,        # ticks along the right edge are off
    labelleft=False)    # tick labels along the left edge are off

ax.legend(numpoints=1)  # show legend with only 1 point

# annotate every point with its article title at the point's x,y position
for i in range(len(df)):
    ax.text(df.loc[i, 'x'], df.loc[i, 'y'], df.loc[i, 'title'], size=10)

plt.show()