<a href="https://colab.research.google.com/github/ahsanuamal/ahsanuamal/blob/main/Text_Mining_NLP_Text_Clustering_Use_Case.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab 06 - DOCUMENT CLUSTERING 

In [None]:
#Release: 1.2108.0101

## Library Preparation

<br>
 
***If you use Google Colab, install the Sastrawi package first***

In [None]:
!pip install sastrawi

<br>

#### Import required libraries

In [None]:
import nltk
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from bs4 import BeautifulSoup
 
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.manifold import MDS

import matplotlib.pyplot as plt

<br>

#### Download NLTK resources (stopwords and Punkt tokenizer)

<br>
 
***If you use Google Colab, download the stopwords and punkt packages***

In [None]:
# Indonesian stopword list read later via nltk.corpus.stopwords
nltk.download('stopwords')
# Punkt sentence-tokenizer models used by nltk.sent_tokenize / nltk.word_tokenize
nltk.download('punkt')
# Recent NLTK releases load the Punkt models from the separate 'punkt_tab'
# resource, so fetch it as well to keep the tokenizer working on a fresh runtime
nltk.download('punkt_tab')

<br>

#### User-defined functions

In [None]:
def tokenize_clean(text):
    """Tokenize ``text`` into lowercase words and drop tokens without letters.

    The text is split into sentences, then into words, with NLTK. Every word
    is lowercased. Tokens that contain no ASCII letter at all (pure numbers,
    punctuation) are discarded.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lowercase tokens, in their original order.
    """
    has_letter = re.compile('[a-zA-Z]')

    # tokenization: sentence by sentence, word by word
    lowered = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered.append(word.lower())

    # keep only tokens with at least one letter (drops numbers and punctuation)
    return [candidate for candidate in lowered if has_letter.search(candidate)]

In [None]:
stopwords = nltk.corpus.stopwords.words('indonesian')

In [None]:
def remove_stopwords(tokenized_text):
    """Return the tokens of ``tokenized_text`` that are not stopwords.

    Args:
        tokenized_text: An iterable of string tokens.

    Returns:
        A new list with every token found in the module-level ``stopwords``
        collection removed; the order of the remaining tokens is preserved.
    """
    # Build the lookup set once per call: set membership is constant-time,
    # whereas testing against the stopword list rescans it for every token.
    stopword_set = set(stopwords)
    return [token for token in tokenized_text if token not in stopword_set]

In [None]:
#stem using Sastrawi StemmerFactory
# The stemmer is created once here at module level; stemming_text() reuses
# this single `stemmer` object for every token instead of building a new one.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
def stemming_text(tokenized_text):
    """Stem every token with the module-level ``stemmer``.

    Args:
        tokenized_text: An iterable of string tokens.

    Returns:
        A list with the stemmed form of each token, in the same order.
    """
    return [stemmer.stem(word) for word in tokenized_text]


In [None]:
def text_preprocessing(text):
    """Run the full preprocessing pipeline on one raw text.

    The pipeline is: tokenize and clean, remove stopwords, then stem.

    Args:
        text: The raw string to preprocess.

    Returns:
        The list of stemmed, stopword-free tokens.
    """
    return stemming_text(remove_stopwords(tokenize_clean(text)))

#### text_preprocessing Function Test

In [None]:
dataset = 'Di daerah, alokasi anggaran Transfer ke Daerah dan Dana Desa (TKDD) ditetapkan sebesar Rp766,16 triliun pada APBN 2018'

In [None]:
text_prep_result = text_preprocessing(dataset)

In [None]:
len(text_prep_result)

In [None]:
text_prep_result

<br>

## Dataset Collection

<br>

#### Download dataset from github

In [None]:
!mkdir -p dataset

In [None]:
!wget https://raw.githubusercontent.com/project303/dataset/master/Berita.txt -P dataset

In [None]:
!wget https://raw.githubusercontent.com/project303/dataset/master/Judul-Berita.txt -P dataset

In [None]:
!ls dataset

In [None]:
! head dataset/Judul-Berita.txt

## Data Preprocessing

<br>

### Step 01 - Read dataset

In [None]:
#load titles: one title per line
# The context manager closes the file as soon as it has been read, and the
# encoding is explicit so the result does not depend on the platform default.
# split('\n') is kept as-is so the number of entries is unchanged.
with open('dataset/Judul-Berita.txt', encoding="utf8") as titles_file:
    article_titles = titles_file.read().split('\n')
len(article_titles)

In [None]:
article_titles[:10]

In [None]:
# Load the articles: the file holds all of them, separated by the literal
# marker 'BERHENTI DISINI'. The context manager closes the file after reading.
with open('dataset/Berita.txt', encoding="utf8") as articles_file:
    article_content = articles_file.read().split('BERHENTI DISINI')
len(article_content)

In [None]:
article_content[0]

<br>

### Step 02 - Cleanup dataset


Cleanup dataset from HTML tags using BeautifulSoup

In [None]:
# Strip HTML markup from every article, keeping only its text content
article_clean = [
    BeautifulSoup(raw_article, 'html.parser').getText()
    for raw_article in article_content
]

# From here on, article_content refers to the cleaned texts
article_content = article_clean

In [None]:
article_content[0]

## Feature Extraction

<br>

### Step 03 - TF-IDF

In [None]:
# TF-IDF configuration:
# - max_df=0.9 / min_df=0.1: float values are document-frequency proportions in
#   scikit-learn, so terms appearing in more than 90% or fewer than 10% of the
#   documents are ignored.
# - tokenizer=text_preprocessing: each document is tokenized by this notebook's
#   own pipeline (tokenize/clean -> stopword removal -> stemming).
# - ngram_range=(1,3): unigrams, bigrams and trigrams are built from those tokens.
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=0.1,
                                   max_features=200000,
                                   use_idf=True,
                                   tokenizer=text_preprocessing,
                                   ngram_range=(1,3))

In [None]:
# Fit the vectorizer and build the document-term TF-IDF matrix, timing the call.
# run_line_magic() is the supported replacement for the deprecated
# get_ipython().magic(); it is the programmatic form of `%time ...`.
get_ipython().run_line_magic('time', 'tfidf_features = tfidf_vectorizer.fit_transform(article_content)')

In [None]:
print(tfidf_features.shape)

In [None]:
# Vocabulary (feature names) learned by the vectorizer, as a plain Python list.
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name on releases that predate it.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    bag_of_words = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    bag_of_words = tfidf_vectorizer.get_feature_names()
len(bag_of_words)

In [None]:
bag_of_words[:10]

## Model Building

<br>

### Step 04 - K-Means Modelling

In [None]:
num_clusters = 3
# n_init is pinned explicitly: its default differs between scikit-learn versions,
# so leaving it implicit makes the clustering depend on the installed release.
model_km = KMeans(n_clusters=num_clusters, n_init=10, random_state=1000)

#train the model (timed); run_line_magic() replaces the deprecated magic()
get_ipython().run_line_magic('time', 'model_km.fit(tfidf_features)')

In [None]:
clusters = model_km.labels_.tolist()

<br>

### Step 05 - View The Result

In [None]:
# 1-based running number for every article title
article_no = list(range(1, len(article_titles) + 1))

In [None]:
# Collect titles, running numbers, article texts and cluster labels per article
article_cluster = { 'title': article_titles, 'no': article_no, 'article': article_content, 'cluster': clusters }
# Show full cell contents instead of truncating long titles
pd.set_option('display.max_colwidth', None)
# Only 'no', 'title' and 'cluster' are selected as columns, so 'article' is left out.
# The cluster labels are also used as the row index; the later `df.loc[i]` lookup
# selects rows by cluster id through it.
# NOTE(review): `index = [clusters]` wraps the labels in an extra list - presumably
# this yields a one-level MultiIndex rather than a flat index; confirm before changing.
df = pd.DataFrame(article_cluster, index = [clusters] , columns = ['no', 'title', 'cluster'])
# sort_index() returns a sorted copy for display only; df itself is not reordered
df.sort_index()  

In [None]:
df['cluster'].value_counts()

Top words per cluster

In [None]:
df_bow = pd.DataFrame({'words': bag_of_words}, index = bag_of_words)

In [None]:
# Number of highest-weighted terms to list for every cluster
top_n_words = 10

print("Top words per cluster:")

# For every centroid, the feature indices ordered from the largest to the
# smallest centroid weight (argsort is ascending, hence the column reversal)
order_centroids = model_km.cluster_centers_.argsort()[:, ::-1]

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')

    for ind in order_centroids[i, :top_n_words]:
        # Print the feature itself. The previous df_bow lookup on the split n-gram
        # showed only its first word and could fail for words that are not
        # features on their own, and .encode() printed b'...' literals.
        print(' %s' % bag_of_words[ind], end=',')
    print() #end the line
    print() #blank separator line

    print("Cluster %d titles:" % i, end='')
    # df is indexed by cluster label, so df.loc[i] selects the articles of cluster i
    for title in df.loc[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print() #end the line
    print() #blank separator line

<br>

## Cluster Visualization

### Step 06 - Visualization

In [None]:
# Pairwise cosine distance between all documents (1 - cosine similarity).
# It is passed to MDS below as the precomputed dissimilarity matrix.
similarity_distance = 1 - cosine_similarity(tfidf_features)
print(type(similarity_distance))
print(similarity_distance.shape)

In [None]:
# Project the precomputed distance matrix onto 2 dimensions for plotting
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
# run_line_magic() replaces the deprecated get_ipython().magic().
# pos has shape (n_samples, n_components): one 2-D point per document.
get_ipython().run_line_magic('time', 'pos = mds.fit_transform(similarity_distance)')
print(pos.shape)

In [None]:
print(pos)

In [None]:
xs, ys = pos[:, 0], pos[:, 1]
print(type(xs))
xs

In [None]:
#set up colors per clusters using a dict
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3'}

In [None]:
#set up cluster names using a dict
# NOTE(review): these names are hard-coded against cluster ids 0-2; nothing in
# this notebook derives them from the fitted model. Confirm them against the
# "Top words per cluster" output whenever the data, num_clusters or random_state
# change.
cluster_names = {0: 'Ekonomi', 
                 1: 'Kriminal',
                 2: 'Olahraga'}

In [None]:
# Render matplotlib figures inline. The magic is invoked explicitly rather than
# through a bare `matplotlib inline`, which only works while automagic is enabled.
get_ipython().run_line_magic('matplotlib', 'inline')

In [None]:
#some ipython magic to show the matplotlib plots inline
# run_line_magic() is the supported replacement for the deprecated magic()
get_ipython().run_line_magic('matplotlib', 'inline')

In [None]:
#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=article_titles)) 

print(df[1:10])

In [None]:
# group by cluster
# this generate {name:group(which is a dataframe)}
groups = df.groupby('label')
print(groups.groups)

In [None]:
# set up plot
fig, ax = plt.subplots(figsize=(17, 9)) # set size
# ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
# ms: marker size
for name, group in groups:
    #print("*******")
    #print("group name " + str(name))
    #print(group)
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=20, 
            label=cluster_names[name], color=cluster_colors[name], 
            mec='none')
    ax.set_aspect('auto')
    ax.tick_params(        axis= 'x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')
    ax.tick_params(        axis= 'y',         # changes apply to the y-axis
        which='both',      # both major and minor ticks are affected
        left='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelleft='off')
    
ax.legend(numpoints=1)  #show legend with only 1 point

#add label in x,y position with the label as the film title
for i in range(len(df)):
    ax.text(df.loc[i]['x'], df.loc[i]['y'], df.loc[i]['title'], size=10)  

plt.show()

In [None]:
article_similarities = cosine_similarity(tfidf_features[0], tfidf_features).flatten()

In [None]:
article_similarities

A simple cosine-similarity example

In [None]:
# Toy corpus and a query to score against it
documents = ['cars drive on the road', 'tomatoes are actually fruit']
search_terms = 'tomatoes is a fruit'

# Learn the TF-IDF vocabulary from the corpus, then embed the query with it
vectorizer = TfidfVectorizer(use_idf=True)
doc_vectors = vectorizer.fit_transform(documents)
search_vector = vectorizer.transform([search_terms])

# One similarity score per document, as plain Python floats
cosine_similarities = cosine_similarity(search_vector, doc_vectors).flatten()
document_scores = cosine_similarities.tolist()

document_scores

<br>
<br>

**Revision History:**

Release: 1.2108.0101
*   Cleanup the code
*   Add cosine similarities
