## Make dataset

In [None]:
import string

def clean_text(corpus):
    """Return *corpus* with ASCII punctuation and the digits 0-9 removed.

    The characters dropped are exactly those listed in
    ``string.punctuation`` and ``string.digits``; everything else
    (whitespace and non-ASCII symbols included) is kept untouched.
    """
    # One translation table deletes both character classes in a single pass.
    unwanted = string.punctuation + string.digits
    return corpus.translate(str.maketrans('', '', unwanted))

In [None]:
import json
import requests
import glob
import string
# Paths of every entry under the steps/ and txt/ data directories.
# NOTE(review): glob.glob makes no promise about the order of its results,
# while the dataset-building loop further down looks texts up positionally as
# files_txt[step-1] -- confirm the returned order really lines up with the
# step numbers (or sort explicitly with a key that matches the file naming).
files = glob.glob("../data/path/steps/*")
files_txt = glob.glob("../data/path/txt/*")
import re
# Matches any single character in U+3000-U+30FF, U+4E00-U+9FFF or
# U+AC00-U+D7AF.
_CJK_CHARS = re.compile(r'[\u3000-\u30FF\u4E00-\u9FFF\uAC00-\uD7AF]')


def remove_all_cjk(text):
    """Return *text* with every character in the CJK ranges above deleted."""
    return _CJK_CHARS.sub('', text)

# Colon, semicolon, exclamation mark, question mark and the two curly
# double quotes.
_EXTRA_PUNCTUATION = re.compile(r'[:;!?“”]')


def remove_punctuation(text):
    """Return *text* without the six punctuation marks matched above."""
    stripped = _EXTRA_PUNCTUATION.sub('', text)
    return stripped

# Build dataset.csv with one row per step file: the step id, a label made of
# the first 32 characters of the cleaned text, and the cleaned text in quotes.
# Rows are joined by hand without CSV escaping; clean_text has already removed
# ASCII punctuation (commas and double quotes included) from the text.
with open('../data/path/dataset.csv', 'w') as out_file:
    out_file.write("id,article,text\n")
    for file in files:
        # The first line of a step file holds the step number. Read inside a
        # context manager so the handle is closed before the next iteration.
        with open(file) as step_file:
            contents = step_file.readlines()
        step = int(contents[0].strip())

        # The matching text is looked up positionally in files_txt.
        with open(files_txt[step - 1]) as txt_file:
            text = txt_file.read()
        text = remove_punctuation(remove_all_cjk(text))
        text = text.strip().replace("\n", "").replace("\r", "").replace("\t", " ").strip()
        text = clean_text(text)

        # Nothing left after cleaning: emit no row (and skip the stop check,
        # which only runs after a row has been written).
        if not text:
            continue

        out = [str(step), text[0:32], f'"{text}"']
        out_file.write(','.join(out) + "\n")
        # Stop once a row with a step number above 10000 has been written.
        if step > 10000:
            break


In [None]:
import pandas as pd
# Load the generated dataset. on_bad_lines='skip' makes pandas drop rows it
# cannot parse instead of raising, so the frame may hold fewer rows than the
# CSV file has lines.
df = pd.read_csv("../data/path/dataset.csv", on_bad_lines='skip')
df.head()

In [None]:
# Per-row labels taken from the "article" column; preview the first ten.
labels = df["article"].tolist()
labels[:10]

In [None]:
# Collect the "text" column as a list, then join every document into one
# space-separated, lower-cased string.
documents = df['text'].tolist()
corpus = " ".join(documents).lower()

In [None]:
# Preview the first 1000 characters of the joined corpus.
corpus[:1000]

# Document Clustering

In [None]:
# Run every document through clean_text and lower-case it, then replace the
# raw list with the cleaned one and show the first entry.
cleaned_documents = [clean_text(raw_text).lower() for raw_text in documents]
documents = cleaned_documents
documents[0]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import time
from sklearn.cluster import KMeans

In [None]:
class DocumentCluster:
    """Cluster text documents with a count-vectoriser / tf-idf / model pipeline.

    Attributes set by ``__init__``:
        model: estimator used as the final pipeline step.
        data: the documents to cluster (iterated as strings).
        labels: labels indexed in parallel with ``data``.
        model_name: text of ``str(model)`` before the first ``(``.
        cluster_pipeline: the pipeline built by ``train_model``; ``None``
            until ``train_model`` has been called.
    """

    def __init__(self, model, data, labels):
        self.model = model
        self.data = data
        self.labels = labels
        self.model_name = self._get_model_name()

        self.cluster_pipeline = None

    def _get_model_name(self):
        """Return the part of ``str(self.model)`` that precedes the first ``(``."""
        model_name = str(self.model).split('(')[0]
        return model_name

    def train_model(self, ngram, use_idf):
        """Build the pipeline and fit it on ``self.data``.

        Args:
            ngram: int, upper bound of the n-gram range; the vectoriser is
                configured with ``ngram_range=(1, ngram)``.
            use_idf: bool, forwarded to ``TfidfTransformer(use_idf=...)``.

        Stores the pipeline in ``self.cluster_pipeline``, records the elapsed
        time in ``self._train_time`` and prints it. Returns ``None``.
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments.
        start = time.perf_counter()
        self._ngram = ngram
        self._use_idf = use_idf

        # Pipeline steps:
        # 1. CountVectorizer converts the documents to a sparse matrix of
        #    token counts.
        # 2. TfidfTransformer turns the count matrix into a normalized tf or
        #    tf-idf representation (tf = term frequency, idf = inverse
        #    document frequency).
        # 3. The model supplied to __init__ is fitted on that representation.
        self.cluster_pipeline = Pipeline([
            ('vect', CountVectorizer(ngram_range=(1, ngram), analyzer="word")),
            ('tfidf', TfidfTransformer(use_idf=use_idf)),
            ('model', self.model),
        ])
        self._cluster = self.cluster_pipeline.fit(self.data)

        self._train_time = time.perf_counter() - start

        print("Training completed")
        print("Training time for {} : {} secs".format(self.model_name, self._train_time))

    def predict(self):
        """Assign a cluster id to every document in ``self.data``.

        Returns:
            A ``(label_id_dict, document_id_dict)`` tuple mapping each label
            and each document text to its cluster id. When a label or a
            document occurs more than once, the id of the last occurrence
            wins. The ids are also kept, in document order, in
            ``self.cluster_id``.
        """
        self.label_id_dict = {}
        self.document_id_dict = {}
        self.cluster_id = []

        documents = list(self.data)
        if not documents:
            # Nothing to predict; skip the pipeline call entirely.
            return self.label_id_dict, self.document_id_dict

        # One batched call instead of one full pipeline pass per document.
        predicted = self.cluster_pipeline.predict(documents)
        for i, (document, cluster_id) in enumerate(zip(documents, predicted)):
            self.label_id_dict[self.labels[i]] = cluster_id
            self.document_id_dict[document] = cluster_id
            self.cluster_id.append(cluster_id)

        return self.label_id_dict, self.document_id_dict

In [None]:
# Number of clusters; passed to KMeans as n_clusters in the next cell.
cluster_number = 5

In [None]:
# KMeans with k-means++ seeding, at most 150 iterations and a single
# initialisation (n_init=1).
# NOTE(review): no random_state is passed, so results presumably differ
# between runs -- confirm whether reproducibility matters here.
model = KMeans(n_clusters=cluster_number, init = "k-means++", max_iter=150, n_init=1)

In [None]:
# Bundle the cleaned documents, their labels and the KMeans model.
doc_cluster = DocumentCluster(data=documents,model=model,labels=labels)

In [None]:
# Fit on unigrams and bigrams (ngram_range=(1, 2)) with idf weighting enabled.
doc_cluster.train_model(ngram=2, use_idf=True)

In [None]:
# Cluster id per label and per document text.
result_label, result_docs = doc_cluster.predict()

In [None]:
# Display the label -> cluster id mapping.
result_label