Topic Modelling
1- Used to extract information from a very large corpus of documents without reading the actual text
2- e.g. from a large crime dataset we can find out which reasons for crime occur most often, etc.

Drawback of Kmeans:
1- Every document is assigned exactly one topic (cluster); it cannot belong to multiple topics. To overcome this, LDA came into the picture: LDA can assign multiple topics to a single document.


In [17]:
import pandas as pd
from sklearn.metrics import adjusted_rand_score
import string
from nltk.corpus import stopwords
import json 
import glob
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [18]:
def load_data(file):
    """Return the object parsed from the JSON document stored at *file*.

    The file is opened as UTF-8 text.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        parsed = json.loads(handle.read())
    return parsed

In [20]:
def write_data(file, data):
    """Serialise *data* as JSON with a 4-space indent into *file* (UTF-8)."""
    encoder = json.JSONEncoder(indent=4)
    with open(file, mode="w", encoding="utf-8") as handle:
        # Stream the encoder's chunks straight to the file as they are produced.
        for chunk in encoder.iterencode(data):
            handle.write(chunk)
    

In [27]:
def remove_stops(text, stops):
    """Strip AC reference numbers, stop words, punctuation and digits from *text*.

    Args:
        text: Raw description string.
        stops: Collection of words to drop. Matching is exact and
            case-sensitive, and is applied to whitespace-separated tokens
            before punctuation is removed (so "the," is not matched by "the").

    Returns:
        The cleaned text, with words separated by single spaces and no
        leading or trailing whitespace.
    """
    # Membership tests against a list cost O(len(stops)) per word; hash the
    # stop words once (or reuse the caller's set) for O(1) lookups.
    stop_set = stops if isinstance(stops, (set, frozenset)) else set(stops)

    text = re.sub(r"AC\/\d{1,4}\/\d{1,4}", "", text)  # remove all of the AC numbers
    kept = " ".join(word for word in text.split() if word not in stop_set)
    kept = kept.translate(str.maketrans("", "", string.punctuation))  # remove punctuation
    kept = "".join(ch for ch in kept if not ch.isdigit())  # remove digits

    # Dropping punctuation/digit-only tokens leaves runs of spaces and stray
    # spaces at either end; re-splitting normalises all of them in one pass.
    return " ".join(kept.split())
    

In [26]:
def clean_docs(docs, months_file="data/months.json"):
    """Clean every description in *docs* with ``remove_stops``.

    The stop-word collection is NLTK's English stop words plus the entries
    loaded from *months_file*.

    Args:
        docs: Iterable of description strings.
        months_file: Path to a JSON file of extra stop words. Presumably a
            list of month names, judging by the file name; confirm against
            the data file.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    # Build the stop-word collection once as a set so that the per-word
    # membership test inside remove_stops is O(1) instead of a list scan.
    stops = set(stopwords.words("english"))
    stops.update(load_data(months_file))
    return [remove_stops(doc, stops) for doc in docs]


In [28]:
# Parse the dataset once and take both fields from the same object, instead
# of opening and decoding the JSON file twice.
trc_data = load_data("data/trc_dn.json")
descriptions = trc_data["descriptions"]
names = trc_data["names"]

In [29]:
# Run every description through clean_docs; the result is what gets vectorised.
cleaned_docs = clean_docs(descriptions)

In [30]:
# Display the first cleaned description as a quick sanity check.
cleaned_docs[0]

'An ANCYL member shot severely injured SAP members Lephoi Bethulie Orange Free State OFS Police opened fire gathering ANC supporters house following dispute two neighbours one linked ANC SAP councillor'

In [31]:
# Turn the cleaned descriptions into TF-IDF vectors.
vectorizer = TfidfVectorizer(
    lowercase=True,
    max_features=100,  # keep only the 100 most frequent terms in the corpus
    max_df=0.8,  # ignore terms that appear in more than 80% of the documents
    min_df=5,  # ignore terms that appear in fewer than 5 documents
    ngram_range=(1, 3),  # extract unigrams, bigrams and trigrams
    stop_words="english",
)

vectors = vectorizer.fit_transform(cleaned_docs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back only on old installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

dense = vectors.todense()
denselist = dense.tolist()

# For each document, keep the vocabulary terms whose TF-IDF weight is non-zero.
all_keywords = [
    [term for term, weight in zip(feature_names, row) if weight > 0]
    for row in denselist
]

print(descriptions[0])
print(all_keywords[0])



An ANCYL member who was shot and severely injured by SAP members at Lephoi, Bethulie, Orange Free State (OFS) on 17 April 1991. Police opened fire on a gathering at an ANC supporter's house following a dispute between two neighbours, one of whom was linked to the ANC and the other to the SAP and a councillor.
['anc', 'anc supporters', 'house', 'injured', 'member', 'members', 'police', 'sap', 'severely', 'shot', 'supporters']


In [33]:
# Cluster the TF-IDF vectors with k-means and write each cluster's top terms
# to a text file.
true_k = 20
# NOTE(review): no random_state is passed and n_init=1, so cluster
# assignments can differ from run to run.
model = KMeans(n_clusters=true_k, init="k-means++", max_iter=100, n_init=1)

model.fit(vectors)

# For every centroid, feature indices ordered from highest to lowest weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back only on old installations.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

with open("data/trc_results.txt", "w", encoding="utf-8") as f:
    for i in range(true_k):
        f.write(f"Cluster {i}\n")
        # The ten highest-weighted terms of this cluster, one per line.
        for ind in order_centroids[i, :10]:
            f.write(f" {terms[ind]}\n")
        f.write("\n\n")
