# Implementing LDA in Python

## Introduction

## Importing the Required Libraries

In [52]:
import pandas as pd
from nltk.corpus import stopwords
import string
import gensim.corpora as corpora
from gensim.models import LdaMulticore

In [5]:
# Read the TRC records and keep only the name and description columns.
columns_to_keep = ["Last", "First", "Description"]
df = pd.read_csv("../data/trc.csv").loc[:, columns_to_keep]
df

Unnamed: 0,Last,First,Description
0,AARON,Thabo Simon,An ANCYL member who was shot and severely inju...
1,ABBOTT,Montaigne,A member of the SADF who was severely injured ...
2,ABRAHAM,Nzaliseko Christopher,A COSAS supporter who was kicked and beaten wi...
3,ABRAHAMS,Achmat Fardiel,Was shot and blinded in one eye by members of ...
4,ABRAHAMS,Annalene Mildred,Was shot and injured by members of the SAP in ...
...,...,...,...
20829,XUZA,Mandla,Was severely injured when he was stoned by a f...
20830,YAKA,Mbangomuni,An IFP supporter and acting induna who was sho...
20831,YALI,Khayalethu,"Was shot by members of the SAP in Lingelihle, ..."
20832,YALO,Bikiwe,An IFP supporter whose house and possessions w...


In [13]:
# Pull every description out of the frame as a plain Python list of documents,
# then peek at the first one.
docs = df["Description"].tolist()
docs[:1]

["An ANCYL member who was shot and severely injured by SAP members at Lephoi, Bethulie, Orange Free State (OFS) on 17 April 1991. Police opened fire on a gathering at an ANC supporter's house following a dispute between two neighbours, one of whom was linked to the ANC and the other to the SAP and a councillor."]

In [11]:
# NLTK's English stop-word list: lowercase words, some of them contractions
# that contain an apostrophe (e.g. "you're", "it's").
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand — confirm the environment setup.
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [28]:
# Translation table that deletes every character in string.punctuation; built
# once so each clean_doc call strips punctuation in a single pass.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_doc(doc, stop_word_set=None):
    """Tokenise one document into lowercase words without punctuation or stop words.

    Parameters
    ----------
    doc : str or None
        Raw text of a single document. A missing value (``None`` or a float
        ``NaN``, which pandas produces for empty CSV cells) yields ``[]``
        instead of raising ``TypeError``.
    stop_word_set : collection of str, optional
        Words to discard after lowercasing. When omitted, the notebook-level
        ``stop_words`` list is used (converted to a set for fast lookups).

    Returns
    -------
    list of str
        The remaining tokens in their original order; duplicates are kept.

    Notes
    -----
    Punctuation is removed *before* the stop-word comparison, so a stop word
    that contains an apostrophe (e.g. "you're") becomes "youre" and is not
    filtered. This matches the notebook's recorded outputs and is left as is.
    """
    # Missing description: return an empty token list so the corpus stays
    # aligned with the DataFrame rows. `doc != doc` is true only for NaN.
    if doc is None or (isinstance(doc, float) and doc != doc):
        return []

    if stop_word_set is None:
        # Set membership is O(1) per token, versus a linear scan of the list.
        stop_word_set = set(stop_words)

    no_punct = doc.translate(_PUNCT_TABLE)
    return [word for word in no_punct.lower().split() if word not in stop_word_set]
# Try the cleaner on the first document and show the raw text above its tokens.
cleaned = clean_doc(docs[0])
print(docs[0], cleaned, sep="\n")

An ANCYL member who was shot and severely injured by SAP members at Lephoi, Bethulie, Orange Free State (OFS) on 17 April 1991. Police opened fire on a gathering at an ANC supporter's house following a dispute between two neighbours, one of whom was linked to the ANC and the other to the SAP and a councillor.
['ancyl', 'member', 'shot', 'severely', 'injured', 'sap', 'members', 'lephoi', 'bethulie', 'orange', 'free', 'state', 'ofs', '17', 'april', '1991', 'police', 'opened', 'fire', 'gathering', 'anc', 'supporters', 'house', 'following', 'dispute', 'two', 'neighbours', 'one', 'linked', 'anc', 'sap', 'councillor']


In [31]:
# Clean the whole corpus: one token list per description, in the original order.
cleaned_docs = list(map(clean_doc, docs))
print(cleaned_docs[:3])

[['ancyl', 'member', 'shot', 'severely', 'injured', 'sap', 'members', 'lephoi', 'bethulie', 'orange', 'free', 'state', 'ofs', '17', 'april', '1991', 'police', 'opened', 'fire', 'gathering', 'anc', 'supporters', 'house', 'following', 'dispute', 'two', 'neighbours', 'one', 'linked', 'anc', 'sap', 'councillor'], ['member', 'sadf', 'severely', 'injured', 'landmine', 'explosion', 'messina', 'transvaal', '5', 'may', '1987'], ['cosas', 'supporter', 'kicked', 'beaten', 'batons', 'riflebutts', 'members', 'ciskei', 'police', 'protests', 'ciskei', 'government', 'zwelitsha', 'mdantsane', 'ciskei', 'september', '1985']]


## Create ID-Word Index

In [33]:
# Build the gensim dictionary that maps an integer id to each distinct token.
id2word = corpora.Dictionary(documents=cleaned_docs)

In [37]:
# Spot-check the mapping: look up the token stored under id 250
# ('bmw' in the recorded run).
id2word[250]

'bmw'

In [40]:
# Encode every token list as a bag of words, i.e. (token_id, count) pairs.
id_docs = list(map(id2word.doc2bow, cleaned_docs))

In [47]:
# Bag-of-words encoding of document 100: a list of (token_id, count) pairs.
id_docs[100]

[(17, 1),
 (24, 1),
 (26, 1),
 (30, 1),
 (46, 1),
 (53, 1),
 (55, 1),
 (85, 1),
 (184, 1),
 (621, 1)]

In [46]:
# Token list of the same document (index 100), for comparison with its ids.
cleaned_docs[100]

['shot',
 'dead',
 'members',
 'sap',
 'protests',
 'cradock',
 'cape',
 '14',
 'august',
 '1987']

In [51]:
# Translate the ids of document 100 back into words; the counts are unused here.
for num, _count in id_docs[100]:
    print(num, id2word[num])

17 members
24 sap
26 shot
30 1987
46 protests
53 august
55 cape
85 dead
184 14
621 cradock


## Creating the LDA Topic Model

In [53]:
topic_nums = 100
# LDA training is stochastic. A fixed seed makes the fitted topics, and every
# output derived from them, repeatable after Restart & Run All.
# NOTE(review): with several worker processes gensim may still show small
# run-to-run differences — confirm if exact repeatability is required.
random_seed = 42
lda_model = LdaMulticore(
    corpus=id_docs,
    id2word=id2word,
    num_topics=topic_nums,
    random_state=random_seed,
)

In [59]:
# Show a single topic as a weighted list of its top words. The topic shown is
# picked by gensim and is not necessarily topic 0 (id 79 in the recorded run).
lda_model.print_topics(1)

[(79,
  '0.025*"anc" + 0.025*"supporters" + 0.018*"members" + 0.018*"ifp" + 0.015*"attack" + 0.015*"shot" + 0.014*"house" + 0.012*"burnt" + 0.012*"home" + 0.010*"member"')]

## Analyze the Topic Model

In [71]:
# NOTE(review): this import sits mid-notebook; consider moving it into the
# import cell at the top so all dependencies are visible in one place.
import pyLDAvis.gensim_models
# Switch pyLDAvis to notebook mode so prepared visualisations display inline.
pyLDAvis.enable_notebook()

In [75]:
# Turn the fitted model, corpus and dictionary into the data pyLDAvis plots.
# mds and R are forwarded to pyLDAvis as given (scaling method and number of
# terms shown per topic, per the pyLDAvis documentation).
vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=id_docs,
    dictionary=id2word,
    mds="mmds",
    R=30,
)

In [74]:
# Bare expression as the last line so the notebook renders the visualisation.
vis