# **LDA Topic Modelling and Experiments**

#### Importing required libraries

In [1]:
import gensim

In [2]:
!pip install pyLDAvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy>=1.24.2 (from pyLDAvis)
  Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
Collecting pandas>=2.0.0 (from pyLDAvis)
  Downloading pandas-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, numpy, pandas, pyLDAvis
  Attempting uninstall: numpy
    Fou

In [None]:
!pip install gensim

In [3]:
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import gensim.corpora as corpora
from gensim.models import TfidfModel
import pyLDAvis
import pyLDAvis.gensim
import pandas as pd
import nltk
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder, TrigramAssocMeasures, TrigramCollocationFinder, QuadgramAssocMeasures, QuadgramCollocationFinder
from nltk import word_tokenize

In [4]:
# 'words' provides the English vocabulary read later through nltk.corpus.words;
# 'punkt' is the tokenizer data that word_tokenize depends on.
nltk.download('words')
nltk.download('punkt')

  and should_run_async(code)
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### **N-grams, TF-IDF, LDA** (and more pre-processing)

In [5]:
class topic_modeling:
    """Pre-processing, TF-IDF filtering and LDA helpers for a collection of fact texts.

    The instance only stores the raw documents (``facts``). ``organise`` reads them;
    every other method works purely on the arguments it receives.
    """

    def __init__(self, facts):
        """Keep a reference to the raw documents.

        Args:
            facts: iterable of strings, one per document.
        """
        self.facts = facts

    def organise(self):
        """Split every stored fact on whitespace.

        Returns:
            list[list[str]]: one token list per fact, in the original order.
        """
        return [fact.split() for fact in self.facts]

    def remove_lang_words(self, facts_words, english_words):
        """Filter one document's tokens by vocabulary and length.

        Args:
            facts_words: tokens of a single document.
            english_words: container of accepted words (a ``set`` keeps lookups cheap).

        Returns:
            list[str]: tokens found in ``english_words`` that are longer than three
            characters (so words of three letters or fewer are dropped).
        """
        return [
            word for word in facts_words
            if word in english_words and len(word) > 3
        ]

    def find_bigrams(self, facts):
        """Build the adjacent-token pairs of one document.

        Args:
            facts: tokens of a single document; they are joined with spaces and
                re-tokenised with ``word_tokenize`` before pairing.

        Returns:
            list[tuple[str, str]]: consecutive token pairs in reading order.
        """
        text = ' '.join(facts)
        tokens = word_tokenize(text)
        return list(nltk.bigrams(tokens))

    def view_frequencies(self, id2word, texts, corpus, tfidf):
        """Tabulate the TF-IDF weight of every term occurrence in the corpus.

        Args:
            id2word: mapping from term id to term.
            texts: unused; kept so existing calls keep working.
            corpus: iterable of bag-of-words documents.
            tfidf: object that maps a bag-of-words document to ``(term_id, weight)``
                pairs when indexed with it.

        Returns:
            pandas.DataFrame: columns ``Word`` and ``Frequency`` (the TF-IDF weight),
            sorted by weight in descending order. A term gets one row per document
            in which it has a weight, so the same word can appear several times.
        """
        rows = [
            [id2word[word_id], tfidf_score]
            for doc in corpus
            for word_id, tfidf_score in tfidf[doc]
        ]
        df_freq = pd.DataFrame(rows, columns=['Word', 'Frequency'])
        return df_freq.sort_values('Frequency', ascending=False)

    def filter_tf_idf(self, id2word, texts, corpus, tfidf,
                      low_value=0.03, high_value=0.6):
        """Drop terms whose TF-IDF weight lies outside ``[low_value, high_value]``.

        A term is also dropped from a document when the TF-IDF transform returns no
        weight for it at all. The input ``corpus`` is left untouched: a new list is
        returned, so calling this again with the same inputs gives the same result.

        Args:
            id2word: term dictionary; returned unchanged.
            texts: unused; kept so existing calls keep working.
            corpus: iterable of bag-of-words documents (lists of ``(term_id, count)``).
            tfidf: object that maps a bag-of-words document to ``(term_id, weight)``
                pairs when indexed with it.
            low_value: weights strictly below this are removed.
            high_value: weights strictly above this are removed.

        Returns:
            tuple: ``(filtered_corpus, id2word)``.
        """
        filtered_corpus = []
        for bow in corpus:
            # Transform once per document and keep the surviving ids in a set so the
            # membership test below is O(1).
            kept_ids = {
                word_id
                for word_id, score in tfidf[bow]
                if not (score < low_value or score > high_value)
            }
            filtered_corpus.append([entry for entry in bow if entry[0] in kept_ids])

        return filtered_corpus, id2word

    def perform_lda(self, corpus, id2word, num_topics=10, random_state=100,
                    chunksize=50, passes=10):
        """Train a gensim LDA model on ``corpus``.

        Args:
            corpus: bag-of-words documents to train on.
            id2word: term dictionary matching ``corpus``.
            num_topics: number of topics to fit.
            random_state: seed passed to the model.
            chunksize: documents per training chunk.
            passes: number of passes over the corpus.

        Returns:
            The trained ``LdaModel`` (online updates after every chunk, ``alpha="auto"``).
        """
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=num_topics,
                                                    random_state=random_state,
                                                    update_every=1,
                                                    chunksize=chunksize,
                                                    passes=passes,
                                                    alpha="auto")

        return lda_model

  and should_run_async(code)


## Violation Cases

#### Importing facts corresponding to violation cases

In [6]:
from google.colab import files
import io

  and should_run_async(code)


### Import Processed Facts

In [None]:
# Interactive Colab upload; the next cell looks the file up in the returned
# mapping under 'violation_facts_processed.csv'.
uploaded_proc = files.upload()

  and should_run_async(code)


Saving violation_facts_processed.csv to violation_facts_processed.csv


In [None]:
# Parse the uploaded bytes as CSV.
# NOTE(review): every import section in this notebook binds the same name
# `violation_tm`, so the load cell that ran last is the one the pipeline uses.
violation_tm = pd.read_csv(io.BytesIO(uploaded_proc['violation_facts_processed.csv']))

### Import Lemmatized Facts

In [None]:
# Interactive Colab upload; the next cell looks the file up in the returned
# mapping under 'violation_facts_lemma.csv'.
uploaded_lemma = files.upload()

  and should_run_async(code)


Saving violation_facts_lemma.csv to violation_facts_lemma (1).csv


In [None]:
# Parse the uploaded bytes as CSV.
# NOTE(review): this rebinds `violation_tm`, replacing whatever dataset an
# earlier import section loaded.
violation_tm = pd.read_csv(io.BytesIO(uploaded_lemma['violation_facts_lemma.csv']))

### Import Stop Word Removal Facts

In [None]:
# Interactive Colab upload; the next cell looks the file up in the returned
# mapping under 'violation_facts_sw.csv'.
uploaded_sw = files.upload()

  and should_run_async(code)


Saving violation_facts_sw.csv to violation_facts_sw (1).csv


In [None]:
# Parse the uploaded bytes as CSV.
# NOTE(review): this rebinds `violation_tm`, replacing whatever dataset an
# earlier import section loaded.
violation_tm = pd.read_csv(io.BytesIO(uploaded_sw['violation_facts_sw.csv']))

  and should_run_async(code)


### Import No Preprocessing Facts

In [None]:
# Interactive Colab upload; the next cell looks the file up in the returned
# mapping under 'violation_facts_none.csv'.
uploaded_none = files.upload()

  and should_run_async(code)


Saving violation_facts_none.csv to violation_facts_none (1).csv


In [None]:
# Parse the uploaded bytes as CSV.
# NOTE(review): this rebinds `violation_tm`, replacing whatever dataset an
# earlier import section loaded.
violation_tm = pd.read_csv(io.BytesIO(uploaded_none['violation_facts_none.csv']))

  and should_run_async(code)


## Non-Violation Cases

### Processed

In [7]:
# Interactive Colab upload; the next cell looks the file up in the returned
# mapping under 'non_violation_facts_processed.csv'.
nv_proc = files.upload()

  and should_run_async(code)


Saving non_violation_facts_processed.csv to non_violation_facts_processed.csv


In [8]:
# Parse the uploaded bytes as CSV.
# NOTE(review): despite its name, `violation_tm` holds the non-violation facts
# after this cell; the downstream cells read this same variable.
violation_tm = pd.read_csv(io.BytesIO(nv_proc['non_violation_facts_processed.csv']))

  and should_run_async(code)


### Lemma

In [26]:
# Interactive Colab upload; the next cell looks the file up in the returned
# mapping under 'non_violation_facts_lemma.csv'.
nv_lemma = files.upload()

  and should_run_async(code)


Saving non_violation_facts_lemma.csv to non_violation_facts_lemma.csv


In [27]:
# Parse the uploaded bytes as CSV.
# NOTE(review): despite its name, `violation_tm` holds the non-violation facts
# after this cell; the downstream cells read this same variable.
violation_tm = pd.read_csv(io.BytesIO(nv_lemma['non_violation_facts_lemma.csv']))

  and should_run_async(code)


### SW Removal

In [43]:
# Interactive Colab upload; the next cell looks the file up in the returned
# mapping under 'non_violation_facts_sw.csv'.
nv_sw = files.upload()

  and should_run_async(code)


Saving non_violation_facts_sw.csv to non_violation_facts_sw.csv


In [44]:
# Parse the uploaded bytes as CSV.
# NOTE(review): despite its name, `violation_tm` holds the non-violation facts
# after this cell; the downstream cells read this same variable.
violation_tm = pd.read_csv(io.BytesIO(nv_sw['non_violation_facts_sw.csv']))

  and should_run_async(code)


### No Preprocessing

In [60]:
# Interactive Colab upload; the next cell looks the file up in the returned
# mapping under 'non_violation_facts_none.csv'.
nv_np = files.upload()

  and should_run_async(code)


Saving non_violation_facts_none.csv to non_violation_facts_none.csv


In [61]:
# Parse the uploaded bytes as CSV.
# NOTE(review): despite its name, `violation_tm` holds the non-violation facts
# after this cell; the downstream cells read this same variable.
violation_tm = pd.read_csv(io.BytesIO(nv_np['non_violation_facts_none.csv']))

  and should_run_async(code)


In [62]:
# Preview of the currently loaded frame; only the 'Facts' column is used below.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns look like index columns
# written out by earlier CSV exports — confirm and drop at export time.
violation_tm.head()

  and should_run_async(code)


Unnamed: 0.1,Unnamed: 0,Facts,Cluster,x0,x1
0,0,in and lives in kragujevac at the relevant tim...,,,
1,1,the applicant was born in and lives in weinhei...,,,
2,2,the applicant company is a public limited comp...,,,
3,3,is a newspaper published in baku it has legal ...,,,
4,4,the applicant was born in and lives in warsaw ...,,,


#### Some more pre-processing

In [69]:
# Splitting into a list of lists of words.
# An empty 'Facts' cell is read from CSV as NaN (a float), which has no .split();
# drop such rows and coerce the rest to str before tokenising.
facts = violation_tm['Facts'].dropna().astype(str).tolist()
tm = topic_modeling(facts)
facts_words = tm.organise()
print(len(facts_words))

73


  and should_run_async(code)


In [70]:
# Keep only tokens that appear in NLTK's English word list (and pass the length
# check inside remove_lang_words), document by document.
english_words = set(nltk.corpus.words.words())
final_data = [
    tm.remove_lang_words(doc_tokens, english_words)
    for doc_tokens in facts_words
]

  and should_run_async(code)


In [12]:
# Spot-check: the tokens kept for the first document after filtering.
print(final_data[0])

['life', 'relevant', 'time', 'journalist', 'weekly', 'news', 'magazine', 'based', 'town', 'agent', 'party', 'titled', 'shame', 'silence', 'piece', 'written', 'relevant', 'reason', 'ashamed', 'towards', 'head', 'municipal', 'branch', 'office', 'firstly', 'city', 'front', 'main', 'post', 'office', 'building', 'falsely', 'street', 'vendor', 'market', 'inspector', 'pretext', 'certain', 'document', 'copied', 'thereafter', 'drove', 'girl', 'away', 'street', 'tried', 'rape', 'girl', 'origin', 'resistance', 'scream', 'ultimately', 'assailant', 'girl', 'tell', 'anyone', 'give', 'telephone', 'number', 'next', 'head', 'municipal', 'branch', 'office', 'meet', 'also', 'period', 'hour', 'brought', 'investigating', 'assailant', 'false', 'impersonation', 'unlawful', 'deprivation', 'liberty', 'rape', 'shall', 'course', 'conclusion', 'local', 'otherwise', 'prompt', 'come', 'even', 'much', 'lesser', 'simply', 'kept', 'silent', 'incident', 'press', 'also', 'received', 'information', 'investigating', 'pros

  and should_run_async(code)


#### Creating bigrams

In [71]:
# One list of adjacent-token pairs per document.
bigram_list = [tm.find_bigrams(doc_tokens) for doc_tokens in final_data]

print(bigram_list[0])

  and should_run_async(code)


[('relevant', 'time'), ('time', 'journalist'), ('journalist', 'weekly'), ('weekly', 'news'), ('news', 'magazine'), ('magazine', 'based'), ('based', 'same'), ('same', 'town'), ('town', 'applicant'), ('applicant', 'lawyer'), ('lawyer', 'government'), ('government', 'were'), ('were', 'their'), ('their', 'agent'), ('agent', 'case'), ('case', 'june'), ('june', 'article'), ('article', 'titled'), ('titled', 'shame'), ('shame', 'through'), ('through', 'silence'), ('silence', 'this'), ('this', 'piece'), ('piece', 'written'), ('written', 'applicant'), ('applicant', 'relevant'), ('relevant', 'reason'), ('reason', 'ashamed'), ('ashamed', 'towards'), ('towards', 'head'), ('head', 'municipal'), ('municipal', 'branch'), ('branch', 'office'), ('office', 'firstly'), ('firstly', 'city'), ('city', 'front'), ('front', 'main'), ('main', 'post'), ('post', 'office'), ('office', 'building'), ('building', 'falsely'), ('falsely', 'himself'), ('himself', 'street'), ('street', 'vendor'), ('vendor', 'market'), ('m

In [72]:
# Collapse every (first, second) pair into a single "first_second" token.
final_bigram_list = [
    ['_'.join(pair) for pair in doc_bigrams]
    for doc_bigrams in bigram_list
]
print(final_bigram_list[1])
# NOTE(review): rebinding `final_data` means the bigram-building cell, if run
# again afterwards, would pair up these joined tokens instead of the words.
final_data = final_bigram_list

['applicant_born', 'born_campaigner', 'campaigner_against', 'against_abortion', 'abortion_applicant', 'applicant_distributed', 'distributed_immediate', 'immediate_vicinity', 'vicinity_medical', 'medical_practice', 'practice_inter', 'inter_following', 'following_text', 'text_front', 'front_page', 'page_know', 'know_that', 'that_full', 'full_name', 'name_address', 'address_that', 'that_unlawful', 'unlawful_according', 'according_federal', 'federal_constitutional', 'constitutional_court', 'court_nach', 'nach_sind', 'sind_underneath', 'underneath_following', 'following_smaller', 'smaller_type', 'type_according', 'according_international', 'international_criminal', 'criminal_murder', 'murder_intentional', 'intentional_innocent', 'innocent_human', 'human_being', 'being_tode', 'tode_back', 'back_side', 'side_folded', 'folded_leaflet', 'leaflet_following', 'following_text', 'text_murder', 'murder_human', 'human_unlawful', 'unlawful_morally', 'morally_degraded', 'degraded_state', 'state_murder'

  and should_run_async(code)


#### TFIDF and LDA

In [73]:
# Dictionary -> bag-of-words corpus -> TF-IDF model, all built from the bigram tokens.
texts = final_data
id2word = corpora.Dictionary(texts)
corpus = list(map(id2word.doc2bow, texts))
tfidf = TfidfModel(corpus, id2word=id2word)

# Per-occurrence TF-IDF weights, highest first.
df_sorted = tm.view_frequencies(id2word, texts, corpus, tfidf)
df_sorted

  and should_run_async(code)


Unnamed: 0,Word,Frequency
84411,university_hospital,0.679984
149817,anonymous_letter,0.648767
163289,military_service,0.592997
147408,applicant_association,0.515443
69871,prime_minister,0.499757
...,...,...
45993,present_case,0.000056
45939,fact_that,0.000056
129446,interference_with,0.000055
157021,interference_with,0.000051


In [74]:
# filter_tf_idf assigns the filtered documents back into the list it is given.
# Hand it a shallow copy so `corpus` keeps the unfiltered bags-of-words and this
# cell produces the same result every time it is run.
corpus_filter, id2word_filter = tm.filter_tf_idf(id2word, texts, list(corpus), tfidf)
lda_model = tm.perform_lda(corpus_filter, id2word_filter)

  and should_run_async(code)


#### Visualisation

Processed

In [24]:
# Interactive topic view for whatever model is currently bound to `lda_model`;
# the label above this cell is only accurate if that dataset was the one loaded.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus_filter, id2word_filter, mds="mmds", R=30)
vis

  and should_run_async(code)


Lemmatized

In [41]:
# Interactive topic view for whatever model is currently bound to `lda_model`;
# the label above this cell is only accurate if that dataset was the one loaded.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus_filter, id2word_filter, mds="mmds", R=30)
vis

  and should_run_async(code)


Only Stop Word Removal

In [58]:
# Interactive topic view for whatever model is currently bound to `lda_model`;
# the label above this cell is only accurate if that dataset was the one loaded.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus_filter, id2word_filter, mds="mmds", R=30)
vis

  and should_run_async(code)


No Pre-processing

In [75]:
# Interactive topic view for whatever model is currently bound to `lda_model`;
# the label above this cell is only accurate if that dataset was the one loaded.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus_filter, id2word_filter, mds="mmds", R=30)
vis

  and should_run_async(code)


In [76]:
# Dataframe with first 5 words of each topic

# With formatted=False each entry of `results` is (topic_id, [(term, prob), ...]).
results = lda_model.show_topics(num_topics=10, num_words=5, log=False, formatted=False)

# Build all rows first and construct the frame once: no repeated pd.concat onto an
# empty frame, and no fixed-size index loops that break when a topic has fewer terms.
topics_df = pd.DataFrame(
    [
        {
            'Topic': topic_id,
            'Top 5 Terms': [term for term, _ in term_probs],
            'Prob': [prob for _, prob in term_probs],
        }
        for topic_id, term_probs in results
    ],
    columns=['Topic', 'Top 5 Terms', 'Prob'],
)

topics_df.head(10)

  and should_run_async(code)


Unnamed: 0,Topic,Top 5 Terms,Prob
0,0,"[second_applicant, first_applicant, editorial_...","[0.0025694198, 0.0025201715, 0.002372393, 0.00..."
1,1,"[hate_speech, court_cassation, wall_account, a...","[0.00792788, 0.003534366, 0.0027776486, 0.0026..."
2,2,"[administrative_court, military_service, supre...","[0.014369766, 0.010394671, 0.005361035, 0.0053..."
3,3,"[breach_peace, statistical_data, access_offici...","[0.0031214887, 0.001970042, 0.0016735342, 0.00..."
4,4,"[applicant_company, radio_television, national...","[0.022619745, 0.006330287, 0.0048440224, 0.004..."
5,5,"[penal_code, section_penal, commit_suicide, th...","[0.0013777969, 0.0012725499, 0.0010093744, 0.0..."
6,6,"[applicant_company, supreme_court, penal_code,...","[0.0078093857, 0.004838982, 0.00316657, 0.0024..."
7,7,"[court_appeal, personal_data, search_engine, r...","[0.008935143, 0.0062825386, 0.0043751593, 0.00..."
8,8,"[court_cassation, article_code, president_cour...","[0.007679069, 0.0051480443, 0.0036496827, 0.00..."
9,9,"[first_applicant, publish_advertisement, supre...","[0.0033783664, 0.0031147255, 0.0028985357, 0.0..."


  and should_run_async(code)


Coherence score: https://neptune.ai/blog/pyldavis-topic-modelling-exploration-tool-that-every-nlp-data-scientist-should-know