# Part 1: LDA Analysis

## Environment setup

In [2]:
import os
import pandas as pd

## Install LDA prerequisites

## Train tokenizer

## Load Part 1 sample data - CC hearing transcripts

In [3]:
# fetch and load thesis sample Part 1 dataset
import glob

file_list = glob.glob(os.path.join(os.getcwd(), 
                                   "/home/schwing/Documents/masters_thesis/data/CC_data_clean/",
                                  "*.txt"))

corpus = []

for file_path in file_list:
    with open(file_path) as f_input:
        corpus.append(f_input.read())

type(corpus)

list

In [5]:
# preview sample data
corpus

['charlene cieslik giles dixon ryan mueller peter warrack patel controls limited know wouldnt want discount theres still amount people operate cash criminals know bit atm machine provides instant sort onramp fiat crypto smaller scale transactions provides instant delivery virtual currency reliance third party hold assets instantly delivered purchasing like buying chocolate bar speak think another use case virtual atm machines also know good bad suppose frauds explosions things like quadriga people become increasingly concerned sharing personal information sign onto platform youre subject identity verification requirements based opening account youre required provide identity documents potentially proof address document potentially know countries youre required provide social insurance social security',
 'karen best martland exhibit affidavit corporal karen best made february redacted martland thank corporal madam registrar dont think need affidavit fact ill leave screen participants fi

In [7]:
print(len(corpus))
print(corpus[:5])

17431
['charlene cieslik giles dixon ryan mueller peter warrack patel controls limited know wouldnt want discount theres still amount people operate cash criminals know bit atm machine provides instant sort onramp fiat crypto smaller scale transactions provides instant delivery virtual currency reliance third party hold assets instantly delivered purchasing like buying chocolate bar speak think another use case virtual atm machines also know good bad suppose frauds explosions things like quadriga people become increasingly concerned sharing personal information sign onto platform youre subject identity verification requirements based opening account youre required provide identity documents potentially proof address document potentially know countries youre required provide social insurance social security', 'karen best martland exhibit affidavit corporal karen best made february redacted martland thank corporal madam registrar dont think need affidavit fact ill leave screen participan

## Data Pre-processing

In [18]:
# Further pre-processing:
#     Tokenization
#     Lemmatization
#     Stemming
%pip install gensim
%pip install nltk
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

import nltk
nltk.download('wordnet')

/bin/bash: /home/schwing/miniconda3/envs/tf/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Note: you may need to restart the kernel to use updated packages.
/bin/bash: /home/schwing/miniconda3/envs/tf/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package wordnet to /home/schwing/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
# Write functions to perform lemmatize, stem, remove short words on the dataset

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2:
            result.append(lemmatize_stemming(token))
    return result

In [23]:
# Sample document to preview before and after preprocessing.

doc_sample = corpus[corpus == 400]
stemmer = SnowballStemmer("english")
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['charlene', 'cieslik', 'giles', 'dixon', 'ryan', 'mueller', 'peter', 'warrack', 'patel', 'controls', 'limited', 'know', 'wouldnt', 'want', 'discount', 'theres', 'still', 'amount', 'people', 'operate', 'cash', 'criminals', 'know', 'bit', 'atm', 'machine', 'provides', 'instant', 'sort', 'onramp', 'fiat', 'crypto', 'smaller', 'scale', 'transactions', 'provides', 'instant', 'delivery', 'virtual', 'currency', 'reliance', 'third', 'party', 'hold', 'assets', 'instantly', 'delivered', 'purchasing', 'like', 'buying', 'chocolate', 'bar', 'speak', 'think', 'another', 'use', 'case', 'virtual', 'atm', 'machines', 'also', 'know', 'good', 'bad', 'suppose', 'frauds', 'explosions', 'things', 'like', 'quadriga', 'people', 'become', 'increasingly', 'concerned', 'sharing', 'personal', 'information', 'sign', 'onto', 'platform', 'youre', 'subject', 'identity', 'verification', 'requirements', 'based', 'opening', 'account', 'youre', 'required', 'provide', 'identity', 'documents', 'potenti

In [33]:
# convert list to dataframe
df = pd.DataFrame(corpus)
print(df)

                                                       0
0      charlene cieslik giles dixon ryan mueller pete...
1      karen best martland exhibit affidavit corporal...
2      kenneth ackles chewka professional relationshi...
3      len meilleur latimer delbigio well proper ques...
4      daryl tottenham latimer continuing ultimately ...
...                                                  ...
17426  daryl tottenham gruber focus interview may bri...
17427  bert pereboom wahid abdallah rose furnishing d...
17428  daryl tottenham latimer correct yeah understan...
17429  len meilleur mcfee made clear read legal opini...
17430  robert barber rajotte concerning im sure grasp...

[17431 rows x 1 columns]


In [34]:
df['transcript_page'] = df[0] 
print(df[:5])

                                                   0  \
0  charlene cieslik giles dixon ryan mueller pete...   
1  karen best martland exhibit affidavit corporal...   
2  kenneth ackles chewka professional relationshi...   
3  len meilleur latimer delbigio well proper ques...   
4  daryl tottenham latimer continuing ultimately ...   

                                     transcript_page  
0  charlene cieslik giles dixon ryan mueller pete...  
1  karen best martland exhibit affidavit corporal...  
2  kenneth ackles chewka professional relationshi...  
3  len meilleur latimer delbigio well proper ques...  
4  daryl tottenham latimer continuing ultimately ...  


In [35]:
processed_df = df['transcript_page'].map(preprocess)
processed_df[:10]

0    [charlen, cieslik, gile, dixon, ryan, mueller,...
1    [karen, best, martland, exhibit, affidavit, co...
2    [kenneth, ackl, chewka, profession, relationsh...
3    [len, meilleur, latim, delbigio, proper, quest...
4    [daryl, tottenham, latim, continu, ultim, goal...
5    [fred, pinnock, senkpiel, didnt, refer, conver...
6    [sam, macleod, mccleeri, close, impact, assess...
7    [ezekiel, chhoa, lindze, herring, erin, tolfo,...
8    [larri, vander, graaf, latim, time, entri, did...
9    [charlen, cieslik, gile, dixon, ryan, mueller,...
Name: transcript_page, dtype: object

# Bag of Words on the Dataset

## Create a dictionary from the processed pages

In [38]:
# dictionary created from 'processed_df' with the number of times a word appears 
# in the training set

dictionary = gensim.corpora.Dictionary(processed_df)

count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 account
1 address
2 asset
3 atm
4 bad
5 bar
6 base
7 bite
8 buy
9 case
10 cash


In [46]:
# Gensim filter_extremes
# Filter out tokens that appear in less than 1 documents (absolute number ) OR
# More than 0.25 documents (fraction of total corpus size, not absolute)
# After the above steps, keep only the first 500,000 most frequent tokens

dictionary.filter_extremes(no_below=10, no_above=0.25, keep_n=500000)

In [47]:
# Gensim doc2bow
# For each document, create a dictionary reporting how many words, how many times words appear

bow_corpus = [dictionary.doc2bow(doc) for doc in processed_df]
bow_corpus[400]

[(2, 2),
 (11, 1),
 (12, 1),
 (18, 1),
 (22, 1),
 (26, 1),
 (37, 1),
 (41, 1),
 (42, 1),
 (44, 1),
 (48, 2),
 (52, 3),
 (53, 1),
 (64, 2),
 (66, 1),
 (69, 3),
 (71, 1),
 (73, 1),
 (112, 1),
 (133, 1),
 (134, 2),
 (163, 2),
 (184, 1),
 (187, 1),
 (197, 7),
 (219, 1),
 (220, 1),
 (232, 8),
 (240, 2),
 (327, 1),
 (350, 1),
 (375, 3),
 (386, 2),
 (511, 2),
 (590, 1),
 (633, 1),
 (638, 2),
 (682, 2),
 (721, 1),
 (730, 2),
 (759, 1),
 (799, 1),
 (866, 1),
 (891, 1),
 (930, 1),
 (961, 2),
 (1015, 1),
 (1071, 1),
 (1196, 1),
 (1423, 3),
 (1483, 2),
 (1580, 2),
 (1621, 1),
 (1624, 1),
 (1635, 1),
 (1668, 1),
 (1876, 1)]

In [48]:
# Preview bag of words for sample prepreprocessed document

bow_doc_400 = bow_corpus[400]

for i in range(len(bow_doc_400)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_400[i][0], 
                                               dictionary[bow_doc_400[i][0]], 
bow_doc_400[i][1]))

Word 2 ("asset") appears 2 time.
Word 11 ("charlen") appears 1 time.
Word 12 ("cieslik") appears 1 time.
Word 18 ("currenc") appears 1 time.
Word 22 ("dixon") appears 1 time.
Word 26 ("gile") appears 1 time.
Word 37 ("mueller") appears 1 time.
Word 41 ("patel") appears 1 time.
Word 42 ("peopl") appears 1 time.
Word 44 ("peter") appears 1 time.
Word 48 ("provid") appears 2 time.
Word 52 ("requir") appears 3 time.
Word 53 ("ryan") appears 1 time.
Word 64 ("there") appears 2 time.
Word 66 ("transact") appears 1 time.
Word 69 ("virtual") appears 3 time.
Word 71 ("warrack") appears 1 time.
Word 73 ("your") appears 1 time.
Word 112 ("term") appears 1 time.
Word 133 ("direct") appears 1 time.
Word 134 ("effect") appears 2 time.
Word 163 ("canada") appears 2 time.
Word 184 ("regim") appears 1 time.
Word 187 ("say") appears 1 time.
Word 197 ("busi") appears 7 time.
Word 219 ("kind") appears 1 time.
Word 220 ("mean") appears 1 time.
Word 232 ("servic") appears 8 time.
Word 240 ("add") appears 2 

# TF-IDF

In [49]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.05469279212051635),
 (1, 0.05750302234354389),
 (2, 0.07254813306482037),
 (3, 0.215974457123673),
 (4, 0.09785571658848884),
 (5, 0.09409235070097963),
 (6, 0.05801249324035406),
 (7, 0.0522294901252956),
 (8, 0.07636850012164986),
 (9, 0.04701565766669469),
 (10, 0.03989364902039228),
 (11, 0.10442489446051363),
 (12, 0.10455210845598799),
 (13, 0.04925785816284791),
 (14, 0.07261103903608576),
 (15, 0.06363754413527692),
 (16, 0.04757694312532212),
 (17, 0.14533945878122467),
 (18, 0.08642058643882346),
 (19, 0.10145515037811564),
 (20, 0.1165163546652653),
 (21, 0.15576946981724146),
 (22, 0.10455210845598799),
 (23, 0.08240124328195654),
 (24, 0.1411761728097476),
 (25, 0.08193285497319364),
 (26, 0.10455210845598799),
 (27, 0.05430804602362193),
 (28, 0.07387367002443304),
 (29, 0.2187246347706395),
 (30, 0.0714145810045944),
 (31, 0.03259585381453226),
 (32, 0.47908093558256565),
 (33, 0.12182825061204956),
 (34, 0.06351910892052415),
 (35, 0.06598680896866942),
 (36, 0.2

# LDA using BoW

In [55]:
# Train LDA model using gensim.models.LdaMulticore and save it as 'lda_model'
# topics = 20
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=20, id2word=dictionary, passes=2, workers=2)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.013*"law" + 0.009*"recommend" + 0.009*"inform" + 0.008*"societi" + 0.008*"govern" + 0.007*"rise" + 0.007*"work" + 0.007*"enforc" + 0.007*"real" + 0.006*"estat"
Topic: 1 
Words: 0.014*"inform" + 0.010*"compani" + 0.009*"public" + 0.008*"canada" + 0.007*"person" + 0.006*"account" + 0.006*"law" + 0.006*"there" + 0.005*"act" + 0.005*"crime"
Topic: 2 
Words: 0.018*"polic" + 0.016*"gpeb" + 0.014*"investig" + 0.011*"meet" + 0.010*"offic" + 0.009*"casino" + 0.009*"inform" + 0.009*"bclc" + 0.009*"understand" + 0.008*"rcmp"
Topic: 3 
Words: 0.015*"bclc" + 0.013*"provid" + 0.012*"gpeb" + 0.009*"work" + 0.007*"inform" + 0.007*"servic" + 0.006*"sourc" + 0.006*"come" + 0.006*"investig" + 0.006*"review"
Topic: 4 
Words: 0.011*"peopl" + 0.010*"cash" + 0.008*"there" + 0.008*"thing" + 0.007*"risk" + 0.007*"like" + 0.007*"sort" + 0.007*"look" + 0.006*"way" + 0.006*"busi"
Topic: 5 
Words: 0.037*"document" + 0.027*"exhibit" + 0.017*"registrar" + 0.016*"okay" + 0.015*"page" + 0.011*"madam

In [57]:
# Train LDA model using gensim.models.LdaMulticore and save it as 'lda_model'
# topics = 10
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.009*"risk" + 0.008*"look" + 0.007*"account" + 0.007*"isaac" + 0.007*"law" + 0.006*"regul" + 0.006*"rule" + 0.006*"work" + 0.005*"question" + 0.005*"feder"
Topic: 1 
Words: 0.024*"investig" + 0.012*"inform" + 0.011*"crime" + 0.009*"law" + 0.008*"polic" + 0.008*"enforc" + 0.007*"benefici" + 0.007*"registri" + 0.007*"crimin" + 0.006*"unit"
Topic: 2 
Words: 0.012*"work" + 0.012*"mortgag" + 0.010*"data" + 0.010*"broker" + 0.009*"martland" + 0.007*"document" + 0.007*"peopl" + 0.007*"page" + 0.006*"number" + 0.006*"investig"
Topic: 3 
Words: 0.049*"commission" + 0.020*"registrar" + 0.018*"wit" + 0.012*"question" + 0.011*"hear" + 0.008*"madam" + 0.008*"minut" + 0.008*"exhibit" + 0.007*"proceed" + 0.007*"ask"
Topic: 4 
Words: 0.008*"countri" + 0.007*"crime" + 0.007*"inform" + 0.007*"canada" + 0.005*"data" + 0.005*"way" + 0.005*"there" + 0.005*"work" + 0.005*"order" + 0.005*"transact"
Topic: 5 
Words: 0.013*"casino" + 0.009*"investig" + 0.008*"concern" + 0.007*"come" + 0.007*"

# LDA using TF-IDF

In [58]:
# topics = 10
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.007*"mortgag" + 0.006*"broker" + 0.003*"commission" + 0.003*"patel" + 0.003*"registrar" + 0.003*"document" + 0.003*"investig" + 0.003*"real" + 0.003*"exhibit" + 0.003*"estat"
Topic: 1 Word: 0.004*"commission" + 0.003*"investig" + 0.003*"inform" + 0.003*"document" + 0.002*"question" + 0.002*"canada" + 0.002*"page" + 0.002*"exhibit" + 0.002*"look" + 0.002*"casino"
Topic: 2 Word: 0.004*"inform" + 0.003*"data" + 0.003*"bank" + 0.003*"slide" + 0.003*"investig" + 0.003*"canada" + 0.002*"document" + 0.002*"account" + 0.002*"commission" + 0.002*"transact"
Topic: 3 Word: 0.005*"commission" + 0.004*"exhibit" + 0.004*"martland" + 0.003*"inform" + 0.003*"registrar" + 0.003*"document" + 0.003*"countri" + 0.003*"investig" + 0.003*"work" + 0.003*"wit"
Topic: 4 Word: 0.004*"account" + 0.003*"law" + 0.003*"investig" + 0.003*"crime" + 0.003*"inform" + 0.003*"rule" + 0.003*"lawyer" + 0.003*"feder" + 0.003*"order" + 0.003*"case"
Topic: 5 Word: 0.004*"compani" + 0.003*"benefici" + 0.003*"r

In [59]:
# topics = 20
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=20, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.006*"mortgag" + 0.005*"broker" + 0.004*"rise" + 0.003*"registri" + 0.003*"real" + 0.003*"inform" + 0.003*"regist" + 0.003*"compani" + 0.003*"estat" + 0.003*"commission"
Topic: 1 Word: 0.003*"page" + 0.003*"document" + 0.003*"data" + 0.003*"commission" + 0.003*"transact" + 0.003*"investig" + 0.003*"bell" + 0.003*"okay" + 0.003*"registrar" + 0.002*"inform"
Topic: 2 Word: 0.005*"account" + 0.004*"cash" + 0.004*"transact" + 0.003*"profession" + 0.003*"inform" + 0.003*"data" + 0.003*"casino" + 0.003*"case" + 0.003*"investig" + 0.003*"involv"
Topic: 3 Word: 0.004*"bclc" + 0.004*"iiget" + 0.004*"cash" + 0.004*"gpeb" + 0.004*"investig" + 0.003*"casino" + 0.003*"recal" + 0.003*"understand" + 0.003*"minist" + 0.003*"come"
Topic: 4 Word: 0.003*"cash" + 0.003*"work" + 0.003*"law" + 0.003*"lawyer" + 0.003*"rule" + 0.003*"societi" + 0.003*"player" + 0.003*"bclc" + 0.003*"rock" + 0.003*"investig"
Topic: 5 Word: 0.003*"atm" + 0.003*"inform" + 0.003*"casino" + 0.003*"cash" + 0.003*"cri

## Evaluate LDA model for BoW and TF-IDF

In [60]:
processed_df[400]

['charlen',
 'cieslik',
 'gile',
 'dixon',
 'ryan',
 'mueller',
 'peter',
 'warrack',
 'patel',
 'regim',
 'start',
 'defin',
 'virtual',
 'asset',
 'servic',
 'provid',
 'defin',
 'deal',
 'virtual',
 'asset',
 'mean',
 'effect',
 'bring',
 'place',
 'registr',
 'requir',
 'captur',
 'definit',
 'money',
 'servic',
 'busi',
 'there',
 'know',
 'there',
 'qualifi',
 'transact',
 'activ',
 'tradit',
 'money',
 'servic',
 'busi',
 'trade',
 'deal',
 'negoti',
 'instrument',
 'effect',
 'kind',
 'transfer',
 'fund',
 'behalf',
 'theyv',
 'add',
 'deal',
 'virtual',
 'currenc',
 'definit',
 'say',
 'bring',
 'registr',
 'requir',
 'your',
 'requir',
 'regist',
 'busi',
 'fintrac',
 'add',
 'element',
 'distinguish',
 'local',
 'money',
 'servic',
 'busi',
 'foreign',
 'money',
 'servic',
 'busi',
 'clarifi',
 'definit',
 'foreign',
 'money',
 'servic',
 'busi',
 'term',
 'engag',
 'engag',
 'activ',
 'place',
 'busi',
 'canada',
 'direct',
 'servic',
 'peopl',
 'live',
 'canada',
 'provid'

In [63]:
# Evaluation of LDA with BoW
for index, score in sorted(lda_model[bow_corpus[400]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9663997292518616	 
Topic: 0.009*"risk" + 0.008*"look" + 0.007*"account" + 0.007*"isaac" + 0.007*"law" + 0.006*"regul" + 0.006*"rule" + 0.006*"work" + 0.005*"question" + 0.005*"feder"

Score: 0.02499489299952984	 
Topic: 0.017*"cash" + 0.009*"like" + 0.008*"peopl" + 0.008*"say" + 0.007*"come" + 0.007*"bank" + 0.007*"want" + 0.007*"thing" + 0.006*"yeah" + 0.006*"look"


In [66]:
# Evaluation of LDA with TF-IDF, topic 0 and 12
for index, score in sorted(lda_model_tfidf[corpus_tfidf[400]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.6563827991485596	 
Topic: 0.006*"mortgag" + 0.005*"broker" + 0.004*"rise" + 0.003*"registri" + 0.003*"real" + 0.003*"inform" + 0.003*"regist" + 0.003*"compani" + 0.003*"estat" + 0.003*"commission"

Score: 0.22173601388931274	 
Topic: 0.007*"commission" + 0.006*"registrar" + 0.005*"wit" + 0.004*"exhibit" + 0.004*"document" + 0.004*"madam" + 0.003*"page" + 0.003*"question" + 0.003*"casino" + 0.003*"patel"
