In [1]:
%load_ext autoreload
%autoreload 2

In [41]:
import pandas as pd
from sklearn.feature_extraction.text import (CountVectorizer,
                                             TfidfVectorizer)
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
from os import getcwd, path
import os
from tqdm.autonotebook import tqdm

#https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# Gensim
import gensim
import gensim.corpora as corpora

# plotting
from matplotlib import pyplot as plt

In [3]:
from modern_slavery_registry.utils import (sort_dict)

In [4]:
# use if autocompletion is not working
%config Completer.use_jedi = False

In [5]:
RANDOM_STATE = 40

In [37]:
PROJECT_PATH = getcwd()
PROJECT_PATH = PROJECT_PATH.replace("\\notebooks", "")
DATA_PATH = PROJECT_PATH + "\\data"
SHEETS_PATH = DATA_PATH + "\\sheets"

In [9]:
data = pd.read_excel(f"{SHEETS_PATH}\\subset_data.xlsx")
data.fillna("#NA", inplace=True)
data = data[["URL", "final_statement_cleaned"]]
n_sentences = len(data)
print(f"Found {n_sentences} non-NA statements")

Found 10078 non-NA statements


In [10]:
data.head()

Unnamed: 0,URL,final_statement_cleaned
0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,km sh foor eum hold europe ltd aldersgate stre...
1,https://1spatial.com/who-we-are/legal/modern-s...,modern slavery act policy statement home solut...
2,https://www.shazans.com/slavery-and-human-traf...,slavery human traffic statement shazans shazan...
3,https://www.business-humanrights.org/sites/def...,modern slavery atement atement make pursuant s...
4,https://www.2agriculture.com/wp-content/upload...,fh modern slavery act slavery human traffic st...


In [11]:
NGRAMS = (2, 2)
ngram = np.max(NGRAMS)
ngram_sentences = []

for sentence in tqdm(data["final_statement_cleaned"].values):
    sentence = sentence.split()
    ngram_sentence = []
    len_sentence = len(sentence)
    for i in range(len_sentence-ngram + 1):
        ngram_sentence.append(" ".join(sentence[i:i+ngram]))
#     # preparing ngrams at end of sentence
#     for i in range(len_sentence-ngram+1, len_sentence):
#         ngram_sentence.append(" ".join(
#             sentence[i:] + ["$PAD$"] * (ngram -  len(sentence[i :]))))
    ngram_sentences.append(ngram_sentence)

  0%|          | 0/10078 [00:00<?, ?it/s]

In [13]:
print(ngram_sentences[0][:20])

['km sh', 'sh foor', 'foor eum', 'eum hold', 'hold europe', 'europe ltd', 'ltd aldersgate', 'aldersgate street', 'street london', 'london ecia', 'ecia hd', 'hd tel', 'tel mail', 'mail keulongen', 'keulongen uk', 'uk kline', 'kline com', 'com modern', 'modern slavery', 'slavery act']


In [15]:
ngram_term_freq = {} # to keep track of term frequency
ngram_document_freq = {} # to keep track of document-term frequency
ngram_last_doc = {}
for i, ngram_sentence in tqdm(enumerate(ngram_sentences)):
    for ngram in ngram_sentence:  
        if ngram not in ngram_term_freq:
            ngram_term_freq[ngram] = 1
            ngram_document_freq[ngram] = 1
        else:
            ngram_term_freq[ngram] += 1
            if ngram_last_doc[ngram] != i:
                ngram_document_freq[ngram] += 1
        ngram_last_doc[ngram] = i
        
ngram_document_freq = {ngram: freq/n_sentences for ngram, freq in ngram_document_freq.items()} 
del ngram_last_doc

|          | 0/? [00:00<?, ?it/s]

In [16]:
print(f"Vocab size: {len(ngram_term_freq)}") # without padding last ngrams word in each sentence

Vocab size: 1284373


In [18]:
ngram_stat_table = pd.DataFrame({"ngram": ngram_term_freq.keys(), 
                                 "term_freq": ngram_term_freq.values(),
                                 "doc_freq": ngram_document_freq.values()})
ngram_stat_table.describe()

Unnamed: 0,term_freq,doc_freq
count,1284373.0,1284373.0
mean,4.79701,0.0003908479
std,138.1305,0.003576183
min,1.0,9.922604e-05
25%,1.0,9.922604e-05
50%,1.0,9.922604e-05
75%,2.0,0.0001984521
max,90160.0,0.9295495


In [21]:
MIN_DF = .001 
MAX_DF = .1  
ngram = np.max(NGRAMS)
ngram_covered = len(ngram_stat_table[ngram_stat_table["doc_freq"].between(MIN_DF, MAX_DF)])
print(f"{ngram}-grams vocab size with doc frequency ({MIN_DF}, {MAX_DF}): "
      f"{ngram_covered}")
print(f"{ngram}-grams vocab size with doc frequency ({MIN_DF}, {MAX_DF}): "
      f"{ngram_covered*100/len(ngram_document_freq):.3f} %")

2-grams vocab size with doc frequency (0.001, 0.1): 59789
2-grams vocab size with doc frequency (0.001, 0.1): 4.655 %


In [22]:
count_vect = CountVectorizer(ngram_range=NGRAMS, min_df=MIN_DF, max_df=MAX_DF)
X = count_vect.fit_transform(data["final_statement_cleaned"].values) 
print(f"shape: {X.shape}")

shape: (10078, 59789)


In [23]:
word2idx = count_vect.vocabulary_
idx2word = {idx: word for word, idx in word2idx.items()}

data_for_model = []
for row in tqdm(X.toarray()):
    idxs = np.where(row > 0)
    data_for_model.append([(idx, row[idx]) for idx in idxs[0]])

  0%|          | 0/10078 [00:00<?, ?it/s]

In [24]:
%%time
# Build LDA model
N_TOPICS = 10
lda_model = gensim.models.ldamodel.LdaModel(corpus=data_for_model,
                                           id2word=idx2word,
                                           num_topics=N_TOPICS, 
                                           random_state=RANDOM_STATE,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

Wall time: 2min 56s


In [25]:
for topic in lda_model.print_topics():
    print(f"{topic[0]}: {topic[1:]}")
    print()

0: ('0.005*"follow policies" + 0.005*"potential modern" + 0.005*"review exist" + 0.005*"understand potential" + 0.005*"exist suppliers" + 0.005*"aim ensure" + 0.005*"concern relate" + 0.005*"activities supply" + 0.005*"risk relate" + 0.004*"relate business"',)

1: ('0.003*"areas supply" + 0.003*"professional service" + 0.003*"compliance modern" + 0.002*"slavery traffic" + 0.002*"reduce risk" + 0.002*"limit company" + 0.002*"occur within" + 0.002*"combat modern" + 0.002*"end st" + 0.002*"assess potential"',)

2: ('0.014*"espa ol" + 0.007*"compliance company" + 0.007*"customer care" + 0.006*"business organization" + 0.005*"relevant employment" + 0.005*"mental physical" + 0.005*"knowingly support" + 0.005*"find involve" + 0.005*"comply provision" + 0.004*"include reference"',)

3: ('0.006*"modern day" + 0.006*"day slavery" + 0.005*"financial conduct" + 0.005*"conduct authority" + 0.004*"regulate financial" + 0.004*"authorize regulate" + 0.003*"long stand" + 0.003*"gender pay" + 0.003*"reg

In [62]:
# common_texts = [['interface', 'computer', 'computer', 'computer'],
#                 ['survey', 'user', 'computer', 'system', 'response', 'time'],
#                 ['eps', 'user', 'interface', 'system'],
#                 ['system', 'system', 'eps'],
#                 ['user', 'response', 'time'],
#                 ['trees'],
#                 ['graph', 'trees', 'zebra'],
#                 ['graph', 'minors', 'trees', 'you', 'you', 'you'],
#                 ['graph', 'minors', 'survey', 'human', 'human', 'human', 'human', 'human', 'human', 'human', 'human']]
# common_dictionary = corpora.Dictionary(common_texts)
# [common_dictionary.doc2bow(text) for text in common_texts]

In [45]:
mallet_path = PROJECT_PATH + "\\models\\mallet-2.0.8"
os.environ['MALLET_HOME'] = mallet_path 
mallet_path = mallet_path + "\\bin\\mallet.bat"

ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, 
                                             corpus=data_for_model,
                                             num_topics=N_TOPICS,
                                             id2word=idx2word, 
                                             random_seed=RANDOM_STATE)

In [47]:
for topic in ldamallet.show_topics():
    print(f"{topic[0]}: {topic[1:]}")
    print()

0: ('nan*"promote use" + nan*"promote understand" + nan*"promote workplace" + nan*"promote work" + nan*"promote welfare" + nan*"promote value" + nan*"promotion human" + nan*"promote support" + nan*"promote ten" + nan*"promotional goods"',)

1: ('nan*"promote use" + nan*"promote understand" + nan*"promote workplace" + nan*"promote work" + nan*"promote welfare" + nan*"promote value" + nan*"promotion human" + nan*"promote support" + nan*"promote ten" + nan*"promotional goods"',)

2: ('nan*"promote use" + nan*"promote understand" + nan*"promote workplace" + nan*"promote work" + nan*"promote welfare" + nan*"promote value" + nan*"promotion human" + nan*"promote support" + nan*"promote ten" + nan*"promotional goods"',)

3: ('nan*"promote use" + nan*"promote understand" + nan*"promote workplace" + nan*"promote work" + nan*"promote welfare" + nan*"promote value" + nan*"promotion human" + nan*"promote support" + nan*"promote ten" + nan*"promotional goods"',)

4: ('nan*"promote use" + nan*"promot

  topic = topic / topic.sum()  # normalize to probability dist
