In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import adhtools.utils
import pandas as pd
import os
import numpy as np

## Load corpus

In [None]:
import glob

# Directory holding one XML file per book chapter.
in_dir = '/media/sf_VBox_Shared/Arabic/Fiqh/2019-01-10-Fiqh-LIGHT10-chapters/'

# glob.glob() accepts a single pathname pattern, so the directory and the
# '*.xml' wildcard have to be joined into one pattern first.
# Sorting makes the document order (and hence the row order of the
# document-term matrix) reproducible across runs and file systems.
book_files = sorted(glob.glob(os.path.join(in_dir, '*.xml')))

# Bare file names, later used as row labels for the per-document output.
fnames = [os.path.basename(fn) for fn in book_files]
print(len(book_files))

In [4]:
# External stop-word list: a UTF-8 text file with one stop word per line.
path_to_stopwordlist = '/media/sf_VBox_Shared/Arabic/arabic_stop-words_7-8-2018.txt'

# Read inside a `with` block so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open(path_to_stopwordlist, 'r', encoding='utf-8') as stopword_file:
    external_stopwords = [line.strip() for line in stopword_file]

In [5]:
# Build the corpus from the book files with the project helper, reading the
# 'proposed_root' field.
# NOTE(review): presumably yields one text per file, in the order of
# book_files — confirm against adhtools.utils.corpus_str.
corpus = adhtools.utils.corpus_str(
    book_files,
    analyzer=False,
    field='proposed_root',
)

## Make document-term matrix

In [6]:
# Bag-of-words counts. Terms on the external stop-word list are dropped, as
# are terms occurring in fewer than 2 documents (min_df) or in more than 90%
# of the documents (max_df).
vectorizer_options = dict(stop_words=external_stopwords, min_df=2, max_df=0.9)
vectorizer = CountVectorizer(**vectorizer_options)

# Document-term matrix: one row per document, one column per vocabulary term.
X = vectorizer.fit_transform(corpus)

In [7]:
# Vocabulary terms, in the column order of X.
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()` when available and fall back on old versions.
# The result is kept as a plain list so later cells can index it as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
len(feature_names)

78526

In [8]:
# Some example words from the first document: take the column indices stored
# for row 0 of X, map them to vocabulary terms and show the first 50.
first_doc_columns = X[0].indices
vocabulary = np.array(feature_names)
vocabulary[first_doc_columns][:50]

array(['ايام', 'ببعد', 'قيد', 'حاشي', 'ورايت', 'توب', 'تقبل', 'ثلاث',
       'استتاب', 'نااب', 'ول', 'وقصاص', 'عدل', 'اهل', 'وباغ', 'رد',
       'مجمع', 'رتد', 'حا', 'لد', 'زنا', 'ثبوت', 'محصن', 'كز', 'فليحرر',
       'كغير', 'نيت', 'صريح', 'قرب', 'يحتاج', 'طاع', 'صح', 'نم', 'صيام',
       'يصح', 'عتق', 'كافر', 'ظاهر', 'بذم', 'تبق', 'برا', 'يرج', 'مرض',
       'لكبر', 'عجز', 'صوم', 'رقيق', 'كفر', 'جن', 'فصل'], dtype='<U13')

In [9]:
# For every document (row of X) find the column with the highest count and
# translate it to its term; then show the 20 terms that are most often the
# top term of a document.
vocab = np.array(feature_names)
top_term_idx = X.argmax(axis=1)
most_imp_words = pd.Series(vocab[top_term_idx].flatten())
most_imp_words.value_counts().head(20)

ول      587
اب      452
صلا     373
اذ      313
صل      276
زكا     194
مال     189
بيع     189
حج      185
حد      183
ام      164
مالك    157
شهاد    125
قتل     115
عتق     114
طلاق    112
ارض      96
وص       96
امام     91
غسل      90
dtype: int64

## Train model

In [20]:
# Number of topics to fit; also embedded in the names of the CSV files
# written at the end of the notebook.
nr_topics = 60

In [21]:
import lda

# Fit a topic model with the `lda` package on the document-term matrix.
# The seed is fixed so repeated runs are comparable; progress is reported
# through the package's INFO log messages.
lda_settings = {'n_topics': nr_topics, 'n_iter': 500, 'random_state': 1}
model = lda.LDA(**lda_settings)
model.fit(X)

INFO:lda:n_documents: 12107
INFO:lda:vocab_size: 78526
INFO:lda:n_words: 26487842
INFO:lda:n_topics: 60
INFO:lda:n_iter: 500
INFO:lda:<0> log likelihood: -329962036
INFO:lda:<10> log likelihood: -269955058
INFO:lda:<20> log likelihood: -241192245
INFO:lda:<30> log likelihood: -235990664
INFO:lda:<40> log likelihood: -233759236
INFO:lda:<50> log likelihood: -232532366
INFO:lda:<60> log likelihood: -231771662
INFO:lda:<70> log likelihood: -231248755
INFO:lda:<80> log likelihood: -230882717
INFO:lda:<90> log likelihood: -230569937
INFO:lda:<100> log likelihood: -230341985
INFO:lda:<110> log likelihood: -230158724
INFO:lda:<120> log likelihood: -230002953
INFO:lda:<130> log likelihood: -229880099
INFO:lda:<140> log likelihood: -229754821
INFO:lda:<150> log likelihood: -229615090
INFO:lda:<160> log likelihood: -229519526
INFO:lda:<170> log likelihood: -229427169
INFO:lda:<180> log likelihood: -229329622
INFO:lda:<190> log likelihood: -229257073
INFO:lda:<200> log likelihood: -229207947
INFO

KeyboardInterrupt: 

In [None]:
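# Alternative (not used): scikit-learn's LDA implementation.
# NOTE: if re-enabled, rename the variable — assigning to `lda` would shadow
# the `lda` package imported in the cell above.
# lda = LatentDirichletAllocation(n_components=nr_topics, random_state=0, max_iter=50)
# document_topics = lda.fit_transform(X)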

In [None]:
# Document-topic weights of the fitted model.
# NOTE(review): assumed to have one row per document, in the same order as
# book_files/fnames (it is labelled with fnames when saved) — confirm.
document_topics = model.doc_topic_

## Save files

In [None]:
# Output directory for the CSV files written below (hard-coded absolute path;
# adjust for your machine).
# NOTE(review): presumably has to exist before the to_csv calls — confirm.
fp_out = '/media/sf_VBox_Shared/Arabic/Analyses/Fiqh_final/topicmodelling/'

In [None]:
# Save the document-topic weights, one row per book file (labelled by file
# name); the number of topics is part of the output file name.
document_topics_csv = os.path.join(
    fp_out, 'fiqh_light10_document_topics_{}.csv'.format(nr_topics))
df_document_topics = pd.DataFrame(document_topics, index=fnames)
df_document_topics.to_csv(document_topics_csv)

In [None]:
# Sanity check on the fitted topic-word matrix.
# NOTE(review): expected shape is (nr_topics, len(feature_names)) — confirm.
# (Equivalent for the scikit-learn variant: lda.components_.shape)
model.components_.shape

In [None]:
# Vocabulary indices of the highest-weighted words of every topic.
# np.argsort sorts ascending, so the *columns* are reversed (axis 1) to get a
# descending ranking per topic. Reversing with a plain [::-1] would flip the
# rows instead, i.e. relabel topic k as topic nr_topics-1-k and leave the
# words in ascending order.
# Result: row i = topic i, column 0 = most important word of that topic.
n_top_words = 10
word_ranking = np.argsort(model.components_, axis=1)[:, ::-1]
topic_words = pd.DataFrame(word_ranking[:, :n_top_words])

In [None]:
# Translate the vocabulary indices into the words themselves with a single
# vectorized lookup (DataFrame.applymap is deprecated in recent pandas).
# NOTE: this overwrites topic_words, so the cell that computes the indices
# must be re-run before this cell can be run again.
topic_words = pd.DataFrame(
    np.asarray(feature_names)[topic_words.values],
    index=topic_words.index,
    columns=topic_words.columns,
)

In [None]:
# Save the top-word table; the number of topics is part of the file name.
topics_csv = os.path.join(fp_out, 'fiqh_light10_topics_{}.csv'.format(nr_topics))
topic_words.to_csv(topics_csv)

In [None]:
# Display the table of top words per topic for visual inspection.
topic_words