In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import adhtools.utils
import pandas as pd
import os
import numpy as np

## Load corpus

In [None]:
import glob

# Directory that holds the per-book XML files of the corpus.
in_dir = '/media/sf_VBox_Shared/Arabic/Fiqh/2019-02-08-fiqh-newfiles-light10-chapters/'

# glob.glob() returns matches in filesystem-dependent order, so sort before
# slicing: otherwise the selected subset (and the row order of every table
# indexed by `fnames`) can differ between machines and runs.
# NOTE(review): only the first three files are kept -- presumably a quick-run
# subset; confirm before producing final results.
book_files = sorted(glob.glob(os.path.join(in_dir, '*.xml')))[:3]

# Bare file names (no directory part), in the same order as `book_files`.
fnames = [os.path.basename(fn) for fn in book_files]
print(len(book_files))

In [None]:
# Plain-text stop-word list, one entry per line, read as UTF-8.
path_to_stopwordlist = '/media/sf_VBox_Shared/Arabic/arabic_stop-words_7-8-2018.txt'

# Read inside a context manager so the file handle is closed deterministically
# (the bare open() in a comprehension left it to the garbage collector).
with open(path_to_stopwordlist, 'r', encoding='utf-8') as stopword_file:
    external_stopwords = [line.strip() for line in stopword_file]

In [None]:
# Build the text corpus from the selected book files via the project helper.
# NOTE(review): presumably yields one string per book, assembled from the
# 'proposed_root' field -- confirm against adhtools.utils.corpus_str.
corpus = adhtools.utils.corpus_str(
    book_files,
    analyzer=False,
    field='proposed_root',
)

## Make document-term matrix

In [None]:
# Bag-of-words settings: drop the external stop words, and use document-frequency
# cut-offs (min_df / max_df) to prune very rare and very common terms.
vectorizer_options = {
    'stop_words': external_stopwords,
    'min_df': 2,
    'max_df': 0.9,
}
vectorizer = CountVectorizer(**vectorizer_options)

# Document-term count matrix for the corpus.
X = vectorizer.fit_transform(corpus)

In [None]:
# Vocabulary terms, positioned to match the columns of X.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method on older installations. Kept as a plain list so later cells can index
# it exactly as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
len(feature_names)

In [None]:
# Show up to 50 vocabulary terms that have a stored entry in the first row of X,
# i.e. example words from the first document.
first_doc_term_ids = X[0].indices
np.array(feature_names)[first_doc_term_ids][:50]

In [None]:
# For every document take the term with the highest count, then tabulate how
# often each term is the top one (20 most frequent shown).
top_term_ids = X.argmax(axis=1)
top_terms = np.array(feature_names)[top_term_ids]
most_imp_words = pd.Series(top_terms.flatten())
most_imp_words.value_counts().head(20)

## Train model

In [None]:
# Number of topics to fit; also embedded in the names of the saved output files.
nr_topics = 20

In [None]:
import lda

# Topic model from the `lda` package, seeded for repeatable runs.
# NOTE(review): n_iter=1 performs a single sampling iteration -- presumably a
# smoke-test setting; confirm the intended iteration count for real results.
lda_options = dict(n_topics=nr_topics, n_iter=1, random_state=1)
model = lda.LDA(**lda_options)
model.fit(X)

In [None]:
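# Alternative (disabled): scikit-learn's LatentDirichletAllocation instead of the `lda` package.
# NOTE: if re-enabled as written, the assignment rebinds the name `lda` (shadowing the
# imported `lda` module), and the later cells would have to read from `lda` rather than `model`.
# lda = LatentDirichletAllocation(n_components=nr_topics, random_state=0, max_iter=50)
# document_topics = lda.fit_transform(X)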

In [None]:
# Per-document topic weights taken from the fitted model (its `doc_topic_`
# attribute); exported later with one row per input file.
# NOTE(review): presumably shape (n_documents, nr_topics) -- confirm.
document_topics = model.doc_topic_

## Save files

In [None]:
# Output directory for the pickled model and the CSV exports.
# NOTE(review): hard-coded absolute path; nothing in this notebook creates it,
# so it is assumed to exist already.
fp_out = '/media/sf_VBox_Shared/Arabic/Analyses/Fiqh_final/topicmodelling/'

In [None]:
import pickle

# Persist the fitted model; the file name carries the number of topics.
model_path = os.path.join(fp_out, 'model_light10_{}.pkl'.format(nr_topics))
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Document-topic table: one row per book file (indexed by file name), written to CSV.
document_topics_csv = os.path.join(
    fp_out,
    'fiqh_light10_document_topics_{}.csv'.format(nr_topics),
)
df_document_topics = pd.DataFrame(data=document_topics, index=fnames)
df_document_topics.to_csv(document_topics_csv)

In [None]:
# Shape of the fitted topic-term matrix.
# NOTE(review): expected to be (nr_topics, len(feature_names)) -- confirm.
# Equivalent for the disabled scikit-learn model: lda.components_.shape
model.components_.shape

In [None]:
# For each topic (one row of model.components_), take the column indices of the
# 10 highest-weighted terms, ordered from highest to lowest weight.
# argsort sorts ascending, so the last 10 columns are the top terms; the
# reversal must act on the column axis ([:, ::-1]). A bare [::-1] would flip
# the row order instead, so row i would no longer describe topic i and the
# words would stay in ascending order.
top_term_ids = np.argsort(model.components_, axis=1)[:, -10:][:, ::-1]
topic_words = pd.DataFrame(top_term_ids)

In [None]:
# Replace every vocabulary index in the table by the term it stands for.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use the new method when this pandas version has it, else fall back.
# NOTE: this rebinds `topic_words`, so the cell is not re-runnable on its own --
# rebuild the index table first (the terms cannot be used as list indices).
_elementwise = topic_words.map if hasattr(topic_words, 'map') else topic_words.applymap
topic_words = _elementwise(lambda term_id: feature_names[term_id])

In [None]:
# Export the top-words-per-topic table; the file name carries the number of topics.
topics_csv = os.path.join(fp_out, 'fiqh_light10_topics_{}.csv'.format(nr_topics))
topic_words.to_csv(topics_csv)

In [None]:
# Display the table of top words per topic.
topic_words