In [17]:
import os

import gensim
import numpy as np
from gensim import corpora
from gensim.models import LdaModel

#### Series wise Topic Detection

First we need to replace the chapter names with the correct chapter order

In [1]:
# Duplicated from test_eda.ipynb; it should be moved into one shared place in the scripts.
def chapter_order(filename):
    """Build a "<book>.<chapter>" label from a chapter filename.

    The filename is split on underscores. The first field is used as the
    book number and the third field as the chapter number, left-padded with
    zeros to a width of two (for example "1_x_3" becomes "1.03").

    The padding is applied to the whole third field, so anything else in
    that field (such as a file extension) counts toward the width.

    Raises IndexError if the filename has fewer than three
    underscore-separated fields.
    """
    fields = filename.split('_')
    return f"{fields[0]}.{fields[2].zfill(2)}"

Creating Chapter name and text dictionary

In [6]:
# Build a {chapter label: list of tokens} dictionary for the MB chapters,
# one entry per chapter file.
# Just like text_sentiment_analysis.ipynb

mb_dir = '../2_Text_Preprocessing/TMBD_Chapters_lemma'

# os.listdir returns entries in arbitrary order and can include hidden files
# or sub-directories (e.g. .ipynb_checkpoints); keep only regular files.
mb_filenames = [
    name for name in os.listdir(mb_dir)
    if not name.startswith('.') and os.path.isfile(os.path.join(mb_dir, name))
]

mb_chapter_texts = {}

# Read the files in sorted chapter-label order so that the dictionary, and
# every corpus built from it, has the same document order on every run.
for filename in sorted(mb_filenames, key=lambda name: (chapter_order(name), name)):
    filepath = os.path.join(mb_dir, filename)
    with open(filepath, 'r', encoding='utf-8') as chapter_file:
        chapter_text = chapter_file.read()
        # Whitespace tokenisation: the value stored is a list of tokens.
        mb_chapter_texts[chapter_order(filename)] = chapter_text.split()

In [7]:
# Build a {chapter label: list of tokens} dictionary for the IR chapters,
# one entry per chapter file.
# same as test_sentiment_analysis.ipynb

ir_dir = '../2_Text_Preprocessing/IR_Chapters_lemma'

# os.listdir returns entries in arbitrary order and can include hidden files
# or sub-directories (e.g. .ipynb_checkpoints); keep only regular files.
ir_filenames = [
    name for name in os.listdir(ir_dir)
    if not name.startswith('.') and os.path.isfile(os.path.join(ir_dir, name))
]

ir_chapter_texts = {}

# Read the files in sorted chapter-label order so that the dictionary, and
# every corpus built from it, has the same document order on every run.
for filename in sorted(ir_filenames, key=lambda name: (chapter_order(name), name)):
    filepath = os.path.join(ir_dir, filename)
    with open(filepath, 'r', encoding='utf-8') as chapter_file:
        chapter_text = chapter_file.read()
        # Whitespace tokenisation: the value stored is a list of tokens.
        ir_chapter_texts[chapter_order(filename)] = chapter_text.split()

Now we have both dictionaries with chapter names as keys and chapter text as values.

In [16]:
# Build the gensim vocabulary from the MB chapter token lists, then convert
# each chapter's tokens with doc2bow (same chapter order as the dictionary).
mb_corpus_dict = corpora.Dictionary(list(mb_chapter_texts.values()))
mb_corpus = [mb_corpus_dict.doc2bow(tokens) for tokens in mb_chapter_texts.values()]

In [18]:
# Build the gensim vocabulary from the IR chapter token lists, then convert
# each chapter's tokens with doc2bow (same chapter order as the dictionary).
ir_corpus_dict = corpora.Dictionary(list(ir_chapter_texts.values()))
ir_corpus = [ir_corpus_dict.doc2bow(tokens) for tokens in ir_chapter_texts.values()]

In [19]:
# Fit a 5-topic LDA model on the MB corpus, with a fixed random_state.
mb_lda_model = LdaModel(
    corpus=mb_corpus,
    id2word=mb_corpus_dict,
    num_topics=5,
    random_state=42,
    passes=15,
)

# Five highest-weighted words per topic.
mb_topics = mb_lda_model.print_topics(num_words=5)

for topic in mb_topics:
    print(topic)

(0, '0.018*"said" + 0.012*"art" + 0.007*"know" + 0.007*"humans" + 0.007*"like"')
(1, '0.009*"said" + 0.008*"feed" + 0.008*"humans" + 0.008*"one" + 0.007*"didnt"')
(2, '0.013*"said" + 0.007*"could" + 0.007*"humans" + 0.007*"would" + 0.007*"feed"')
(3, '0.007*"said" + 0.006*"art" + 0.006*"humans" + 0.006*"overse" + 0.005*"could"')
(4, '0.009*"said" + 0.007*"station" + 0.007*"like" + 0.007*"indah" + 0.007*"humans"')


In [20]:
# Fit a 5-topic LDA model on the IR corpus, with a fixed random_state.
ir_lda_model = LdaModel(
    corpus=ir_corpus,
    id2word=ir_corpus_dict,
    num_topics=5,
    random_state=42,
    passes=15,
)

# Five highest-weighted words per topic.
ir_topics = ir_lda_model.print_topics(num_words=5)

for topic in ir_topics:
    print(topic)

(0, '0.018*"station" + 0.012*"said" + 0.012*"seivarden" + 0.010*"sword" + 0.008*"would"')
(1, '0.017*"lieutenant" + 0.013*"said" + 0.011*"would" + 0.011*"one" + 0.009*"awn"')
(2, '0.016*"said" + 0.010*"captain" + 0.009*"would" + 0.008*"one" + 0.007*"seivarden"')
(3, '0.010*"said" + 0.009*"would" + 0.009*"one" + 0.006*"could" + 0.005*"lieutenant"')
(4, '0.007*"kalr" + 0.006*"ship" + 0.006*"would" + 0.005*"medic" + 0.005*"one"')


#### Series wise TF-IDF for Theme Keywords

In [21]:
from sklearn. feature_extraction.text import TfidfVectorizer

In [27]:
# TF-IDF over the MB chapters: each chapter's token list is re-joined into a
# single space-separated string, giving one document per chapter.
mb_vectorizer = TfidfVectorizer(stop_words='english')

mb_tfidf_matrix = mb_vectorizer.fit_transform(
    [" ".join(tokens) for tokens in mb_chapter_texts.values()]
)

In [35]:
# Rank the MB vocabulary by each term's TF-IDF weight summed over all
# chapters, and print the ten highest-scoring terms.
mb_feature_names = mb_vectorizer.get_feature_names_out()

# The column sum of a sparse matrix can come back as a 2-D (1 x n_features)
# np.matrix; flatten it to a plain 1-D array so that indexing and slicing
# below operate on individual terms rather than on whole rows.
mb_tfidf_sums = np.asarray(mb_tfidf_matrix.sum(axis=0)).ravel()
mb_sorted_indices = mb_tfidf_sums.argsort()[::-1]  # highest total weight first

mb_top_words = [mb_feature_names[i] for i in mb_sorted_indices[:10]]
for word in mb_top_words:
    print(word)

said
humans
didnt
feed
art
like
know
ratthi
mensah
human


In [30]:
# TF-IDF over the IR chapters: each chapter's token list is re-joined into a
# single space-separated string, giving one document per chapter.
ir_vectorizer = TfidfVectorizer(stop_words='english')

ir_tfidf_matrix = ir_vectorizer.fit_transform(
    [" ".join(tokens) for tokens in ir_chapter_texts.values()]
)

In [36]:
# Rank the IR vocabulary by each term's TF-IDF weight summed over all
# chapters, and print the ten highest-scoring terms.
ir_feature_names = ir_vectorizer.get_feature_names_out()

# The column sum of a sparse matrix can come back as a 2-D (1 x n_features)
# np.matrix; flatten it to a plain 1-D array so that indexing and slicing
# below operate on individual terms rather than on whole rows.
ir_tfidf_sums = np.asarray(ir_tfidf_matrix.sum(axis=0)).ravel()
ir_sorted_indices = ir_tfidf_sums.argsort()[::-1]  # highest total weight first

ir_top_words = [ir_feature_names[i] for i in ir_sorted_indices[:10]]
for word in ir_top_words:
    print(word)

said
lieutenant
seivarden
captain
station
tisarwat
didnt
fleet
ship
know


So there is not much difference from the word frequencies in test_eda.ipynb:

- Both texts have "said" as a theme, which makes sense for books with a lot of dialogue, or for any kind of written fiction.

- In MB, "ratthi" ranks higher than "mensah", which is interesting. "art" (ART) is also high up, and humans are still a central theme: despite this series being about machine intelligences, the focus is entirely human. Interesting.

- In the IR series, titles (e.g. "lieutenant", "captain") are still as prominent as the names of what I take to be the main characters.

#### Next thing: chapter-wise theme and topic detection