In [20]:
import os
import gensim
from gensim import corpora
from gensim.models import LdaModel

#### Series wise Topic Detection

First we need to replace the chapter names with the correct chapter order

In [21]:
# same function in test_eda.ipynb so need to put this in one place in the the scripts
def chapter_order(filename):

    book_number = filename.split('_')[0]
    chapter_number = filename.split('_')[2].zfill(2)
    new_name = book_number + '.' + chapter_number
    
    return new_name

Creating Chapter name and text dictionary

In [22]:
# Combine all chapters to one list of chapter text
# one element of the list = one chapter
# Just like text_sentiment_analysis.ipynb

mb_dir = '../2_Text_Preprocessing/TMBD_Chapters_lemma'

mb_chapter_texts = {}

for filename in os.listdir(mb_dir):
    filepath =  os.path.join(mb_dir, filename)
    with open(filepath, 'r', encoding='utf-8') as chapter_file:
        chapter_text = chapter_file.read()
        mb_chapter_texts[chapter_order(filename)] = chapter_text.split()

In [23]:
# Combine all chapters to one list of chapter text
# one element of the list = one chapter
# same as test_sentiment_analysis.ipynb

ir_dir = '../2_Text_Preprocessing/IR_Chapters_lemma'

ir_chapter_texts = {}

for filename in os.listdir(ir_dir):
    filepath =  os.path.join(ir_dir, filename)
    with open(filepath, 'r', encoding='utf-8') as chapter_file:
        chapter_text = chapter_file.read()
        ir_chapter_texts[chapter_order(filename)] = chapter_text.split()

Now we have both dictionaries with chapter names as keys and chapter text as values.

In [24]:
mb_corpus_dict = corpora.Dictionary([values for key, values in mb_chapter_texts.items()])
mb_corpus = [mb_corpus_dict.doc2bow(values) for key, values in mb_chapter_texts.items()]

In [25]:
ir_corpus_dict = corpora.Dictionary([values for key, values in ir_chapter_texts.items()])
ir_corpus = [ir_corpus_dict.doc2bow(values) for key, values in ir_chapter_texts.items()]

In [26]:
mb_lda_model = LdaModel(corpus=mb_corpus, id2word=mb_corpus_dict, num_topics=10, random_state=42, passes=15)

mb_topics = mb_lda_model.print_topics(num_words=6)

for topic in mb_topics:
    print(topic)

(0, '0.014*"said" + 0.008*"humans" + 0.008*"artdrone" + 0.006*"ratthi" + 0.006*"think" + 0.006*"know"')
(1, '0.008*"humans" + 0.008*"security" + 0.008*"would" + 0.008*"feed" + 0.007*"one" + 0.007*"didnt"')
(2, '0.013*"said" + 0.009*"would" + 0.008*"humans" + 0.008*"could" + 0.007*"feed" + 0.007*"miki"')
(3, '0.000*"said" + 0.000*"would" + 0.000*"didnt" + 0.000*"could" + 0.000*"art" + 0.000*"like"')
(4, '0.016*"station" + 0.015*"indah" + 0.010*"said" + 0.009*"aylen" + 0.007*"could" + 0.007*"security"')
(5, '0.008*"secunit" + 0.008*"targetcontrolsystem" + 0.007*"said" + 0.006*"humans" + 0.005*"3" + 0.005*"adacol1"')
(6, '0.015*"art" + 0.014*"said" + 0.008*"humans" + 0.007*"could" + 0.007*"like" + 0.007*"know"')
(7, '0.013*"said" + 0.008*"one" + 0.007*"didnt" + 0.007*"feed" + 0.007*"could" + 0.007*"humans"')
(8, '0.000*"said" + 0.000*"like" + 0.000*"humans" + 0.000*"one" + 0.000*"could" + 0.000*"would"')
(9, '0.006*"iris" + 0.005*"thiago" + 0.005*"chamber" + 0.005*"targets" + 0.005*"human

Need to remove "said", "would", "could" as stop words for the MB series.

In [27]:
ir_lda_model = LdaModel(corpus=ir_corpus, id2word=ir_corpus_dict, num_topics=10, random_state=42, passes=15)

ir_topics = ir_lda_model.print_topics(num_words=6)

for topic in ir_topics:
    print(topic)

(0, '0.018*"sword" + 0.011*"seivarden" + 0.010*"said" + 0.010*"station" + 0.009*"tisarwat" + 0.009*"atagaris"')
(1, '0.014*"would" + 0.014*"lieutenant" + 0.013*"one" + 0.012*"said" + 0.009*"mianaai" + 0.009*"awn"')
(2, '0.013*"said" + 0.007*"one" + 0.006*"captain" + 0.006*"could" + 0.006*"would" + 0.005*"didnt"')
(3, '0.010*"would" + 0.008*"said" + 0.008*"strigan" + 0.007*"seivarden" + 0.007*"one" + 0.007*"could"')
(4, '0.000*"said" + 0.000*"would" + 0.000*"one" + 0.000*"seivarden" + 0.000*"lieutenant" + 0.000*"could"')
(5, '0.014*"said" + 0.010*"would" + 0.009*"captain" + 0.009*"station" + 0.008*"one" + 0.008*"seivarden"')
(6, '0.015*"said" + 0.012*"lieutenant" + 0.010*"would" + 0.010*"one" + 0.007*"could" + 0.006*"awn"')
(7, '0.000*"would" + 0.000*"said" + 0.000*"one" + 0.000*"captain" + 0.000*"lieutenant" + 0.000*"seivarden"')
(8, '0.028*"lieutenant" + 0.012*"awn" + 0.011*"said" + 0.011*"one" + 0.007*"esk" + 0.006*"would"')
(9, '0.000*"said" + 0.000*"would" + 0.000*"lieutenant" + 0.

I need to remove "said", "would", "could", "didn't" as stop words for the IR series.

#### Series wise TF-IDF for Theme Keywords

In [28]:
from sklearn. feature_extraction.text import TfidfVectorizer

In [29]:
mb_vectorizer = TfidfVectorizer(stop_words='english')

mb_tfidf_matrix = mb_vectorizer.fit_transform([" ".join(values) for key, values in mb_chapter_texts.items()])

In [30]:
mb_feature_names = mb_vectorizer.get_feature_names_out()

mb_tfidf_sums = mb_tfidf_matrix.sum(axis=0)
mb_sorted_indices = mb_tfidf_sums.argsort()[0, ::-1]

mb_top_words = [mb_feature_names[i] for i in mb_sorted_indices[:10]]
for word in mb_top_words[0][0][:10]:
    print(word)

said
humans
didnt
feed
art
like
know
ratthi
mensah
human


In [31]:
ir_vectorizer = TfidfVectorizer(stop_words='english')

ir_tfidf_matrix = ir_vectorizer.fit_transform([" ".join(values) for key, values in ir_chapter_texts.items()])

In [32]:
ir_feature_names = ir_vectorizer.get_feature_names_out()

ir_tfidf_sums = ir_tfidf_matrix.sum(axis=0)
ir_sorted_indices = ir_tfidf_sums.argsort()[0, ::-1]

ir_top_words = [ir_feature_names[i] for i in ir_sorted_indices[:10]]
for word in ir_top_words[0][0][:10]:
    print(word)

said
lieutenant
seivarden
captain
station
tisarwat
didnt
fleet
ship
know


So not much difference from the word frequencies in test_eda.ipynb

- both texts have "said" as a theme which makes sense for books made of a lot of dialogue or any kind of written fiction.

- in MB ratthi is a topic higher than mensah, which is interesting. ART is also higher up and humans still are a central theme. despite this series being about machine intelligences, the focus is entirely human. interesting.

- in IR series, the titles are still as important and what i take to be main characters. 

#### Further data cleaning

In [39]:
new_stop_words = set(["said", "would", "could", "didnt", "like", "know", "one", "get", "dont"])
new_stop_words

{'could', 'didnt', 'dont', 'get', 'know', 'like', 'one', 'said', 'would'}

In [40]:
# Recreating the mb_chapter_texts dictionary

mb_new_chapter_texts = {}

for filename in os.listdir(mb_dir):
    filepath =  os.path.join(mb_dir, filename)
    with open(filepath, 'r', encoding='utf-8') as chapter_file:
        chapter_text = chapter_file.read()
        mb_new_chapter_texts[chapter_order(filename)] = [word for word in chapter_text.split() if word not in new_stop_words]

In [41]:
# Recreating the ir_chapter_texts dictionary

ir_new_chapter_texts = {}

for filename in os.listdir(ir_dir):
    filepath =  os.path.join(ir_dir, filename)
    with open(filepath, 'r', encoding='utf-8') as chapter_file:
        chapter_text = chapter_file.read()
        ir_new_chapter_texts[chapter_order(filename)] = [word for word in chapter_text.split() if word not in new_stop_words]

#### Running the model again for Topic detection

In [42]:
mb_new_corpus_dict = corpora.Dictionary([values for key, values in mb_new_chapter_texts.items()])
mb_new_corpus = [mb_new_corpus_dict.doc2bow(values) for key, values in mb_new_chapter_texts.items()]

ir_new_corpus_dict = corpora.Dictionary([values for key, values in ir_new_chapter_texts.items()])
ir_new_corpus = [ir_new_corpus_dict.doc2bow(values) for key, values in ir_new_chapter_texts.items()]

In [43]:
mb_new_lda_model = LdaModel(corpus=mb_new_corpus, id2word=mb_new_corpus_dict, 
                            num_topics=10, random_state=42, passes=15)

mb_new_topics = mb_new_lda_model.print_topics(num_words=6)

for topic in mb_new_topics:
    print(topic)

(0, '0.010*"station" + 0.009*"feed" + 0.007*"humans" + 0.007*"security" + 0.006*"id" + 0.005*"human"')
(1, '0.007*"humans" + 0.006*"mensah" + 0.006*"bot" + 0.006*"shuttle" + 0.006*"feed" + 0.006*"secunit"')
(2, '0.012*"humans" + 0.006*"mensah" + 0.005*"human" + 0.005*"art" + 0.005*"feed" + 0.005*"security"')
(3, '0.010*"amena" + 0.008*"feed" + 0.007*"art" + 0.007*"eletra" + 0.005*"ras" + 0.005*"humans"')
(4, '0.011*"art" + 0.010*"humans" + 0.006*"ratthi" + 0.005*"feed" + 0.005*"secunit" + 0.005*"right"')
(5, '0.010*"mensah" + 0.008*"feed" + 0.006*"back" + 0.006*"habitat" + 0.005*"us" + 0.005*"ratthi"')
(6, '0.000*"humans" + 0.000*"feed" + 0.000*"going" + 0.000*"us" + 0.000*"mensah" + 0.000*"back"')
(7, '0.019*"miki" + 0.015*"abene" + 0.011*"feed" + 0.009*"wilken" + 0.006*"shuttle" + 0.006*"gerth"')
(8, '0.007*"us" + 0.007*"feed" + 0.006*"humans" + 0.005*"drones" + 0.005*"target" + 0.005*"two"')
(9, '0.007*"humans" + 0.007*"gurathin" + 0.006*"security" + 0.006*"feed" + 0.006*"ratthi" + 

In [44]:
ir_new_lda_model = LdaModel(corpus=ir_new_corpus, id2word=ir_new_corpus_dict, num_topics=10, random_state=42, passes=15)

ir_new_topics = ir_new_lda_model.print_topics(num_words=6)

for topic in ir_new_topics:
    print(topic)

(0, '0.011*"lieutenant" + 0.011*"station" + 0.011*"captain" + 0.008*"sir" + 0.005*"ship" + 0.005*"fleet"')
(1, '0.013*"station" + 0.011*"seivarden" + 0.005*"translator" + 0.004*"captain" + 0.004*"governor" + 0.004*"administrator"')
(2, '0.006*"mercy" + 0.006*"anaander" + 0.006*"captain" + 0.005*"kalr" + 0.005*"seivarden" + 0.005*"gun"')
(3, '0.000*"station" + 0.000*"lieutenant" + 0.000*"captain" + 0.000*"still" + 0.000*"fleet" + 0.000*"seivarden"')
(4, '0.014*"lieutenant" + 0.008*"station" + 0.007*"captain" + 0.006*"sword" + 0.006*"tisarwat" + 0.006*"still"')
(5, '0.011*"seivarden" + 0.007*"captain" + 0.006*"still" + 0.006*"ship" + 0.005*"tea" + 0.005*"even"')
(6, '0.024*"lieutenant" + 0.019*"awn" + 0.018*"mianaai" + 0.013*"anaander" + 0.011*"lord" + 0.007*"radch"')
(7, '0.000*"lieutenant" + 0.000*"seivarden" + 0.000*"captain" + 0.000*"even" + 0.000*"station" + 0.000*"ship"')
(8, '0.000*"lieutenant" + 0.000*"captain" + 0.000*"station" + 0.000*"even" + 0.000*"seivarden" + 0.000*"ship"')

further removing "didn't" , "like", "know", "one", "don't". i don't know if "one" is relevant in the IR series but since its there frequently in both series, i'm assuming its not and removing from both.

Okay, now it looks like we've taken out all the relevant topics/ characters. atleast i can attest that for the MB series.

#### For series wise TF-IDF 

In [46]:
mb_new_vectorizer = TfidfVectorizer(stop_words='english')

mb_new_tfidf_matrix = mb_new_vectorizer.fit_transform([" ".join(values) for key, values in mb_new_chapter_texts.items()])


mb_new_feature_names = mb_new_vectorizer.get_feature_names_out()

mb_new_tfidf_sums = mb_new_tfidf_matrix.sum(axis=0)
mb_new_sorted_indices = mb_new_tfidf_sums.argsort()[0, ::-1]

mb_new_top_words = [mb_new_feature_names[i] for i in mb_new_sorted_indices[:10]]
for word in mb_new_top_words[0][0][:10]:
    print(word)

humans
feed
art
ratthi
mensah
human
amena
id
station
iris


In [47]:
ir_new_vectorizer = TfidfVectorizer(stop_words='english')

ir_new_tfidf_matrix = ir_new_vectorizer.fit_transform([" ".join(values) for key, values in ir_new_chapter_texts.items()])


ir_new_feature_names = ir_new_vectorizer.get_feature_names_out()

ir_new_tfidf_sums = ir_new_tfidf_matrix.sum(axis=0)
ir_new_sorted_indices = ir_new_tfidf_sums.argsort()[0, ::-1]

ir_new_top_words = [ir_new_feature_names[i] for i in ir_new_sorted_indices[:10]]
for word in ir_new_top_words[0][0][:10]:
    print(word)

seivarden
lieutenant
captain
station
tisarwat
fleet
ship
sir
anaander
translator


This didn't change much.