## Corpus Analysis

In this notebook we will analyse the cleaned texts of the utopia and dystopia corpora.

In [1]:
import uv



## Embedding Analysis

In [4]:
from gensim.models import Word2Vec

In [2]:
# Build the corpus used by the cells below. `uv` is a project-local helper module, so the
# exact return shape is not visible here — presumably a list of token lists, one per book
# (it is iterated that way and passed to Word2Vec further down). TODO confirm.
all_texts = uv.get_all_texts(uv.get_lemmatized_eutopia_text_dicts())

Note: `all_texts` does not include the scanned PDFs.


In [16]:
# Corpus size: number of texts, total token count, and mean tokens per text.
n_texts = len(all_texts)
# Sum per-text lengths instead of materialising one flat list of every token
# (tens of millions of entries) just to call len() on it.
n_words = sum(len(text) for text in all_texts)
print(n_texts)
print(n_words)
print(n_words / n_texts)

721
21626264
29994.818307905687


In [7]:
# Train word2vec on the whole corpus: 100-dimensional vectors, 10-token context window.
# NOTE(review): `size` is the pre-4.0 gensim keyword (4.x names it `vector_size`) —
# confirm the installed gensim version before re-running this cell.
w2vmodel_utopia = Word2Vec(sentences=all_texts, size=100, window=10)

In [8]:
# For each search term returned by the project helper, print the term followed by
# its nearest neighbours (word, similarity) in the corpus-trained embedding.
for word in uv.lemmatize_search_words():
    print(word)
    nearest = w2vmodel_utopia.wv.most_similar(word)
    print(nearest)

justice
[('uphold', 0.789935827255249), ('conscience', 0.7629621028900146), ('unjust', 0.7477636933326721), ('equity', 0.7415547966957092), ('liberty', 0.7372094392776489), ('violation', 0.7356650233268738), ('enforce', 0.7297465801239014), ('obedience', 0.727350652217865), ('violate', 0.720452070236206), ('pledge', 0.7133994102478027)]
man
[('fellow', 0.5468976497650146), ('wise', 0.5112128853797913), ('smind', 0.5058407783508301), ('woman', 0.4831593930721283), ('honest', 0.4708898067474365), ('ignorant', 0.45521101355552673), ('coward', 0.44704219698905945), ('other-', 0.44057124853134155), ('brute', 0.43696117401123047), ('superior', 0.43116819858551025)]
woman
[('sex', 0.6113781929016113), ('female', 0.5849823951721191), ('wife', 0.5642853379249573), ('elderly', 0.5607531070709229), ('male', 0.552223265171051), ('war_club', 0.5465771555900574), ('unmarried', 0.541377067565918), ('marry', 0.5400214791297913), ('young', 0.538881242275238), ('aged', 0.5342499613761902)]
artificial_in

[('datum', 0.9105061888694763), ('monitor', 0.8812961578369141), ('coordinate', 0.881293535232544), ('electronic', 0.8799422383308411), ('programming', 0.878645658493042), ('tape', 0.8716105222702026), ('data', 0.8612123727798462), ('video', 0.8597277402877808), ('recording', 0.8580148220062256), ('random', 0.8571476936340332)]
digital
[('independently', 0.9084386229515076), ('ocr', 0.8910764455795288), ('google', 0.8498484492301941), ('watermark', 0.8362740874290466), ('pageturner', 0.7966036796569824), ('upload', 0.7956128716468811), ('redistribute', 0.7943112850189209), ('tm', 0.7874166965484619), ('set_forth', 0.7808956503868103), ('wafer', 0.7767702341079712)]
industrial
[('competitive', 0.8487110733985901), ('capitalism', 0.8127597570419312), ('industry', 0.8106957674026489), ('organise', 0.7903134822845459), ('commercial', 0.7829523086547852), ('dustrial', 0.780693531036377), ('eco', 0.7734135985374451), ('economic', 0.7730454206466675), ('economy', 0.7697626352310181), ('organi

## Visualisation Plans
Create embeddings for each book using doc2vec.

t-SNE plot of the books, coloured by:

- year, 
- author demography, 
- or even just utopia/dystopia


Visualise selected words in the corpus-trained embedding space versus a general-purpose, pre-trained embedding space (e.g. the Google News model).



In [24]:
import gensim

In [28]:
taggedDocs = []

In [29]:
i = 0

In [30]:
# Tag every document with its position in the combined utopia + dystopia list.
# Built in a single expression (rather than appending to a list and bumping a counter
# that live in other cells) so re-running this cell rebuilds the list from scratch
# instead of appending duplicates with ever-increasing tags.
taggedDocs = [
    gensim.models.doc2vec.TaggedDocument(words=text, tags=[i])
    for i, text in enumerate(all_cleaned_texts_utopia + all_cleaned_texts_dystopia)
]

In [26]:
# Untrained Doc2Vec model: 50-dimensional document vectors, words occurring fewer than
# 2 times are ignored, 40 training epochs. Vocabulary building and training follow below.
doc2vec_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [31]:
# Build the model vocabulary from the tagged documents; must run before training.
doc2vec_model.build_vocab(taggedDocs)

In [32]:
# Train on the tagged documents, reusing the document count and epoch count
# already stored on the model rather than repeating the literals.
doc2vec_model.train(taggedDocs, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

In [33]:
utopia_vecs = {}

In [34]:
# Infer one doc2vec vector per entry of `cleaned_texts_pdf` and store it, wrapped in a
# one-element list, under the same key.
for text, tokens in cleaned_texts_pdf.items():
    inferred = doc2vec_model.infer_vector(tokens)
    utopia_vecs[text] = [inferred]

In [35]:
# Same as for the PDF mapping: one inferred vector (wrapped in a one-element list)
# per entry of `cleaned_texts_epub_txt`, added to the shared utopia dict.
for text, tokens in cleaned_texts_epub_txt.items():
    inferred = doc2vec_model.infer_vector(tokens)
    utopia_vecs[text] = [inferred]

In [36]:
dystopia_vecs = {}

In [37]:
# Infer one doc2vec vector per entry of `cleaned_texts_dystopia` and store it, wrapped
# in a one-element list, under the same key.
for text, tokens in cleaned_texts_dystopia.items():
    inferred = doc2vec_model.infer_vector(tokens)
    dystopia_vecs[text] = [inferred]

In [41]:
import numpy as np

In [42]:
# One label per utopia document vector; 0.0 marks the utopia class.
utopia_labels = np.zeros(len(utopia_vecs))

In [46]:
len(utopia_vecs)

725

In [47]:
len(dystopia_vecs)

67

In [43]:
# One label per dystopia document vector; 1.0 marks the dystopia class.
dystopia_labels = np.ones(len(dystopia_vecs))

In [44]:
from yellowbrick.text import TSNEVisualizer, UMAPVisualizer

In [45]:
umap = UMAPVisualizer()

In [None]:
# Stack the utopia and dystopia document vectors into one (n_docs, n_dims) matrix.
# Each dict value is a one-element list holding the vector; np.vstack turns each into
# a single row, so no separate unwrapping step is needed.
doc_vecs = np.vstack(list(utopia_vecs.values()) + list(dystopia_vecs.values()))
# Labels in the same row order as doc_vecs: 0 = utopia, 1 = dystopia.
labels = np.concatenate([utopia_labels, dystopia_labels])
umap.fit(doc_vecs, labels)
umap.show()

## Linguistic Counts and Measures

In [None]:
def text_summaries(cleaned_texts, num_topics=5):
    """Print summary statistics for a tokenised corpus and fit an LDA topic model.

    Prints the number of documents, the mean document length in tokens, the
    five highest tf-idf scores, the twenty most frequent words and the topics
    of an LDA model trained on the corpus.

    Parameters
    ----------
    cleaned_texts : sequence of sequence of str
        One token sequence per document. It is iterated several times and
        passed to ``len()``, so it must be a sized, re-iterable sequence
        rather than a one-shot generator.
    num_topics : int, default 5
        Number of topics for the LDA model.

    Returns
    -------
    tuple
        ``(cleaned_texts, ldamodel)`` — the input object itself and the
        fitted LDA model.
    """
    # Imported locally so the function works on a fresh kernel: none of these
    # names are imported anywhere else in the notebook.
    import operator
    from collections import Counter

    from gensim.corpora import Dictionary
    from gensim.models import LdaModel, TfidfModel

    # Raw frequency of every token across the whole corpus.
    word_use = Counter(word for text in cleaned_texts for word in text)

    total_docs = len(cleaned_texts)
    total_words = sum(word_use.values())
    # Guard the empty corpus so the summary does not die on a division by zero.
    avg_len = total_words / total_docs if total_docs else 0

    # Ascending stable sort followed by reverse(): most frequent words first.
    sorted_words = sorted(word_use.items(), key=operator.itemgetter(1))
    sorted_words.reverse()

    dictionary = Dictionary(cleaned_texts)
    corpus = [dictionary.doc2bow(text) for text in cleaned_texts]
    ldamodel = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=10, iterations=500)

    tf_idf_model = TfidfModel(corpus)
    tf_idf_texts = tf_idf_model[corpus]

    # NOTE(review): a word occurring in several documents keeps only the score from
    # the last document that contains it — confirm this is the intended aggregation.
    d = {dictionary.get(token_id): value for doc in tf_idf_texts for token_id, value in doc}
    sorted_d = sorted(d.items(), key=operator.itemgetter(1))
    sorted_d.reverse()

    print("Total number of documents: " + str(total_docs))
    print("Average length of text: " + str(avg_len))
    print(" ")
    print("The top 5 tf-idf scores:")
    print(sorted_d[0:5])
    print(" ")
    print("The top 20 most used words:")
    print(sorted_words[0:20])
    print(" ")
    # Report the number of topics actually requested, not a hard-coded 5.
    print("Topic Model with " + str(num_topics) + " topics")
    print(ldamodel.print_topics())
    return cleaned_texts, ldamodel

TODO:

- compare bootstrapped model (utopia) vs google news model
- plots by decade, plots by demography (gender, race, location, sexuality)
- time periods vs usage of certain concepts, such as nature, climate
- formation of labour/labor before and after 1970s
- lemmatization
- microgenres by time period
- words and markers of genre
- elements across time

Research questions: Are utopian and dystopian fiction genres of infrastructure? How? Is this more the case for utopia than dystopia? Literature of boredom (infrastructure) versus romance?