analysing the vocabulary of texts at individual word level. see below for word frequency clouds (and numerical counts), TF-IDF scores, and bigrams :)))

code references:
 - https://earlyprint.org/jupyterbook/tf_idf.html 
 - https://www.machinelearningplus.com/nlp/gensim-tutorial/#10howtocreatebigramsandtrigramsusingphrasermodels 
 - https://www.markhneedham.com/blog/2015/02/12/pythongensim-creating-bigrams-over-how-i-met-your-mother-transcripts/
 - https://towardsdatascience.com/generate-meaningful-word-clouds-in-python-5b85f5668eeb

In [19]:
# importing required things

from collections import Counter, defaultdict
import os
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from nltk import ngrams, BigramCollocationFinder
from gensim.models import Phrases, phrases

def gettexts(folder):
    texts = []
    #list of lists of strings, each text broken up into individual token strings
    tokenized = []
    textnames = []
    for file in os.listdir(folder):
        path = os.path.join(folder,file)
        f = open(path,'r')
        data = f.readlines()[0]
        texts.append(data)
        name = file.split('.')[0]
        textnames.append(name)
        f.close()
    for text in texts:
        #tokenize by white space
        words = text.strip().split(' ')
        tokenized.append(words)
    return [tokenized, texts, textnames]
#i.e. index 0 gives list of tokens, 1 gives list of texts as one string, 2 gives list textnames

Wordclouds generated through term frequency

In [None]:
#term frequency & word clouds through wordcloud processing

wcdata = gettexts('/srv/data/EPTuningReplaced')

wctokens = wcdata[0]
wctexts = wcdata[1]
wcnames = wcdata[2]
fileTF = "A04813"

In [None]:
#use this for a single text 
# textstring = ' '.join(wctexts[wcnames.index(fileTF)]).lower()
#use this for a collection of texts

wholecorpusstring = ' '.join(wctexts)

# parameters to play with: min_word_length, collocations, collocation_threshold, stopwards

#single text
# wordcloud = WordCloud(stopwords=STOPWORDS, collocations=True, min_word_length=3).generate(textstring)
#corpus
wordcloud = WordCloud(stopwords=STOPWORDS, collocations=True, collocation_threshold=20, min_word_length=4).generate(wholecorpusstring)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

#single text
# textdict = wordcloud.process_text(textstring)
#corpus
textdict = wordcloud.process_text(wholecorpusstring)

wordfreq={k: v for k, v in sorted(textdict.items(),reverse=True, key=lambda item: item[1])}
relfreq=wordcloud.words_

# not using this, doesn't print nicely
# N=40
# print("word frequencies:", list(wordfreq.items())[:N])
# print("relative frequencies:", list(relfreq.items())[:N])
        

In [None]:
# outputting the numbers for frequencies

# combining word frequencies and relative frequencies into one dictionary for cleaner printing
result = defaultdict(list)
for freq in (wordfreq, relfreq):
    for key, value in freq.items():
        result[key].append(value)
headers = ('absolute frequency', 'relative frequency')

print(pd.DataFrame((result.values()), result.keys(), headers).head(n=20))


In [None]:
# setting up manual term frequency

count = CountVectorizer(ngram_range=(1,3))
X = count.fit_transform(wctexts)
X = X.toarray()
dataframe = pd.DataFrame(X, index =[name for name in wcnames], columns=count.get_feature_names_out())


In [None]:
#word cloud generation through term freqs above

topstrings = dataframe.loc[fileTF].sort_values(ascending=False)[:4000]
textdict = dataframe.loc[fileTF].sort_values(ascending=False).to_dict()

wordcloud2 = WordCloud(min_word_length = 3)
wordcloud2.generate_from_frequencies(textdict)

plt.imshow(wordcloud2, interpolation='bilinear')
plt.axis('off')
plt.show()

TF-IDF analysis: looking at a matrix to compare all texts, extracting TF-IDF scores of a single text, and generating wordclouds

In [None]:
tfidfdata = gettexts('/srv/data/EPTuningReplaced')

tfidftokens = tfidfdata[0]
tfidftexts = tfidfdata[1]
tfidfnames = tfidfdata[2]

#setting a text to sort by for TF-IDF analysis
basetext = 'A01092'

#load wordcounts onto dataframe
wordcounts = [Counter(t) for t in tfidftokens]
df = pd.DataFrame(wordcounts, index=[name for name in tfidfnames]).fillna(0)

In [None]:
# using transformer, generate table to compare tf-idfs across multiple texts

# normalization turned off
# sublinear term frequency scaling turned on (takes log of term frequencies and can help to de-emphasize function words like pronouns and articles)
tfidf = TfidfTransformer(norm=None, sublinear_tf=True)
results = tfidf.fit_transform(df)

table = pd.DataFrame(results.toarray(), index=df.index, columns=df.columns)

# columns are texts, using .head(25) to show top 25 terms
# sort using words with highest tfidf scores in specified basetext as an example
table.T.sort_values(by=[basetext], ascending=False).head(25)

In [None]:
# transformer version, but outputting tf-idf values for a single text, easier viewing

transformer = TfidfTransformer(norm=None, sublinear_tf=True, use_idf=True)
cv = CountVectorizer()
wc = cv.fit_transform(tfidftexts)
wctrans = transformer.fit_transform(wc)

single = pd.DataFrame(wctrans[tfidfnames.index(basetext)].T.todense(), index=cv.get_feature_names_out(), columns=[basetext + " TF-IDF"])
single = single.sort_values(basetext + ' TF-IDF', ascending=False)

print (single.head(25))

In [None]:
# tf-idf wordclouds - cannot just use wordcloud processing (rip)

tfidfcloud = WordCloud(min_word_length = 3)
tfidfcloud.generate_from_frequencies(single.to_dict()[basetext + ' TF-IDF'])

plt.imshow(tfidfcloud, interpolation='bilinear')
plt.axis('off')
plt.show()



bigram generation: denoting training/testing corpus and generating common bigrams sorted by descending frequency

In [14]:
bigramdata = gettexts('/srv/data/targetCorpusNOSTOP')

bigramtokens = bigramdata[0]
bigramtexts = bigramdata[1]
bigramnames = bigramdata[2]

In [15]:
#splitting texts by index

#given in list of list of strings
training = []
#given list of strings
testing = []
#testing = [word for word in gettexts('/srv/data/targetCorpusSTOP')[1]]
for t in bigramtokens:
    if bigramtokens.index(t)%2==0:
        training.append(t)
    else: 
        for word in t:
            testing.append(word)

In [20]:
#generating bigrams, can take a bit long lmao

# training bigram model: parameters incl min count, threshold (from -1 to 1), scoring (npmi = more robust?)
bigrammodel = Phrases(training, min_count = 3, threshold=-0.5, scoring='npmi', connector_words=phrases.ENGLISH_CONNECTOR_WORDS)

# getting the frequency(?) of bigrams within test
bgcount = Counter(b for b in bigrammodel[testing] if len(b.split("_")) > 1 )

# printing top 20 most common bigrams
print(pd.DataFrame(dict(bgcount).values(), index=dict(bgcount).keys(), columns=['bigram frequency']).sort_values('bigram frequency', ascending=False).head(n=20))

            bigram frequency
they_have              16956
that_they              16174
say_that               10826
that_have               9903
will_not                9714
which_have              9100
which_they              8920
can_not                 8093
his_own                 7857
that_which              7509
have_not                7147
they_shall              6992
who_have                6817
they_will               6587
if_they                 6351
not_only                6167
when_they               6073
they_that               5962
they_may                5653
will_have               5600


In [23]:
# looking for specific bigrams based on a word of interest
searchword = 'health'

searchbigrams = {}
for key in dict(bgcount).keys():
    if key.split('_')[0] == searchword or key.split('_')[-1] == searchword:
        #print (key, dict(bgcount)[key])
        searchbigrams[key] = dict(bgcount)[key]

#nice printing, ordered by frequency
print(pd.DataFrame(searchbigrams.values(), index=searchbigrams.keys(), columns=['frequency']).sort_values('frequency', ascending=False).head(20))

                     frequency
his_health                  93
perfect_health              66
health_but                  54
good_health                 53
their_health                50
soul_health                 42
health_his                  32
her_health                  27
former_health               27
health_that                 26
health_their                25
health_which                22
health_the_body             21
health_they                 20
restore_health              20
our_health                  20
your_health                 17
recover_health              17
health_and_strength         17
life_and_health             16


In [18]:
# printing out context windows for a given bigram, double check which dataset used for bigram generation (nostop or stop)

# add spaces before and after bigram if you are looking for two very specific words, e.g. "angel men" and not "angel mentions"
searchbigram = 'unjust sale'

# accounting for flipped instances of bigrams
flipsearch = searchbigram.spli

names = []
# for root, dirs, files in os.walk(dir):
#         for file in files:
#             if file.endswith('.txt'):            
#                 name = file.split('.')[0]
#                 path = os.path.join(root,file)
#                 f = open(path,'r')
#                 text = f.readlines()[0]
#                 f.close()
for text in bigramtexts:
    if (searchbigram in text) or (flipsearch in text):
        name = bigramnames[bigramtexts.index(text)]
        names.append(name)
        indices = [i for i in range(len(text)) if text.startswith(searchbigram, i)]
        for index in indices:
            print(name+':', text[(index-100):(index+100)])        

print(names)           

A02495: e admiral come and remember the very first shoot the discharge shoot little above the belly whereby make unserviceable for good while after without touch any other for that night yet mean honest true 
A03149: towards the end the reign stephen they lay level with the ground and the few which remain dismantle make unserviceable this care take disable the lord commons home but for keep the seacoast from forei
A07834: land extreme weather and winter camp where they have mean refresh they begin die and will have lose make unserviceable if this course have not take hearten they this day and for many day after diverse
A10357: ad the footman enforce lift their bow and arrow and dart over their head keep they from moisten and make unserviceable the water but true and understand say homer the mind man ever affect god will the
A10357: pon any angle shall force they give ground and fall back upon their next fellow which many entangle make unserviceable lose force they they may easy because