Analysing the vocabulary of texts at the individual word level. See below for word-frequency clouds (with numerical counts), TF-IDF scores, and n-grams (bigrams and trigrams) :)))

code references:
 - https://earlyprint.org/jupyterbook/tf_idf.html 
 - https://www.machinelearningplus.com/nlp/gensim-tutorial/#10howtocreatebigramsandtrigramsusingphrasermodels 
 - https://www.markhneedham.com/blog/2015/02/12/pythongensim-creating-bigrams-over-how-i-met-your-mother-transcripts/
 - https://towardsdatascience.com/generate-meaningful-word-clouds-in-python-5b85f5668eeb

In [2]:
#setup

from collections import Counter, defaultdict
import os
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from gensim.models import Phrases, phrases

def gettexts(folder, searchList=()):
    """Load the texts stored in *folder*, both tokenised and as whole strings.

    Only the first line of each file is read. A file whose name contains an
    underscore is treated as one piece of a larger text: the part of the
    filename before the first underscore is the text name, and all pieces
    sharing that name are joined with a single space. Files are visited in
    sorted filename order so that pieces are joined (and texts returned) in
    a reproducible order; ``os.listdir`` alone gives an arbitrary order.

    Args:
        folder: path of the directory holding the text files.
        searchList: text names to skip. Defaults to an empty tuple, i.e.
            nothing is skipped. ``None`` is treated the same way.

    Returns:
        A three-element list ``[tokenized, texts, textnames]``:
        index 0 is a list of token lists (each text stripped and split on
        single spaces), index 1 is the list of texts as continuous strings,
        index 2 is the list of text names, all in the same order.
        Texts from plain files come first, followed by the texts assembled
        from underscore-named pieces.
    """
    excluded = searchList if searchList is not None else ()
    # list of texts as a continuous string
    texts = []
    # text name for each entry of `texts`
    textnames = []
    # text name -> joined pieces, for filenames containing an underscore
    underscores = {}
    for file in sorted(os.listdir(folder)):
        path = os.path.join(folder, file)
        # skip sub-directories (e.g. notebook checkpoint folders)
        if not os.path.isfile(path):
            continue
        # accounting for underscores in EP filenames
        is_piece = '_' in file
        name = file.split('_')[0] if is_piece else file.split('.')[0]
        # decide before opening, so skipped files never leave a handle open
        if name in excluded:
            continue
        with open(path, 'r') as f:
            # first line only; readline() yields '' for an empty file
            data = f.readline()
        if is_piece:
            if name in underscores:
                underscores[name] = underscores[name] + ' ' + data
            else:
                underscores[name] = data
        else:
            texts.append(data)
            textnames.append(name)
    for name, text in underscores.items():
        texts.append(text)
        textnames.append(name)
    # list of lists of strings: each text tokenised by white space
    tokenized = [text.strip().split(' ') for text in texts]
    return [tokenized, texts, textnames]
    #i.e. index 0 gives list of tokens, 1 gives list of texts as one string, 2 gives list textnames

Wordclouds generated through term frequency

In [None]:
# term frequency & word clouds through wordcloud processing
# gettexts() takes the list of text names to skip as its second argument;
# an empty list is passed explicitly so that every text in the folder is loaded.
wcdata = gettexts('/srv/data/EPTuningReplaced', [])
# index 0: token lists, 1: texts as single strings, 2: text names
wctokens, wctexts, wcnames = wcdata
# name of the text used by the single-text term-frequency cells
fileTF = "A04813"

In [None]:
# Word cloud built directly from raw text by the wordcloud package.
#
# single text instead of the corpus:
#   textstring = wctexts[wcnames.index(fileTF)].lower()
# whole collection of texts (used here): join everything with spaces
wholecorpusstring = ' '.join(wctexts)

# parameters to play with: min_word_length, collocations, collocation_threshold, stopwords
# single text:
#   wordcloud = WordCloud(stopwords=STOPWORDS, collocations=True, min_word_length=3).generate(textstring)
cloud_options = dict(
    stopwords=STOPWORDS,
    collocations=True,
    collocation_threshold=20,
    min_word_length=4,
)
wordcloud = WordCloud(**cloud_options).generate(wholecorpusstring)

# draw the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# raw counts from the same processing the cloud uses
# single text:
#   textdict = wordcloud.process_text(textstring)
textdict = wordcloud.process_text(wholecorpusstring)

# absolute counts ordered from most to least frequent
wordfreq = dict(sorted(textdict.items(), key=lambda pair: pair[1], reverse=True))
# relative frequencies as stored on the fitted cloud
relfreq = wordcloud.words_

# quick-and-dirty printing (not used, doesn't print nicely):
#   N = 40
#   print("word frequencies:", list(wordfreq.items())[:N])
#   print("relative frequencies:", list(relfreq.items())[:N])

In [None]:
# Print absolute and relative frequencies side by side.

# merge both dictionaries: word -> [absolute frequency, relative frequency]
# (a word missing from the second dictionary gets a one-element list)
result = defaultdict(list)
for table_of_scores in (wordfreq, relfreq):
    for word, score in table_of_scores.items():
        result[word].append(score)

headers = ('absolute frequency', 'relative frequency')
freq_frame = pd.DataFrame(result.values(), index=result.keys(), columns=headers)
# top 20 rows, in the insertion order of `result`
print(freq_frame.head(n=20))

In [None]:
# Manual term frequency: count unigrams, bigrams and trigrams per text.
count = CountVectorizer(ngram_range=(1, 3))
# dense document-term matrix (one row per text)
X = count.fit_transform(wctexts).toarray()
# rows labelled by text name, columns by n-gram
dataframe = pd.DataFrame(
    X,
    index=list(wcnames),
    columns=count.get_feature_names_out(),
)

In [None]:
# Word cloud for one text, drawn from the manual term frequencies above.

# counts for the chosen text, highest first
ranked_counts = dataframe.loc[fileTF].sort_values(ascending=False)
topstrings = ranked_counts[:4000]
textdict = ranked_counts.to_dict()

wordcloud2 = WordCloud(min_word_length=3)
wordcloud2.generate_from_frequencies(textdict)

# draw the cloud without axes
plt.imshow(wordcloud2, interpolation='bilinear')
plt.axis('off')
plt.show()

TF-IDF analysis: looking at a matrix to compare all texts, extracting TF-IDF scores of a single text, and generating wordclouds

In [None]:
# gettexts() takes the list of text names to skip as its second argument;
# an empty list is passed explicitly so that every text in the folder is loaded.
tfidfdata = gettexts('/srv/data/EPTuningReplaced', [])
# index 0: token lists, 1: texts as single strings, 2: text names
tfidftokens, tfidftexts, tfidfnames = tfidfdata

# setting a text to sort by for TF-IDF analysis
basetext = 'A01092'

# load wordcounts onto dataframe: one row per text, one column per token,
# with 0 where a token does not occur in a text
wordcounts = [Counter(t) for t in tfidftokens]
df = pd.DataFrame(wordcounts, index=list(tfidfnames)).fillna(0)

In [None]:
# Compare TF-IDF scores across all texts with TfidfTransformer.

# norm=None: normalization turned off
# sublinear_tf=True: term frequencies are log-scaled, which can help to
# de-emphasize function words like pronouns and articles
tfidf = TfidfTransformer(sublinear_tf=True, norm=None)
results = tfidf.fit_transform(df)
table = pd.DataFrame(results.toarray(), index=df.index, columns=df.columns)

# after transposing, columns are texts and rows are terms;
# show the 25 terms scoring highest in `basetext`
terms_by_text = table.T
terms_by_text.sort_values(by=[basetext], ascending=False).head(25)

In [None]:
# Same transformer approach, but showing the TF-IDF values of one text only
# (easier to read than the full table).

cv = CountVectorizer()
wc = cv.fit_transform(tfidftexts)
transformer = TfidfTransformer(norm=None, sublinear_tf=True, use_idf=True)
wctrans = transformer.fit_transform(wc)

# pick the row belonging to `basetext` and turn it into a one-column frame
score_column = basetext + " TF-IDF"
base_row = wctrans[tfidfnames.index(basetext)]
single = pd.DataFrame(
    base_row.T.todense(),
    index=cv.get_feature_names_out(),
    columns=[score_column],
)
single = single.sort_values(score_column, ascending=False)

print(single.head(25))

In [None]:
# TF-IDF word cloud: the scores are fed in directly, since the wordcloud
# package's own text processing cannot be used for TF-IDF values.

tfidf_scores = single[basetext + ' TF-IDF'].to_dict()
tfidfcloud = WordCloud(min_word_length=3)
tfidfcloud.generate_from_frequencies(tfidf_scores)

# draw the cloud without axes
plt.imshow(tfidfcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

Bigram generation: defining the training/testing corpus, generating common bigrams (sorted by descending frequency), searching for bigrams that contain key terms of interest, and generating context windows to clarify specific bigrams.

In [3]:
#change for each time period: index = period - 1
# read the chosen line of the time-range file, closing the file afterwards
with open('/srv/data/timeranges.txt', 'r') as rangefile:
    period = rangefile.readlines()[1]
# the line looks like a printed Python list: drop brackets and quotes,
# then split on ', ' to get the individual text names
period = period.strip().strip('[').strip(']').replace("'", '').split(', ')
empty = []

# NOTE(review): gettexts skips every name found in its second argument, so
# the names in `period` are excluded from the loaded texts - confirm intended.
bigramdata = gettexts('/srv/data/relevantEPBodyNOSTOP', period)
# index 0: token lists, 1: texts as single strings, 2: text names
bigramtokens, bigramtexts, bigramnames = bigramdata

In [None]:
#splitting texts for training/testing by index

#given in list of list of strings
training = []
#given list of strings (all test texts flattened into one token sequence)
testing = []
# number of texts that went into the testing set
testlen = 0

#for specific text
# testtext = 'A02495'

# enumerate gives each text's own position; looking the position up with
# bigramtokens.index(t) is quadratic and returns the FIRST equal token list,
# which mis-assigns texts whose tokens are identical to an earlier text.
for position, t in enumerate(bigramtokens):
    #for 50:50 splitting (use `position % 2 == 1` to swap the halves)
    in_training = position % 2 == 0

    #for running period-specific training/testing
    # in_training = bigramnames[position] not in testnames

    #text-specific
    # in_training = bigramnames[position] != testtext

    if in_training:
        training.append(t)
    else:
        testlen += 1
        testing.extend(t)

print(len(training))
print(testlen)

In [None]:
# Generating bigrams (can take a bit of time).

# Training the bigram model. Parameters include min_count, threshold
# (from -1 to 1 for npmi scoring) and connector words, which allow longer,
# informative n-grams (e.g. 'trade and traffic').
bigrammodel = Phrases(
    training,
    min_count=1,
    threshold=-0.5,
    scoring='npmi',
    connector_words=phrases.ENGLISH_CONNECTOR_WORDS,
)

# Count the joined phrases found in the testing tokens: a token containing
# an underscore is one the model has merged.
bgcount = Counter(token for token in bigrammodel[testing] if "_" in token)

# printing top 20 most common bigrams
bigram_table = pd.DataFrame(bgcount.values(), index=bgcount.keys(), columns=['bigram frequency'])
print(bigram_table.sort_values('bigram frequency', ascending=False).head(n=20))

In [None]:
# Looking for specific bigrams based on a word of interest.
# NOTE(review): spelled 'tabacco' here but 'tobacco' in the trigram search -
# confirm which spelling is intended for this corpus.
searchword = 'tabacco'

# To also write the matches to a text file, open it here, e.g.
#   bruh = '/srv/data/joy/'+'/bigrams.txt'
#   bgoutfile = open(bruh,'a+')
# write `key + '\n'` for every match, and close the file afterwards.

# keep every counted bigram that has the search word as one of its parts
# (alternative: match only the first or last part of the bigram)
searchbigrams = {
    gram: hits
    for gram, hits in bgcount.items()
    if searchword in gram.split('_')
}

#nice printing, ordered by frequency
match_table = pd.DataFrame(searchbigrams.values(), index=searchbigrams.keys(), columns=['frequency'])
print(match_table.sort_values('frequency', ascending=False).head(20))

In [None]:
# Trigrams: a second Phrases pass over the bigram-joined training texts.

trigrammodel = Phrases(
    bigrammodel[training],
    min_count=3,
    threshold=-0.5,
    scoring='npmi',
    connector_words=phrases.ENGLISH_CONNECTOR_WORDS,
)
# keep tokens made of at least three underscore-joined parts
tgcount = Counter(token for token in trigrammodel[testing] if token.count("_") >= 2)

# printing top 20 most common trigrams
trigram_table = pd.DataFrame(tgcount.values(), index=tgcount.keys(), columns=['trigram frequency'])
print(trigram_table.sort_values('trigram frequency', ascending=False).head(n=20))


In [None]:
# searching for terms of interest in trigrams

# looking for specific trigrams based on a word of interest
searchword = 'tobacco'

#for outputting to txt file, specify here
scream = '/srv/data/joy/trigrams.txt'

searchtrigrams = {}
# the with-block closes the file, so the appended lines are flushed to disk
with open(scream, 'a+') as tgoutfile:
    for key, frequency in tgcount.items():
        # match the word in ANY position: phrases joined through connector
        # words can have more than three parts, so checking only the first,
        # second and last part would miss some of them
        if searchword in key.split('_'):
            #writing to text file
            tgoutfile.write(key + '\n')

            searchtrigrams[key] = frequency

#nice printing, ordered by frequency
print(pd.DataFrame(searchtrigrams.values(), index=searchtrigrams.keys(), columns=['frequency']).sort_values('frequency', ascending=False).head(20))

In [7]:
# printing out context windows for a given ngram, double check which dataset used for bigram generation (nostop or stop)
# if you need TCP context windows, use select.py to create a new folder, then grep the specific term

# add spaces before and after bigram if you are looking for two very specific words, e.g. "angel men" and not "angel mentions"
searchgram = 'oil of poppy'

# accounting for flipped instances of bigrams: build the reversed phrase with
#   flipsearch = searchgram.split(' ')[1]+' '+searchgram.split(' ')[0]
# and repeat the search below for it

# names of the texts that contain the search phrase
names = []
# pair each text with its own name; looking the name up via
# bigramtexts.index(text) returns the wrong name for duplicate texts
for name, text in zip(bigramnames, bigramtexts):
    if searchgram not in text:
        continue
    names.append(name)
    # walk through every occurrence, overlapping ones included
    index = text.find(searchgram)
    while index != -1:
        # 120 characters either side of the start of the match, clamped at
        # the beginning of the text (this also covers index == 120, which
        # previously matched neither branch)
        start = max(index - 120, 0)
        end = index + 120
        window = text[start:end].split(' ')
        # a cut inside the text usually lands mid-word: drop that fragment,
        # but keep the edge word when the window reaches the text boundary
        if start > 0:
            window = window[1:]
        if end < len(text):
            window = window[:-1]
        print(name+':', ' '.join(window))
        index = text.find(searchgram, index + 1)
print(names, len(names))

A64906: the back and have prove these oil good the oil of mardine the oil of alabaster and the oil of water lily hot cause the oil of poppy very good cold cause for scurf the body this infirmity come of choleric and melancholy humour for this
A02327: asmuch ounce of turpentine and asmuch rosin oil of olive pound and half pound of oil of bitter almond and asmuch oil of poppy white wax ounce black pitch three ounce melt your gum and heat your oil with the verdegrece and strain
A04936: store the second chapter sleep if the sick can not sleep anoint the fore part of the head with oil of water lily and oil of poppy they you may for need add little opium that sleep thereby may provoke note that box without scarification
A17310: nightshade opium compound liquid syrupe of poppy violet rose solid nicholai romanun laudanum paracelsi outward use oil of poppy violet rose mandrake nutmeg oderament of vinegar rose water opium frontal of rosecake rosevineger nutmeg
A17310: cap mania hildesheim spicel