Analysing the vocabulary of texts at the individual word level. See below for word-frequency clouds (with numerical counts), TF-IDF scores, and bigrams :)))

code references:
 - https://earlyprint.org/jupyterbook/tf_idf.html 
 - https://www.machinelearningplus.com/nlp/gensim-tutorial/#10howtocreatebigramsandtrigramsusingphrasermodels 
 - https://www.markhneedham.com/blog/2015/02/12/pythongensim-creating-bigrams-over-how-i-met-your-mother-transcripts/
 - https://towardsdatascience.com/generate-meaningful-word-clouds-in-python-5b85f5668eeb

In [1]:
# importing required things

from collections import Counter, defaultdict
import os
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from nltk import ngrams, BigramCollocationFinder
from gensim.models import Phrases

In [5]:
# setting up: load every text in the corpus folder, then tokenize each one

# getting texts
# texts[i] is the first line of the i-th file; textnames[i] is that file's
# name up to its first '.'
texts = []
textnames = []
folder = '/srv/data/sermonsOurTimeBody'
# os.listdir returns entries in arbitrary order; sorting keeps the order of
# the texts (and the even/odd training/testing split further down) the same
# from one run to the next
for file in sorted(os.listdir(folder)):
    path = os.path.join(folder, file)
    # the context manager closes the file even if reading raises
    with open(path, 'r') as f:
        # only the first line is used
        # NOTE(review): assumes each file stores its whole text on one line -- confirm
        # readline() gives '' for an empty file instead of raising IndexError
        data = f.readline()
    texts.append(data)
    name = file.split('.')[0]
    textnames.append(name)

# list of lists of strings, each text broken up into individual token strings
tokenized = []
for text in texts:
    # tokenize on single spaces (a run of several spaces yields '' tokens)
    words = text.strip().split(' ')
    tokenized.append(words)

Wordclouds generated through term frequency

In [None]:
# term frequency & word clouds, relying on wordcloud's own text processing

# name of the text to use when looking at a single text
fileTF = "A04813"

# for a single text, build the input string like this:
# NOTE(review): texts[...] is one string, so ' '.join(...) would put a space
# between every character -- confirm before re-enabling
# textstring = ' '.join(texts[textnames.index(fileTF)]).lower()
# for the whole collection, glue every text together with a space in between
wholecorpusstring = ' '.join(texts)

# parameters to play with: min_word_length, collocations, collocation_threshold, stopwords

# single text
# wordcloud = WordCloud(stopwords=STOPWORDS, collocations=True, min_word_length=3).generate(textstring)
# corpus
wordcloud = WordCloud(
    stopwords=STOPWORDS,
    collocations=True,
    collocation_threshold=20,
    min_word_length=4,
)
wordcloud.generate(wholecorpusstring)

# draw the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# single text
# textdict = wordcloud.process_text(textstring)
# corpus
textdict = wordcloud.process_text(wholecorpusstring)

# counts ordered from most to least frequent
wordfreq = dict(sorted(textdict.items(), key=lambda pair: pair[1], reverse=True))
# the frequencies the cloud itself stored after generate()
relfreq = wordcloud.words_

# not using this, doesn't print nicely
# N=40
# print("word frequencies:", list(wordfreq.items())[:N])
# print("relative frequencies:", list(relfreq.items())[:N])
        

In [None]:
# outputting the numbers for frequencies

# Combine word frequencies and relative frequencies into one row per word for
# cleaner printing. Each value is looked up by key and a missing value gets an
# explicit None, so every row has exactly two entries and a number can never
# end up under the wrong header.
# NOTE(review): relfreq presumably holds fewer words than wordfreq (WordCloud
# keeps at most max_words entries) -- confirm against the wordcloud docs
headers = ('absolute frequency', 'relative frequency')
result = {}
# wordfreq is already sorted by descending count, so its order drives the table
for key in wordfreq:
    result[key] = [wordfreq[key], relfreq.get(key)]
# keep any word that only the cloud knows about, with no absolute count
for key in relfreq:
    if key not in result:
        result[key] = [None, relfreq[key]]

print(pd.DataFrame(list(result.values()), index=list(result.keys()), columns=list(headers)).head(n=20))


In [None]:
# setting up manual term frequency

# count every 1-, 2- and 3-word sequence in each text
count = CountVectorizer(ngram_range=(1, 3))
# dense count matrix: one row per text, one column per n-gram
X = count.fit_transform(texts).toarray()
dataframe = pd.DataFrame(
    X,
    index=list(textnames),
    columns=count.get_feature_names_out(),
)


In [None]:
# word cloud generation from the n-gram counts computed above

# counts for the chosen text, largest first
ranked = dataframe.loc[fileTF].sort_values(ascending=False)
# the 4000 most frequent n-grams, kept for inspection
topstrings = ranked[:4000]
# n-gram -> count mapping handed to the cloud
textdict = ranked.to_dict()

# NOTE(review): min_word_length may only take effect in wordcloud's own text
# processing, not in generate_from_frequencies -- confirm
wordcloud2 = WordCloud(min_word_length=3)
wordcloud2.generate_from_frequencies(textdict)

plt.imshow(wordcloud2, interpolation='bilinear')
plt.axis('off')
plt.show()

TF-IDF analysis: looking at a matrix to compare all texts, extracting TF-IDF scores of a single text, and generating wordclouds

In [None]:
# load word counts onto a dataframe: one row per text, one column per token
wordcounts = list(map(Counter, tokenized))
# a token that never occurs in a text gets a count of 0 instead of NaN
df = pd.DataFrame(wordcounts, index=list(textnames))
df = df.fillna(0)

# the text whose scores the TF-IDF tables are sorted by
basetext = 'A01092'

In [None]:
# using the transformer, build a table to compare tf-idf scores across all texts

# norm=None: normalization turned off
# sublinear_tf=True: sublinear term frequency scaling turned on (takes log of
# term frequencies and can help to de-emphasize function words like pronouns
# and articles)
tfidf = TfidfTransformer(norm=None, sublinear_tf=True)
results = tfidf.fit_transform(df)

# same row and column labels as the count dataframe, scores instead of counts
table = pd.DataFrame(results.toarray(), index=df.index, columns=df.columns)

# after transposing, columns are texts and rows are terms
# rank terms by their score in basetext and show the top 25
by_term = table.T
by_term.sort_values(by=[basetext], ascending=False).head(25)

In [None]:
# transformer version again, but showing the tf-idf values of one text only,
# which is easier to read

# counts come from CountVectorizer's own tokenization of the raw texts here
cv = CountVectorizer()
wc = cv.fit_transform(texts)
transformer = TfidfTransformer(norm=None, sublinear_tf=True, use_idf=True)
wctrans = transformer.fit_transform(wc)

# column label for the scores of the chosen text
column = f"{basetext} TF-IDF"
# row of the score matrix that belongs to basetext
row = wctrans[textnames.index(basetext)]
# one row per vocabulary term, highest score first
single = pd.DataFrame(row.T.todense(), index=cv.get_feature_names_out(), columns=[column])
single = single.sort_values(column, ascending=False)

print(single.head(25))

In [None]:
# tf-idf wordclouds - wordcloud's own text processing cannot be used here (rip),
# so the cloud is built straight from the term -> score mapping

scores = single[basetext + ' TF-IDF'].to_dict()
tfidfcloud = WordCloud(min_word_length=3)
tfidfcloud.generate_from_frequencies(scores)

plt.imshow(tfidfcloud, interpolation='bilinear')
plt.axis('off')
plt.show()



Bigram generation: designating training and testing corpora, then generating common bigrams sorted by descending frequency

In [8]:
# splitting the texts by position: every text at an even index goes into the
# training set, every text at an odd index into the testing set

# training keeps one token list per text; testing is a single flat token list
training = []
testing = []
# enumerate gives each text its own position directly; looking the position up
# with tokenized.index(t) rescans the list on every iteration and returns the
# first match, so a duplicated text would always follow its first occurrence
for position, words in enumerate(tokenized):
    if position % 2 == 0:
        training.append(words)
    else:
        testing.extend(words)

In [None]:
# generating bigrams

# training bigram model: parameters to play with incl min count, threshold, scoring (npmi = more robust?)
bigrammodel = Phrases(training, min_count=3, threshold=-0.5, scoring='npmi')

# getting the frequency(?) of bigrams within test
# only tokens containing "_" are counted, i.e. tokens the filter treats as joined
bgcount = Counter(token for token in bigrammodel[testing] if "_" in token)

# printing top 20 most common bigrams
bigramtable = pd.DataFrame(
    list(bgcount.values()),
    index=list(bgcount.keys()),
    columns=['bigram frequency'],
)
print(bigramtable.sort_values('bigram frequency', ascending=False).head(n=20))

In [None]:
# looking for specific bigrams based on a word of interest
searchword = 'men'

searchbigrams = {}
# print every counted bigram whose first or last part is the search word
for key, frequency in bgcount.items():
    parts = key.split('_')
    if searchword in (parts[0], parts[-1]):
        print(key, frequency)
        #searchbigrams[key] = dict(bgcount)[key]

#nice printing, ordered by frequency
#print(pd.DataFrame(searchbigrams.values(), index=searchbigrams.keys(), columns=['frequency']).sort_values('frequency', ascending=False).head(20))