Analysing the vocabulary of texts at the individual word level. See below for word-frequency clouds (with numerical counts), TF-IDF scores, and n-grams (bigrams and trigrams).

code references:
 - https://earlyprint.org/jupyterbook/tf_idf.html 
 - https://www.machinelearningplus.com/nlp/gensim-tutorial/#10howtocreatebigramsandtrigramsusingphrasermodels 
 - https://www.markhneedham.com/blog/2015/02/12/pythongensim-creating-bigrams-over-how-i-met-your-mother-transcripts/
 - https://towardsdatascience.com/generate-meaningful-word-clouds-in-python-5b85f5668eeb

In [3]:
# importing required things

from collections import Counter, defaultdict
import os
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from nltk import ngrams, BigramCollocationFinder
from gensim.models import Phrases, phrases

def gettexts(folder):
    """Load the first line of every file in *folder* and tokenize it.

    Only the first line of each file is read (presumably each corpus file
    holds its whole text on one line -- TODO confirm). Tokens are produced by
    stripping surrounding whitespace and splitting on single spaces.

    Args:
        folder: path of the directory holding the text files.

    Returns:
        A three-element list ``[tokenized, texts, textnames]``:
        index 0 is a list of token lists (one list of strings per text),
        index 1 is the list of texts, each as one string, and index 2 is the
        list of text names (the file name up to its first ``.``). All three
        share the order in which ``os.listdir`` yields the files.
    """
    texts = []
    textnames = []
    for file in os.listdir(folder):
        path = os.path.join(folder, file)
        # the context manager closes the handle even if reading raises
        with open(path, 'r') as f:
            # readline() yields '' for an empty file, where readlines()[0]
            # would raise IndexError; it also avoids loading the whole file
            data = f.readline()
        texts.append(data)
        textnames.append(file.split('.')[0])
    # list of lists of strings: each text broken up into individual tokens,
    # tokenized by white space
    tokenized = [text.strip().split(' ') for text in texts]
    return [tokenized, texts, textnames]

Wordclouds generated through term frequency

In [None]:
# term frequency & word clouds through wordcloud processing: load the corpus

# gettexts returns [token lists, texts as single strings, text names]
wcdata = gettexts('/srv/data/EPTuningReplaced')
wctokens, wctexts, wcnames = wcdata

# name of the single text to look at when not working on the whole corpus
fileTF = "A04813"

In [None]:
# Input string for the cloud.
# single text (each entry of wctexts is already one string, so no join needed):
# textstring = wctexts[wcnames.index(fileTF)].lower()
# collection of texts:
wholecorpusstring = ' '.join(wctexts)

# parameters to play with: min_word_length, collocations, collocation_threshold, stopwords
cloudsettings = dict(stopwords=STOPWORDS, collocations=True, collocation_threshold=20, min_word_length=4)

# single text:
# wordcloud = WordCloud(stopwords=STOPWORDS, collocations=True, min_word_length=3).generate(textstring)
# corpus:
wordcloud = WordCloud(**cloudsettings)
wordcloud.generate(wholecorpusstring)

# draw the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# counts as computed by the cloud's own text processing
# single text:
# textdict = wordcloud.process_text(textstring)
# corpus:
textdict = wordcloud.process_text(wholecorpusstring)

# absolute counts ordered from most to least frequent
wordfreq = dict(sorted(textdict.items(), key=lambda pair: pair[1], reverse=True))
# relative frequencies as stored on the generated cloud
relfreq = wordcloud.words_

# not using this, doesn't print nicely
# N=40
# print("word frequencies:", list(wordfreq.items())[:N])
# print("relative frequencies:", list(relfreq.items())[:N])
        

In [None]:
# outputting the numbers for frequencies

# Merge both measures into one dictionary for cleaner printing: every word maps
# to a list that receives its absolute count first, then its relative
# frequency. A word present in only one of the two dicts gets a single value.
result = defaultdict(list)
for key, value in wordfreq.items():
    result[key].append(value)
for key, value in relfreq.items():
    result[key].append(value)
headers = ('absolute frequency', 'relative frequency')

freqtable = pd.DataFrame(result.values(), index=result.keys(), columns=headers)
print(freqtable.head(n=20))


In [None]:
# setting up manual term frequency

# count every unigram, bigram and trigram in each text
count = CountVectorizer(ngram_range=(1, 3))
# dense count matrix: one row per text, one column per n-gram
X = count.fit_transform(wctexts).toarray()
dataframe = pd.DataFrame(
    X,
    index=list(wcnames),
    columns=count.get_feature_names_out(),
)


In [None]:
# word cloud generation through the term frequencies above

# n-gram counts of the chosen text, highest first (sorted once, used twice)
rankedcounts = dataframe.loc[fileTF].sort_values(ascending=False)
topstrings = rankedcounts[:4000]
textdict = rankedcounts.to_dict()

# NOTE(review): min_word_length looks like it is only applied when the cloud
# processes raw text, not with generate_from_frequencies -- confirm
wordcloud2 = WordCloud(min_word_length=3)
wordcloud2.generate_from_frequencies(textdict)

# draw the cloud without axes
plt.imshow(wordcloud2, interpolation='bilinear')
plt.axis('off')
plt.show()

TF-IDF analysis: looking at a matrix to compare all texts, extracting TF-IDF scores of a single text, and generating wordclouds

In [4]:
# gettexts returns [token lists, texts as single strings, text names]
tfidfdata = gettexts('/srv/data/EPTuningReplaced')
tfidftokens, tfidftexts, tfidfnames = tfidfdata

# setting a text to sort by for TF-IDF analysis
basetext = 'A01092'

# load wordcounts onto dataframe: one row per text, one column per token,
# with 0 where a token does not occur in a text
wordcounts = list(map(Counter, tfidftokens))
df = pd.DataFrame(wordcounts, index=list(tfidfnames))
df = df.fillna(0)

In [None]:
# using transformer, generate table to compare tf-idfs across multiple texts

# norm=None: normalization turned off
# sublinear_tf=True: sublinear term frequency scaling turned on (takes log of
# term frequencies and can help to de-emphasize function words like pronouns
# and articles)
tfidf = TfidfTransformer(sublinear_tf=True, norm=None)
results = tfidf.fit_transform(df)

# same row/column labels as the count table, tf-idf scores as values
table = pd.DataFrame(results.toarray(), columns=df.columns, index=df.index)

# after transposing, columns are texts and rows are terms;
# sort by the highest tf-idf scores in the specified basetext as an example
# and show the top 25 terms
termsbytext = table.T
termsbytext.sort_values(by=[basetext], ascending=False).head(25)

In [None]:
# transformer version, but outputting tf-idf values for a single text, easier viewing

transformer = TfidfTransformer(norm=None, sublinear_tf=True, use_idf=True)
cv = CountVectorizer()
wc = cv.fit_transform(tfidftexts)
wctrans = transformer.fit_transform(wc)

# column label for the scores of the chosen text
scorecolumn = basetext + " TF-IDF"
# row of the tf-idf matrix belonging to basetext
baserow = wctrans[tfidfnames.index(basetext)]
single = pd.DataFrame(
    baserow.T.todense(),
    index=cv.get_feature_names_out(),
    columns=[scorecolumn],
)
# highest scores first
single = single.sort_values(scorecolumn, ascending=False)

print (single.head(25))

In [None]:
# tf-idf wordclouds - cannot just use wordcloud processing (rip)

# term -> tf-idf score for the chosen text, fed to the cloud as "frequencies"
tfidfscores = single[basetext + ' TF-IDF'].to_dict()

tfidfcloud = WordCloud(min_word_length=3)
tfidfcloud.generate_from_frequencies(tfidfscores)

# draw the cloud without axes
plt.imshow(tfidfcloud, interpolation='bilinear')
plt.axis('off')
plt.show()



Bigram generation: designating the training and testing corpora, then generating common bigrams sorted by descending frequency

In [6]:
# gettexts returns [token lists, texts as single strings, text names]
bigramdata = gettexts('/srv/data/targetCorpusNOSTOP')
bigramtokens, bigramtexts, bigramnames = bigramdata

In [None]:
testnames = ['A01567', 'A03862', 'A09179', 'A00924', 'A09213', 'A08548', 'A00472', 'A04463', 'A00831', 'A00933', 'A05123', 'A02731', 'A08426', 'A02129', 'A02778', 'A01299', 'A04936', 'A01309', 'A19770', 'A13091', 'A10352', 'A17698', 'A19994', 'A15042', 'A18320', 'A10574', 'A19533', 'A18098', 'A18319', 'A19743', 'A13086', 'A12533', 'A11561', 'A20370', 'A67927', 'A68845', 'A67922', 'A69118', 'A67926', 'B00136', 'A06986', 'A00253', 'A08599', 'A00482', 'A00282', 'A07033', 'A07415', 'A03125', 'A01568', 'A08200', 'A08816', 'A01712', 'A07032', 'A17891', 'A18162', 'A17643', 'A10112', 'A13877', 'A19422', 'A19849', 'A13407', 'A15706', 'A13837', 'A17037', 'A11255', 'A14350', 'A68314', 'A68438', 'A68798', 'A72378', 'B00420', 'B01022', 'B07173', 'B07675', 'A06400', 'A08281', 'A09513', 'A09757', 'A06629', 'A07432', 'A04556', 'A07828', 'A02777', 'A07447', 'A07482', 'A06202', 'A01683', 'A02182', 'A07523', 'A04243', 'A02402', 'A06339', 'A04334', 'A06157', 'A09044', 'A16976', 'A14783', 'A18926', 'A11225', 'A11237', 'A16730', 'A16954', 'A16813', 'A12001', 'A16956', 'A16729', 'A13798', 'A19790', 'A19745', 'A16804', 'A11994', 'A16193', 'A12010', 'A14028', 'A13819', 'A20689', 'A22560', 'A20814', 'A20888', 'A21132', 'A69185', 'A69227', 'A02319', 'A07742', 'A06216', 'A07101', 'A09675', 'A09632', 'A03763', 'A09677', 'A01711', 'A01739', 'A07683', 'A05054', 'A06118', 'A09176', 'A01450', 'A08137', 'A05236', 'A03441', 'A06108', 'A05220', 'A04973', 'A07278', 'A04506', 'A09949', 'A01165', 'A08087', 'A09575', 'A09579', 'A07104', 'A06891', 'A12703', 'A17069', 'A18015', 'A12676', 'A13318', 'A16381', 'A13314', 'A15346', 'A18327', 'A18909', 'A22696', 'A69045', 'A73589', 'B08106', 'B07183', 'B12583', 'A07380', 'A03925', 'A02364', 'A06447', 'A01955', 'A08550', 'A07859', 'A09604', 'A07648', 'A03032', 'A05562', 'A02237', 'A02796', 'A05458', 'A07078', 'A08108', 'A03513', 'A08307', 'A02604', 'A05569', 'A09528', 'A06448', 'A07858', 'A02168', 'A03109', 'A06993', 'A09453', 'A06431', 'A02181', 'A00991', 
'A00566', 'A06684', 'A07075', 'A04863', 'A18924', 'A10809', 'A13070', 'A16636', 'A16974', 'A13797', 'A18261', 'A11966', 'A18402', 'A14160', 'A11096', 'A10956', 'A11952', 'A11826', 'A17324', 'A17048', 'A11081', 'A17054', 'A13049', 'A16304', 'A17957', 'A13370', 'A15427', 'A10418', 'A13333', 'A20408', 'A21003', 'A68726', 'A69175', 'A07650', 'A01160', 'A00206', 'A08870', 'A01725', 'A08106', 'A04028', 'A04827', 'A08067', 'A05051', 'A03764', 'A00309', 'A08551', 'A08107', 'A14806', 'A13025', 'A15336', 'A10710', 'A14109', 'A19746', 'A18588', 'A16792', 'A14936', 'A18965', 'A14963', 'A13794', 'A17812', 'A11416', 'A16164', 'A11048', 'A10861', 'A10310', 'A16170', 'A68463', 'A68091', 'A72844', 'B07996', 'B07975', 'B07157', 'A09233', 'A08059', 'A05036', 'A02655', 'A02167', 'A03468', 'A02995', 'A09086', 'A00748', 'A00627', 'A04372', 'A02110', 'A06137', 'A02983', 'A07166', 'A09466', 'A06800', 'A01730', 'A05037', 'A02783', 'A00702', 'A00641', 'A01736', 'A02111', 'A09137', 'A02973', 'A06134', 'A09382', 'A07004', 'A16964', 'A14275', 'A11862', 'A15292', 'A16987', 'A16196', 'A15980', 'A10968', 'A13085', 'A13320', 'A19863', 'A17331', 'A10767', 'A16509', 'A10803', 'A12567', 'A19934', 'A11493', 'A16643', 'A15760', 'A13377', 'A12399', 'A19864', 'A14823', 'A14807', 'A15517', 'A15466', 'A14850', 'A12376', 'A19895', 'A18993', 'A14825', 'A12777', 'A12229', 'A19440', 'A20438', 'A20435', 'A20909', 'A68146', 'A68340', 'A04966', 'A00009', 'A09299', 'A01678', 'A01154', 'A08134', 'A09211', 'A03581', 'A08919', 'A02993', 'A06135', 'A02972', 'A00553', 'A08058', 'A07722', 'A02672', 'A01151', 'A03571', 'A03089', 'A01942', 'A07876', 'A19218', 'A10299', 'A10295', 'A11026', 'A17372', 'A14164', 'A12988', 'A14255', 'A19800', 'A19056', 'A13341', 'A18521', 'A16443', 'A17918', 'A10609', 'A19901', 'A14254', 'A18995', 'A11218', 'A14281', 'A20037', 'A22030', 'A68221', 'A68566', 'A68254', 'B00367', 'B07409', 'B00728', 'B12393', 'A04942', 'A01228', 'A09088', 'A05576', 'A09610', 'A02079', 'A01162', 'A02792', 'A00635', 
'A04564', 'A06621', 'A02092', 'A08006', 'A00800', 'A09362', 'A06619', 'A00477', 'A02159', 'A02166', 'A06165', 'A02121', 'A00734', 'A08011', 'A00710', 'A02428', 'A03345', 'A02140', 'A00730', 'A02103', 'A02160', 'A06767', 'A06173', 'A02154', 'A05277', 'A18740', 'A19241', 'A12601', 'A18733', 'A15422', 'A16102', 'A11842', 'A13375', 'A12373', 'A12577', 'A13576', 'A15801', 'A19223', 'A15519', 'A19165', 'A17322', 'A19862', 'A12381', 'A16955', 'A16778', 'A13094', 'A10697', 'A12578', 'A13700', 'A14585', 'A13104', 'A20596', 'A20951', 'A68693', 'A68113', 'A68139', 'A68661', 'A68747', 'A68000', 'A68409', 'A05311', 'A03576', 'A00185', 'A02991', 'A03481', 'A08940', 'A03006', 'A06855', 'A01157', 'A08013', 'A05363', 'A01147', 'A01171', 'A07874', 'A06881', 'A08105', 'A17696', 'A13363', 'A13156', 'A12949', 'A14108', 'A13578', 'A19395', 'A20105', 'A22294', 'A69066', 'A68347', 'A68144', 'A68165', 'A73175', 'A72503', 'A72085', 'B07987', 'A02997', 'A06185', 'A09221', 'A04673', 'A05335', 'A09581', 'A01101', 'A04989', 'A04736', 'A07912', 'A09307', 'A07003', 'A05583', 'A03472', 'A08014', 'A03771', 'A06083', 'A07023', 'A03833', 'A01500', 'A06184', 'A03828', 'A04888', 'A02975', 'A08015', 'A07018', 'A08308', 'A02127', 'A02044', 'A02623', 'A06622', 'A01738', 'A02125', 'A08434', 'A05099', 'A17002', 'A19816', 'A12017', 'A18422', 'A16221', 'A10612', 'A13386', 'A10730', 'A18994', 'A12035', 'A17321', 'A15487', 'A14821', 'A15515', 'A18948', 'A19738', 'A12389', 'A13358', 'A16975', 'A17326', 'A12568', 'A20824', 'A20826', 'A20834', 'A20406', 'A20686', 'A72913', 'A06512', 'A00410', 'A08788', 'A02256', 'A05075', 'A05404', 'A01682', 'A08085', 'A02992', 'A03123', 'A13062', 'A19447', 'A12705', 'A13756', 'A14282', 'A19799', 'A14450', 'A18690', 'A13276', 'A22690', 'A68924', 'A72769', 'A73201', 'A73880', 'B00466', 'B01207', 'B13519', 'A04483', 'A08620', 'A09649', 'A07892', 'A00703', 'A05237', 'A09100', 'A08610', 'A01951', 'A01727', 'A05297', 'A07881', 'A00940', 'A07901', 'A07891', 'A06476', 'A09062', 'A07899', 
'A06775', 'A01196', 'A08629', 'A08922', 'A18003', 'A17870', 'A14822', 'A14611', 'A10724', 'A17393', 'A15818', 'A16049', 'A13064', 'A15034', 'A10793', 'A17486', 'A13122', 'A17869', 'A15043', 'A15817', 'A14485', 'A20229', 'A68555', 'A68413', 'A69006', 'A72174', 'A06436', 'A01731', 'A00222', 'A00473', 'A04997', 'A09876', 'A06347', 'A09772', 'A06817', 'A07100', 'A02793', 'A02408', 'A01724', 'A02494', 'A08770', 'A10114', 'A15443', 'A15103', 'A19140', 'A16209', 'A18158', 'A16784', 'A14709', 'A17221', 'A10867', 'A15724', 'A12589', 'A10530', 'A14464', 'A20392', 'B08102', 'B15269', 'A07054', 'A08452', 'A02968', 'A08239', 'A01764', 'A09659', 'A09223', 'A03124', 'A00508', 'A08305', 'A09063', 'A02157', 'A09431', 'A09049', 'A09134', 'A09862', 'A02216', 'A04988', 'A02951', 'A07474', 'A02967', 'A07401', 'A07840', 'A13678', 'A11811', 'A15621', 'A17036', 'A14882', 'A11462', 'A10811', 'A16433', 'A13105', 'A15040', 'A11025', 'A19397', 'A13102', 'A15339', 'A15036', 'A16152', 'A11683', 'A21166', 'A20496', 'A21238', 'A20476', 'A21010', 'A20501', 'A68668', 'A01797', 'A04809', 'A00218', 'A09952', 'A02966', 'A04811', 'A03700', 'A02984', 'A05048', 'A09766', 'A06706', 'A18707', 'A19398', 'A14579', 'A14467', 'A11933', 'A13808', 'A18998', 'A15061', 'A17676', 'A18708', 'A14030', 'A17642', 'A11924', 'A10113', 'A19753', 'A23563', 'A22778', 'A68991', 'A72732', 'B08014', 'B00400', 'B00562', 'B08027', 'A01504', 'A05090', 'A01685', 'A02132', 'A02141', 'A06166', 'A02974', 'A00563', 'A00947', 'A03702', 'A01097', 'A00234', 'A01073', 'A05025', 'A02099', 'A02457', 'A00819', 'A01225', 'A07960', 'A09222', 'A01227', 'A09117', 'A04561', 'A09418', 'A00801', 'A05089', 'A06163', 'A06589', 'A09596', 'A00505', 'A06145', 'A18959', 'A15467', 'A12367', 'A15498', 'A19386', 'A10376', 'A12341', 'A12369', 'A12365', 'A17319', 'A15762', 'A10786', 'A12392', 'A12774', 'A11299', 'A17151', 'A18384', 'A18762', 'A16246', 'A12345', 'A12226', 'A12371', 'A12347', 'A19795', 'A12390', 'A14338', 'A19179', 'A10308', 'A17516', 'A19385', 
'A12366', 'A19411', 'A12350', 'A14602', 'A19899', 'A12628', 'A17009', 'A17005', 'A12348', 'A13065', 'A16912', 'A16731', 'A19794', 'A13964', 'A20467', 'A20900', 'A21328', 'A20818', 'A20216', 'A68750', 'A68278', 'A68948', 'A08893', 'A04535', 'A01787', 'A02333', 'A04260', 'A04515', 'A06687', 'A03016', 'A08907', 'A08323', 'A01735', 'A09353', 'A05586', 'A05494', 'A00209', 'A01597', 'A01177', 'A01169', 'A09697', 'A10250', 'A12701', 'A16599', 'A18330', 'A16421', 'A14584', 'A14032', 'A17029', 'A11498', 'A10306', 'A12775', 'A18250', 'A13331', 'A13252', 'A20505', 'A73952', 'A72180', 'B00960', 'B08187', 'B00425', 'A01224', 'A01847', 'A02143', 'A02952', 'A09593', 'A02153', 'A00611', 'A02421', 'A01681', 'A00642', 'A00778', 'A09665', 'A00249', 'A02779', 'A01145', 'A09123', 'A03731', 'A00634', 'A03961', 'A07038', 'A03349', 'A01231', 'A08875', 'A02443', 'A01864', 'A06825', 'A05074', 'A05269', 'A13392', 'A10413', 'A14176', 'A18766', 'A12273', 'A17143', 'A19742', 'A14191', 'A15622', 'A16691', 'A14184', 'A19026', 'A12679', 'A19128', 'A16774', 'A14190', 'A22071', 'A23383', 'A20121', 'A20123', 'A21207', 'A20130', 'A68296', 'A68487', 'A73530', 'A04501', 'A02671', 'A03893', 'A07719', 'A09315', 'A04608', 'A09920', 'A07765', 'A02023', 'A04146', 'A01628', 'A03573', 'A09292', 'A06667', 'A05066', 'A03851', 'A04955', 'A09212', 'A13880', 'A10233', 'A13565', 'A19658', 'A17239', 'A12592', 'A12691', 'A13563', 'A13611', 'A13654', 'A10110', 'A14624', 'A13381', 'A18016', 'A18020', 'A19145', 'A12972', 'A20850', 'A68519', 'A73585', 'A72855', 'A73302', 'B00039', 'B00537', 'A01324', 'A02919', 'A02043', 'A05163', 'A06670', 'A06502', 'A02401', 'A05738', 'A09585', 'A06484', 'A02291', 'A03755', 'A03747', 'A02464', 'A07388', 'A01332', 'A06863', 'A02616', 'A07896', 'A06734', 'A03508', 'A03361', 'A07883', 'A01848', 'A06481', 'A00714', 'A09101', 'A04477', 'A09380', 'A02617', 'A00423', 'A09598', 'A01761', 'A00935', 'A16175', 'A15863', 'A11730', 'A10964', 'A11909', 'A18066', 'A13620', 'A16206', 'A16459', 'A19649', 
'A16785', 'A13106', 'A12827', 'A18439', 'A15295', 'A22983', 'A21206', 'A23061', 'A23370', 'A22462', 'A22871', 'A20313', 'A22910', 'A68967', 'A68918', 'A68554', 'A68098', 'A68653', 'A68509', 'A01631', 'A04790', 'A08196', 'A08483', 'A03645', 'A06390', 'A06859', 'A00276', 'A08197', 'A07184', 'A08332', 'A05017', 'A06346', 'A15347', 'A19277', 'A19306', 'A13666', 'A10086', 'A17690', 'A11429', 'A14810', 'A18608', 'A15340', 'A17646', 'A17654', 'A19291', 'A15104', 'A17693', 'A12126', 'A17648', 'A72940', 'A72894', 'B07957', 'A07674', 'A07046', 'A08000', 'A05679', 'A01719', 'A05126', 'A08517', 'A05073', 'A07400', 'A03456', 'A06678', 'A00271', 'A02080', 'A06477', 'A00742', 'A00617', 'A06181', 'A07050', 'A00518', 'A02956', 'A09291', 'A02171', 'A02074', 'A08453', 'A02133', 'A09793', 'A01362', 'A07036', 'A07047', 'A01258', 'A07049', 'A09142', 'A02908', 'A02200', 'A09226', 'A02971', 'A02955', 'A01143', 'A02957', 'A07051', 'A12343', 'A18586', 'A19267', 'A15109', 'A15754', 'A12302', 'A18918', 'A10848', 'A16393', 'A15765', 'A19120', 'A17236', 'A10967', 'A19376', 'A10806', 'A16220', 'A12535', 'A18485', 'A16498', 'A15132', 'A15761', 'A20658', 'A20777', 'A20194', 'A22683', 'A68945', 'A68558', 'A68163', 'A69205', 'A68946', 'A03854', 'A00204', 'A09636', 'A09628', 'A07039', 'A04964', 'A06617', 'A00290', 'A07040', 'A07087', 'A01115', 'A02953', 'A07082', 'A05091', 'A01684', 'A03003', 'A07045', 'A02375', 'A09178', 'A07041', 'A03906', 'A02963', 'A01175', 'A01680', 'A09300', 'A19412', 'A12984', 'A11295', 'A15341', 'A15343', 'A11054', 'A10563', 'A13585', 'A14831', 'A17125', 'A13966', 'A14157', 'A12303', 'A21185', 'A20393', 'A69334', 'A68619', 'A69171', 'A73575', 'B00849', 'B07983', 'B14740', 'B14551', 'B14599', 'A02376', 'A01947', 'A06298', 'A08061', 'A00639', 'A02970', 'A08546', 'A02614', 'A06226', 'A05094', 'A09173', 'A08280', 'A00397', 'A06132', 'A09922', 'A05185', 'A08781', 'A08178', 'A16690', 'A18640', 'A18731', 'A18441', 'A15033', 'A19072', 'A19668', 'A19973', 'A19802', 'A15038', 'A12966', 
'A14868', 'A14387', 'A19966', 'A15106', 'A14779', 'A20129', 'A20120', 'A68136', 'A03718', 'A08614', 'A03907', 'A01410', 'A08819', 'A00186', 'A07233', 'A03528', 'A03908', 'A05199', 'A19296', 'A14107', 'A19619', 'A19432', 'A15140', 'A17146', 'A13809', 'A14595', 'A16845', 'A20912', 'A20914', 'A73425', 'A72931', 'A73478', 'A72733', 'B07806', 'B07761', 'A04077', 'A00021', 'A08497', 'A09529', 'A08303', 'A02120', 'A06703', 'A02495', 'A02753', 'A00664', 'A02784', 'A00565', 'A07825', 'A03380', 'A09538', 'A01694', 'A00562', 'A09109', 'A09841', 'A04845', 'A07602', 'A02135', 'A09838', 'A09857', 'A02855', 'A08545', 'A02070', 'A09228', 'A02091', 'A13001', 'A16950', 'A13963', 'A19859', 'A16777', 'A10335', 'A14668', 'A13916', 'A19834', 'A12971', 'A16941', 'A19923', 'A11642', 'A16918', 'A12429', 'A13827', 'A18047', 'A12690', 'A10869', 'A17373', 'A12032', 'A19921', 'A18419', 'A18915', 'A16144', 'A19925', 'A19854', 'A19732', 'A20021', 'A22559', 'A68869', 'A68210', 'A69121', 'A71324', 'A72271', 'A72253', 'A09554', 'A03442', 'A04223', 'A04364', 'A04215', 'A01453', 'A08088', 'A06892', 'A05205', 'A04110', 'A02935', 'A07457', 'A07549', 'A08091', 'A08553', 'A07767', 'A02979', 'A00546', 'A09822', 'A09695', 'A06450', 'A09312', 'A13705', 'A16909', 'A15857', 'A19232', 'A13711', 'A17890', 'A16563', 'A10588', 'A18736', 'A10062', 'A19622', 'A15864', 'A18018', 'A12246', 'A16980', 'A16565', 'A20928', 'A20977', 'A20471', 'A73698', 'A72176', 'B07984', 'B00023', 'B07982', 'B00559', 'B01098', 'B13303', 'A08775', 'A00637', 'A09069', 'A06625', 'A02122', 'A04472', 'A00412', 'A08283', 'A01701', 'A02665', 'A06162', 'A02138', 'A01703', 'A09595', 'A02136', 'A01383', 'A06583', 'A09220', 'A05567', 'A03787', 'A07909', 'A00283', 'A00630', 'A03284', 'A06147', 'A03097', 'A00686', 'A19369', 'A17946', 'A18764', 'A10342', 'A14187', 'A14573', 'A14189', 'A10831', 'A14785', 'A14172', 'A11219', 'A14577', 'A16078', 'A18346', 'A17725', 'A16139', 'A10345', 'A18366', 'A22842', 'A23344', 'A21209', 'A09537', 'A05354', 'A01592', 
'A07590', 'A05067', 'A06987', 'A06266', 'A01372', 'A01570', 'A15039', 'A11417', 'A12605', 'A13744', 'A17647', 'A11086', 'A19392', 'A14460', 'A19207', 'A13753', 'A19266', 'A15028', 'A14213', 'A16828', 'A17243', 'A20391', 'A22422', 'A73542', 'B00396', 'B00614', 'B12208', 'A01312', 'A04401', 'A01740', 'A01845', 'A09094', 'A03620', 'A00932', 'A06277', 'A00193', 'A05710', 'A08673', 'A08175', 'A07853', 'A06607', 'A03408', 'A00931', 'A00755', 'A07911', 'A01325', 'A04906', 'A04930', 'A07405', 'A02916', 'A08832', 'A07462', 'A01130', 'A00159', 'A07612', 'A01333', 'A00939', 'A04764', 'A00489', 'A11423', 'A14592', 'A18750', 'A19563', 'A12787', 'A11333', 'A14588', 'A18760', 'A18744', 'A18092', 'A18057', 'A14104', 'A18767', 'A17945', 'A19796', 'A19270', 'A11204', 'A13680', 'A18437', 'A19272', 'A13043', 'A20382', 'A22897', 'A68419', 'A72804', 'A09734', 'A08542', 'A00168', 'A06210', 'A04136', 'A03079', 'A06224', 'A09499', 'A09946', 'A06348', 'A03121', 'A08447', 'A06155', 'A12590', 'A14711', 'A11574', 'A17215', 'A18573', 'A16913', 'A17724', 'A17649', 'A10270', 'A16163', 'A18433', 'A10176', 'A19309', 'A10170', 'A19606', 'A12708', 'A11953', 'A17727', 'A22431', 'A21163', 'A69054', 'A68828', 'A72063', 'B00482', 'B00052', 'B00330', 'B00830', 'B11289', 'B15101', 'B11202', 'A01371', 'A07826', 'A06164', 'A06672', 'A02649', 'A02645', 'A08309', 'A05188', 'A05184', 'A06183', 'A07908', 'A07822', 'A03921', 'A02798', 'A02281', 'A06673', 'A04555', 'A07686', 'A06169', 'A02230', 'A00823', 'A09400', 'A06401', 'A09189', 'A08003', 'A07618', 'A00401', 'A02644', 'A09388', 'A08271', 'A08904', 'A07483', 'A08304', 'A06171', 'A02652', 'A05729', 'A06962', 'A07919', 'A18752', 'A16198', 'A12781', 'A17337', 'A11028', 'A13300', 'A18910', 'A12778', 'A19172', 'A19443', 'A19191', 'A15998', 'A11268', 'A18428', 'A15748', 'A10354', 'A19931', 'A18329', 'A17707', 'A18741', 'A19439', 'A17154', 'A11685', 'A18763', 'A13069', 'A11735', 'A11573', 'A12550', 'A13823', 'A20853', 'A20410', 'A20829', 'A20114', 'A69361', 'A68315', 
'A05203', 'A03464', 'A04889', 'A04813', 'A07288', 'A08430', 'A09733', 'A01491', 'A06630', 'A09676', 'A18072', 'A17599', 'A11250', 'A13168', 'A16720', 'A14226', 'A15772', 'A14430', 'A16195', 'A16436', 'A15779', 'A12779', 'A10594', 'A21144', 'A20489', 'A20782', 'A21147', 'A69339', 'A73047', 'A73748', 'A72740', 'A72549', 'B00565', 'B07428', 'B00087', 'B12205', 'A05182', 'A01774', 'A01125', 'A02366', 'A02096', 'A01629', 'A07176', 'A01353', 'A09407', 'A02201', 'A01286', 'A04793', 'A01718', 'A09998', 'A05312', 'A07769', 'A09592', 'A06227', 'A02479', 'A09001', 'A02961', 'A02273', 'A02795', 'A03804', 'A14818', 'A10583', 'A19965', 'A19623', 'A10716', 'A16796', 'A19957', 'A11861', 'A18769', 'A16832', 'A15047', 'A12581', 'A10957', 'A11555', 'A16508', 'A10414', 'A15032', 'A12622', 'A19751', 'A20591', 'A68197', 'A68202', 'A68498', 'A07158', 'A05358', 'A04954', 'A09298', 'A01262', 'A02986', 'A08789', 'A05463', 'A09826', 'A14216', 'A15334', 'A19998', 'A16466', 'A11529', 'A10614', 'A18639', 'A11949', 'A15337', 'A13761', 'A16255', 'A18435', 'A10111', 'A14024', 'A20531', 'A68628', 'A68407', 'A73873', 'A02976', 'A05059', 'A09043', 'A06713', 'A09411', 'A02184', 'A03860', 'A07836', 'A07559', 'A02180', 'A01502', 'A00945', 'A02794', 'A04499', 'A04542', 'A09753', 'A03312', 'A03890', 'A01161', 'A03963', 'A06278', 'A09232', 'A09097', 'A05575', 'A04549', 'A09930', 'A06167', 'A08363', 'A01952', 'A06960', 'A04586', 'A05410', 'A15516', 'A10726', 'A11272', 'A19322', 'A17246', 'A12772', 'A11262', 'A11270', 'A16401', 'A12633', 'A16970', 'A12650', 'A12361', 'A18770', 'A13068', 'A19797', 'A10813', 'A12363', 'A18748', 'A17165', 'A12224', 'A16972', 'A17420', 'A12773', 'A16758', 'A18417', 'A17259', 'A12634', 'A17328', 'A10816', 'A19462', 'A12472', 'A10109', 'A19942', 'A18431', 'A10810', 'A11019', 'A19937', 'A16202', 'A11269', 'A21170', 'A20436', 'A20813', 'A68961', 'A68287', 'A72464', 'A72208', 'A07957', 'A03344', 'A09819', 'A00804', 'A06230', 'A03779', 'A05517', 'A09576', 'A07918', 'A09574', 'A06228', 
'A08202', 'A05336', 'A03949', 'A05412', 'A08436', 'A00464', 'A01501', 'A06229', 'A02334', 'A18432', 'A14040', 'A11401', 'A11537', 'A16523', 'A13339', 'A11406', 'A18901', 'A11541', 'A15344', 'A18686', 'A17261', 'A11027', 'A12225', 'A22691', 'A72506', 'B00475', 'B07516', 'B07677', 'B11895', 'B12663', 'A06168', 'A00538', 'A06916', 'A03903', 'A00259', 'A06170', 'A04803', 'A09227', 'A09391', 'A02775', 'A03321', 'A02117', 'A05183', 'A00700', 'A07929', 'A09473', 'A09410', 'A01716', 'A00946', 'A06203', 'A00983', 'A02774', 'A04567', 'A04785', 'A09224', 'A03398', 'A12351', 'A14178', 'A12386', 'A18451', 'A19394', 'A11254', 'A13103', 'A16145', 'A12406', 'A14826', 'A18722', 'A18595', 'A15431', 'A12231', 'A18751', 'A10353', 'A18771', 'A17318', 'A12045', 'A12677', 'A18638', 'A19244', 'A19468', 'A20028', 'A20823', 'A68064', 'A68024', 'A72252', 'A09295', 'A09297', 'A01492', 'A02335', 'A00001', 'A06312', 'A01490', 'A02131', 'A03232', 'A08306', 'A15035', 'A18980', 'A13173', 'A11247', 'A18694', 'A17369', 'A12706', 'A22795', 'A22687', 'A20188', 'A68164', 'A72738', 'B08205']

In [7]:
# splitting texts for training/testing by index

# training: list of lists of strings (one token list per training text)
training = []
# testing: one flat list of strings (tokens of all test texts, concatenated)
testing = []
# number of texts that went into the testing set
testlen = 0
# enumerate gives every text its own position. Looking the position up with
# bigramtokens.index(t) returns the FIRST equal token list, so duplicate texts
# would all fall on the same side of the split, and the list would be
# rescanned for every text.
for i, t in enumerate(bigramtokens):
    # 50:50 split: even positions train, odd positions test
    # (use `i % 2 == 1` to swap the halves)
    # for running period-specific training/testing use instead:
    # if bigramnames[i] not in testnames:
    if i % 2 == 0:
        training.append(t)
    else:
        testlen += 1
        testing.extend(t)

print(len(training))
print(testlen)

427
427


In [8]:
# generating bigrams, can take a bit of time lmao

# training bigram model: parameters incl min count, threshold (from -1 to 1),
# scoring (npmi = more robust?), and connector words enabled to allow for
# longer, informative ngrams (e.g. 'trade and traffic')
bigrammodel = Phrases(
    training,
    min_count=3,
    threshold=-0.5,
    scoring='npmi',
    connector_words=phrases.ENGLISH_CONNECTOR_WORDS,
)

# getting the frequency(?) of bigrams within testing set: keep only the
# tokens the model joined with "_"
bgcount = Counter(b for b in bigrammodel[testing] if "_" in b)

# printing top 20 most common bigrams
bgtable = pd.DataFrame(
    list(bgcount.values()),
    index=list(bgcount.keys()),
    columns=['bigram frequency'],
)
print(bgtable.sort_values('bigram frequency', ascending=False).head(n=20))

            bigram frequency
they_have              16956
that_they              16174
say_that               10826
that_have               9903
will_not                9714
which_have              9100
which_they              8920
can_not                 8093
his_own                 7857
that_which              7509
have_not                7147
they_shall              6992
who_have                6817
they_will               6587
if_they                 6351
not_only                6167
when_they               6073
they_that               5962
they_may                5653
will_have               5600


In [33]:
# looking for specific bigrams based on a word of interest
searchword = 'tobacco'

# for outputting to txt file, specify here
bruh = '/srv/data/bigrams.txt'

# matching bigram -> its frequency in bgcount
searchbigrams = {}
# 'a+' appends, so re-running this cell adds the matches to the file again.
# The with-block closes (and so flushes) the file as soon as the loop ends;
# a handle that is never closed can leave the written lines in the buffer.
with open(bruh, 'a+') as bgoutfile:
    for key, freq in bgcount.items():
        # split once; keep the n-gram if it starts or ends with the word
        parts = key.split('_')
        if parts[0] == searchword or parts[-1] == searchword:
            # printing out to textfile yee
            bgoutfile.write(key + '\n')
            searchbigrams[key] = freq

# nice printing, ordered by frequency
print(pd.DataFrame(searchbigrams.values(), index=searchbigrams.keys(), columns=['frequency']).sort_values('frequency', ascending=False).head(20))

                    frequency
tobacco_which              29
tobacco_pipe               21
take_tobacco               12
tobacco_shop                9
tobacco_shall               8
tobacco_the_growth          7
tobacco_that                6
tobacco_they                6
their_tobacco               6
such_tobacco                5
tobacco_but                 5
tobacco_have                5
store_tobacco               5
leaf_tobacco                4
tobacco_tobacco             4
tobacco_make                4
good_tobacco                4
smoke_tobacco               4
tobacco_our                 4
use_tobacco                 4


In [12]:
# trigrams oof

# second Phrases pass, trained on the training texts after the bigram model
# has been applied to them, with the same parameters as the bigram model
trigrammodel = Phrases(
    bigrammodel[training],
    min_count=3,
    threshold=-0.5,
    scoring='npmi',
    connector_words=phrases.ENGLISH_CONNECTOR_WORDS,
)

# keep only tokens made of at least three "_"-joined parts
tgcount = Counter(t for t in trigrammodel[testing] if t.count("_") >= 2)

# printing top 20 most common trigrams
tgtable = pd.DataFrame(
    list(tgcount.values()),
    index=list(tgcount.keys()),
    columns=['trigram frequency'],
)
print(tgtable.sort_values('trigram frequency', ascending=False).head(n=20))


                   trigram frequency
all_the_rest                    1244
that_the_king                   1235
five_and_twenty                  750
all_the_world                    735
unto_the_king                    728
into_the_sea                     724
after_the_death                  691
gold_and_silver                  680
four_and_twenty                  663
little_and_little                639
that_the_pope                    625
about_the_year                   599
that_the_same                    572
two_and_twenty                   545
one_the_other                    544
which_the_king                   538
hot_and_dry                      530
that_the_lord                    498
than_the_other                   489
against_the_king                 488


In [40]:
# searching for terms of interest in trigrams

# looking for specific trigrams based on a word of interest
searchword = 'tobacco'

# for outputting to txt file, specify here
scream = '/srv/data/trigrams.txt'

# matching trigram -> its frequency in tgcount
searchtrigrams = {}
# 'a+' appends, so re-running this cell adds the matches to the file again.
# The with-block closes (and so flushes) the file as soon as the loop ends;
# a handle that is never closed can leave the written lines in the buffer.
with open(scream, 'a+') as tgoutfile:
    for key, freq in tgcount.items():
        # split once; every key in tgcount has at least three parts, so
        # parts[1] always exists
        parts = key.split('_')
        # keep the n-gram if the word is its first, second or last part
        if parts[0] == searchword or parts[-1] == searchword or parts[1] == searchword:
            # writing to text file
            tgoutfile.write(key + '\n')
            searchtrigrams[key] = freq

# nice printing, ordered by frequency
print(pd.DataFrame(searchtrigrams.values(), index=searchtrigrams.keys(), columns=['frequency']).sort_values('frequency', ascending=False).head(20))

                       frequency
tobacco_the_growth             7
tobacco_and_other              3
fur_and_tobacco                3
tobacco_the_excessive          1
water_and_tobacco              1
tobacco_and_many               1


In [None]:
# printing out context windows for a given ngram, double check which dataset used for bigram generation (nostop or stop)
# if you need TCP context windows, use select.py to create a new folder, then grep the specific term

# add spaces before and after bigram if you are looking for two very specific words, e.g. "angel men" and not "angel mentions"
searchgram = 'serpent drug'


def _occurrences(text, gram):
    """Return the start index of every (possibly overlapping) hit of gram in text."""
    starts = []
    pos = text.find(gram)
    while pos != -1:
        starts.append(pos)
        pos = text.find(gram, pos + 1)
    return starts


def _window(text, index, width=100):
    """Return up to `width` characters of text on either side of index."""
    # clamp the start at 0: a negative slice start would count from the END
    # of the text and give an empty or wrong window for early hits
    return text[max(0, index - width):index + width]


# accounting for flipped instances of bigrams: swap the two words but keep any
# padding spaces around them, so ' angel men ' flips to ' men angel '
gramwords = searchgram.split()
if len(gramwords) < 2:
    raise ValueError("searchgram must contain two space-separated words")
leadpad = searchgram[:len(searchgram) - len(searchgram.lstrip(' '))]
trailpad = searchgram[len(searchgram.rstrip(' ')):]
flipsearch = leadpad + gramwords[1] + ' ' + gramwords[0] + trailpad

# names of the texts containing the bigram in either word order
names = []
# pair each text with its own name; looking the name up through
# bigramtexts.index(text) would give duplicate texts the first text's name
for name, text in zip(bigramnames, bigramtexts):
    hits = _occurrences(text, searchgram)
    fliphits = _occurrences(text, flipsearch)
    if not hits and not fliphits:
        continue
    names.append(name)
    # windows for the given order first, then for the flipped order
    for index in hits + fliphits:
        print(name + ':', _window(text, index))
print(names, len(names))