In [1]:
import numpy as np
from os.path import join,isfile
from os import listdir
from tqdm import tqdm
import json
import pickle
from collections import defaultdict
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import pearsonr,spearmanr,percentileofscore
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from transformers import BertTokenizer

  from pandas import Int64Index as NumericIndex


In [2]:
# --- analysis parameters ---
# key used to select one entry from each loaded embeddings object (target_embeddings[layer])
layer = 12
# a term is only scored when it has MORE than this many embeddings
min_samples = 10

# --- input / output locations (all relative to the working directory) ---
indir = join('data', 'preprocessed')
outdir = join('data', 'preprocessed')
embedding_dir = join('data', 'embeddings')
data_file = join(indir, 'cofea.jsonlist')
target_index_file = join(indir, 'sample_target_index.dict')
special_terms_file = join('data', 'interest_terms.txt')

# only documents whose 'source' field is one of these are used as subset labels
filter_labels = {
    'Evans Early American Imprints',
    'HeinOnline',
    'National Archives Founders Online',
}

# prefix shared by the embedding files to analyse
# (alternative prefix seen in this project: 'cofea_histbert_vectors_')
file_header = "cofea_sampled_vectors_"

# regular files in embedding_dir whose name contains the prefix
embedding_files = [
    name
    for name in listdir(embedding_dir)
    if isfile(join(embedding_dir, name)) and file_header in name
]

# tokenizer used to bring the terms of interest into the same word-piece form as the index
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
# Read the raw CoFEA records: one JSON document per line, kept as unparsed strings.
# The encoding is given explicitly so decoding does not depend on the platform's
# default locale (the terms-of-interest file below is opened the same way).
with open(data_file, 'r', encoding='utf-8') as f:
    cofea_data = f.readlines()

# Load the index of the embeddings: keyed by target word, each entry carries a
# document index into cofea_data (see its use in the scoring loop).
# NOTE(review): pickle.load can execute arbitrary code -- only load files produced
# by this project's own preprocessing.
with open(target_index_file, 'rb') as f:
    target_index = pickle.load(f)

# list of terms of interest, one term per line
with open(special_terms_file, 'r', encoding='utf-8') as f:
    special_terms = f.read().splitlines()

# they were indexed and saved in their tokenized form
special_terms = [tokenizer.tokenize(x) for x in special_terms]


def _rejoin_wordpieces(pieces):
    """Glue '##'-prefixed pieces onto the preceding piece and space-join the words.

    The '##' marker itself is kept in the joined text, e.g.
    ['em', '##bed', 'ding'] -> 'em##bed ding'. A '##' piece in first position
    has nothing to attach to, so it starts its own word.
    """
    words = []
    for piece in pieces:
        if words and piece.startswith('##'):
            words[-1] += piece
        else:
            words.append(piece)
    return ' '.join(words)


# unique terms of interest in their rejoined form
special_terms_cleaned = {_rejoin_wordpieces(pieces) for pieces in special_terms}

In [None]:
# Variation Across Subsets
# Measure silhouette scores using the subset (Founders, Evans, or Hein) as labels.
# Terms with high silhouette scores are used in very different contexts across the subsets.
# Look at the terms that vary the most, and at terms of interest whose
# silhouette scores are high (in terms of quantiles) versus those that are low.

# per-term silhouette score, and the source label of every embedding that was scored
subset_eval_scores = {}
subset_target_labels = {}

# doc_index -> source. The same document is referenced by many occurrences and
# many target words, so each JSON line is parsed at most once instead of once
# per occurrence.
doc_source_cache = {}

for target_file in tqdm(embedding_files):
    # the target word is the file name without the shared prefix and the '.dict' suffix
    target_word = target_file.replace(file_header, '').replace('.dict', '')

    # get embeddings
    # NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
    with open(join(embedding_dir, target_file), 'rb') as f:
        target_embeddings = pickle.load(f)
    embeddings = target_embeddings[layer]

    # get labels and only keep embeddings that come from one of the filter_labels sources;
    # position x in the index entry list is used as the position in `embeddings`
    labels = []
    filter_embeddings = []
    for x, index in enumerate(target_index[target_word]):
        _, doc_index, _ = index
        if doc_index not in doc_source_cache:
            doc_source_cache[doc_index] = json.loads(cofea_data[doc_index])['source']
        source = doc_source_cache[doc_index]
        if source in filter_labels:
            labels.append(source)
            filter_embeddings.append(embeddings[x])

    # get the silhouette score: needs more than min_samples points and at least two sources
    if len(filter_embeddings) > min_samples and len(set(labels)) > 1:
        X = np.array(filter_embeddings)
        # save the source labels and the score
        subset_eval_scores[target_word] = silhouette_score(X, labels)
        subset_target_labels[target_word] = labels



 99%|████████████████████████████████████▌| 1075/1088 [1:02:05<00:36,  2.83s/it]

In [None]:
# Persist the subset-variation results next to the preprocessed data.
# The file-name suffixes (including their spelling) are kept exactly as they are.
for suffix, payload in (
    ('sillhouette_scores_subset_variation.dict', subset_eval_scores),
    ('subset_labels.dict', subset_target_labels),
):
    with open(join(outdir, file_header + suffix), 'wb') as f:
        pickle.dump(payload, file=f)


In [6]:
# Variation over all
# Use the same contextual embeddings and pool Evans, Hein and Founders together.
# Run k-means clustering with k=2, then compute silhouette scores on those clusters.
# This helps to pick out terms that have at least two distinct meanings.
# May want to further split those above some silhouette threshold and check the
# silhouette scores of the resulting clusters.
# Again, this can identify terms that show a lot vs. a little variation in usage / contexts
# (with a focus on terms of interest that might require more investigation).

# per-term silhouette score of the 2-cluster solution, and the k-means label of every embedding
overall_eval_scores = {}
overall_target_labels = {}

# a silhouette score is only recorded when BOTH clusters have at least this many members
min_cluster_size = 4

for target_file in tqdm(embedding_files):
    # the target word is the file name without the shared prefix and the '.dict' suffix
    target_word = target_file.replace(file_header, '').replace('.dict', '')

    # NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
    with open(join(embedding_dir, target_file), 'rb') as f:
        target_embeddings = pickle.load(f)

    embeddings = target_embeddings[layer]
    if len(embeddings) > min_samples:
        X = np.array(embeddings)
        kmeans = KMeans(n_clusters=2, random_state=0).fit(X)

        # Size of each of the two clusters. minlength=2 makes an empty cluster show
        # up as a count of 0 rather than being dropped from the array.
        cluster_sizes = np.bincount(kmeans.labels_, minlength=2)

        # Check that there are enough items in EACH cluster. Counting only one
        # cluster would let a tiny (or empty) second cluster through, and
        # silhouette_score raises when fewer than two labels are present.
        if cluster_sizes.min() >= min_cluster_size:
            # silhouette score
            overall_eval_scores[target_word] = silhouette_score(X, kmeans.labels_)

        # save the clustering labels (kept even when no score was recorded)
        overall_target_labels[target_word] = kmeans.labels_



100%|███████████████████████████████████████| 1088/1088 [35:19<00:00,  1.95s/it]


In [7]:
# Persist the pooled (k-means) variation results next to the preprocessed data.
for suffix, payload in (
    ('silhouette_scores_over_all_variation.dict', overall_eval_scores),
    ('kmeans_labels.dict', overall_target_labels),
):
    with open(join(outdir, file_header + suffix), 'wb') as f:
        pickle.dump(payload, file=f)