In [None]:
'''
Unsupervised k-means clustering on plain text file versions of EEBO-TCP texts. 
The plain text files are generated by running stageOne.py 

Produces a user-specified number of clusters based on TF-IDF (term frequency-inverse document frequency) vectorization. 
These clusters are described using the keywords found in their corresponding entries 
in a metadata CSV file also made using stageOne.py 

The vectorize and cluster functions are adapted from the EarlyPrint Lab: 
    https://earlyprint.org/jupyterbook/unsupervised.html
The pca function is adapted from Ask Python: 
    https://www.askpython.com/python/examples/plot-k-means-clusters-python 
'''
import numpy as np
import pandas as pd
from collections import defaultdict,Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# import our own utility functions 
from functions import remove_stopwords,getTexts,keywords


def vectorize(strings, ids, *, strip_stopwords=True):
    '''
    Build a TF-IDF document-term matrix for a collection of texts.

    Args:
        strings: The document texts to vectorize.
        ids: Row labels for the resulting DataFrame, in the same order as
            ``strings``.
        strip_stopwords: When true (the default, which matches the previous
            behaviour of this function), the texts are first passed through
            the project's ``remove_stopwords`` helper and each document's
            tokens are re-joined with single spaces. Pass ``False`` to
            vectorize ``strings`` unchanged.

    Returns:
        A pandas DataFrame with one row per document (indexed by ``ids``)
        and one column per term kept by the vectorizer.
    '''
    # comment below is from EarlyPrint
    # sublinear term frequency scaling takes the log of
    # term frequencies and can help to de-emphasize function words like pronouns and articles.
    # min_df=2 ignores terms that occur in fewer than two documents.
    tfidf = TfidfVectorizer(min_df=2, sublinear_tf=True)

    if strip_stopwords:
        # remove_stopwords hands back one sequence of tokens per document,
        # but the vectorizer wants plain strings, so glue the tokens back together.
        documents = [' '.join(tokens) for tokens in remove_stopwords(strings)]
    else:
        documents = strings

    results = tfidf.fit_transform(documents)

    # Convert the sparse matrix back to a labelled DataFrame
    return pd.DataFrame(results.toarray(), index=ids, columns=tfidf.get_feature_names_out())

def pca(kmeans,df):
    '''
    Scatter-plot the rows of ``df`` in two PCA dimensions, coloured by cluster.

    PCA visualization code comes from https://www.askpython.com/python/examples/plot-k-means-clusters-python

    Args:
        kmeans: A KMeans estimator. If it has already been fitted, its
            existing ``labels_`` are used as-is and the estimator is left
            untouched; this expects it to have been fitted on ``df`` so that
            there is one label per row. If it has not been fitted yet, it is
            fitted on ``df`` here.
        df: The feature matrix (one row per document) to project and plot.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    '''
    projection = PCA(n_components=2).fit_transform(df)

    # Colour the points by the clustering the caller already computed.
    # Refitting the estimator on the 2-D projection (as this function used to)
    # silently replaced the caller's labels_ with a different clustering.
    labels = getattr(kmeans, 'labels_', None)
    if labels is None:
        labels = kmeans.fit_predict(df)

    for cluster_id in np.unique(labels):
        members = labels == cluster_id
        plt.scatter(projection[members, 0], projection[members, 1], label=cluster_id)
    plt.legend()
    plt.show()
def cluster(df,ids,num):
    '''
    Run k-means on ``df``, plot the result, and group the ids by cluster label.

    Args:
        df: The feature matrix to cluster (one row per entry in ``ids``).
        ids: Identifiers paired positionally with the fitted labels.
        num: Number of clusters to look for.

    Returns:
        A defaultdict mapping each cluster label to the list of ids assigned
        to it. Each label and its ids are also printed.

    Note:
        The labels are read from the model after ``pca()`` has been called,
        so they reflect whatever state ``pca()`` leaves the model in.
    '''
    # A fixed random_state keeps the clustering reproducible between runs.
    model = KMeans(n_clusters=num, random_state=42)
    model.fit(df)
    pca(model, df)

    kmeans_groups = defaultdict(list)
    for label, doc_id in zip(model.labels_, ids):
        kmeans_groups[label].append(doc_id)

    for label in kmeans_groups:
        print(label, ': ', kmeans_groups[label])

    return kmeans_groups

In [None]:
# Input locations: the folder handed to getTexts and the metadata CSV
# that the keyword lookup reads.
folder = '/srv/data/VirginiaEPTextClean'
csvFile = '/srv/data/metadata/tuning/virginia.csv'

# The first item returned by getTexts is used as the document strings and
# the second as their ids.
info = getTexts(folder)
texts, text_ids = info[0], info[1]

# Vectorize the texts, then split them into 4 clusters.
df = vectorize(texts, text_ids)
groups = cluster(df, text_ids, 4)

In [None]:
# Describe each cluster by tallying the keywords of its member texts.
words = keywords(csvFile)
for group_id, members in groups.items():
    print(f'Group {group_id}')
    keyterms = []
    for name in members:
        # Ids containing an underscore are left out of the tally
        # (presumably they have no entry of their own in the CSV; confirm).
        if '_' in name:
            continue
        # The first element of a text's metadata entry holds its keywords.
        keyterms += words[name][0]
    print(Counter(keyterms))