In [25]:
'''
https://earlyprint.org/jupyterbook/unsupervised.html
'''
# General Libraries Needed
import csv
import os 
import pandas as pd
from collections import defaultdict, Counter

# Functions for Unsupervised Clustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Libraries for Graphing
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

def getTexts(folder, encoding=None):
    '''
    Read every plain-text file in a folder.

    Parameters
    ----------
    folder : str
        Directory containing one plain-text file per document.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform default.

    Returns
    -------
    tuple of (list of str, list of str)
        The first list holds the full contents of each file as a single string;
        the second holds the matching IDs, i.e. each file name up to its first
        '.'. Both lists are ordered by file name.
    '''
    textStrings = []
    fileNames = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # (and everything computed from it) the same from run to run.
    for file in sorted(os.listdir(folder)):
        path = os.path.join(folder, file)
        # Skip sub-directories (e.g. .ipynb_checkpoints), which cannot be opened as text.
        if not os.path.isfile(path):
            continue
        # The context manager closes the file even if reading fails.
        with open(path, 'r', encoding=encoding) as f:
            # Read the whole file, not just its first line; an empty file yields ''.
            textStrings.append(f.read())
        fileNames.append(file.split('.')[0])
    return textStrings, fileNames

def vectorize(strings,ids):
    '''
    Turn raw text strings into a TF-IDF weighted document-term table.

    Parameters
    ----------
    strings : list of str
        One string per document.
    ids : list
        Row labels for the result, in the same order as ``strings``.

    Returns
    -------
    pandas.DataFrame
        One row per document (indexed by ``ids``) and one column per term.
    '''
    # min_df=2 ignores terms found in fewer than two documents.
    # sublinear_tf=True applies log scaling to term frequencies, which can help
    # de-emphasize function words like pronouns and articles; a different corpus
    # might call for a different choice. Normalization is left at its default (l2).
    vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=2)

    # Learn the vocabulary and compute the weights in one pass.
    weights = vectorizer.fit_transform(strings)
    vocabulary = vectorizer.get_feature_names_out()

    # Convert the sparse result to a dense, labelled table for readability.
    return pd.DataFrame(weights.toarray(), columns=vocabulary, index=ids)

def cluster(df,ids,num):
    '''
    Cluster documents with K-Means, print each cluster's members and draw a
    two-component PCA scatter plot coloured by cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table with one row per document.
    ids : list
        Document IDs, in the same order as the rows of ``df``.
    num : int
        Number of clusters to look for.

    Returns
    -------
    collections.defaultdict
        Maps each cluster label to the list of IDs assigned to it.
    '''
    # random_state fixes the seed used when choosing the initial centroids,
    # so repeated runs on the same data produce the same clusters.
    model = KMeans(random_state=42, n_clusters=num)
    model.fit(df)
    labels = model.labels_

    # Collect the IDs that share a cluster label.
    kmeans_groups = defaultdict(list)
    for label, text_id in zip(labels, ids):
        kmeans_groups[label].append(text_id)

    # Show the members of every cluster, one list per line.
    for members in kmeans_groups.values():
        print(members)

    # Project the features onto their first two principal components for plotting.
    projection = PCA(n_components=2).fit_transform(df)
    pca_df = pd.DataFrame(projection, columns=["pc1","pc2"], index=ids)

    # The cluster label of each point selects its colour in the scatter plot.
    pca_df['color'] = pd.Series(labels, index=ids)
    pca_df.plot.scatter(x='pc1', y='pc2', c='color', colormap='tab10', colorbar=False)
    return kmeans_groups

def keywords(csv,groups):
    '''
    Print, for every cluster, how often each metadata keyword occurs among its texts.

    Parameters
    ----------
    csv : str
        Path to a CSV file with an ``id`` column and a ``keywords`` column whose
        values are keyword strings separated by ``--``. Empty keyword cells are
        treated as "no keywords".
    groups : dict
        Maps a numeric cluster label to the list of text IDs in that cluster.

    Raises
    ------
    KeyError
        If an ID listed in ``groups`` has no row in the CSV.
    '''
    df = pd.read_csv(csv)

    # Map each text ID to its set of distinct keywords.
    terms_by_id = {}
    for text_id, raw in zip(df['id'], df['keywords']):
        # pandas reads an empty cell as NaN (a float), which has no .split().
        parts = [] if pd.isna(raw) else str(raw).split('--')
        terms = set(parts)
        # A leading/trailing/doubled '--' separator leaves empty strings behind.
        terms.discard('')
        terms_by_id[text_id] = terms

    for label, members in groups.items():
        # Cluster labels start at 0; display them starting at 1.
        print(f'Group {label+1}')
        tally = Counter()
        for name in members:
            # sorted() keeps the order of equally frequent keywords independent
            # of set iteration order.
            tally.update(sorted(terms_by_id[name]))
        print(tally)

In [26]:
# Configuration: input locations and the number of clusters to look for.
folder = '/srv/data/texts'
# Named csv_path rather than csv so the imported csv module is not overwritten.
csv_path = '/srv/data/CSVs/EPtuning.csv'
N_CLUSTERS = 4

# Load the texts, vectorize them, cluster them, then summarise keywords per cluster.
info = getTexts(folder)
texts, ids = info
df = vectorize(texts, ids)
groups = cluster(df, ids, N_CLUSTERS)
keywords(csv_path, groups)