In [18]:
import os
import glob
import numpy as np
import pandas as pd

import sys
sys.path.append('../src')

import embedding_functions

from sklearn.decomposition import PCA
from scipy.stats import gaussian_kde

In [None]:
# Root directory of the raw training split, relative to the notebook's working directory.
data_path: str = './../data/raw/20news-bydate/20news-bydate-train'

In [None]:
# Build a data frame with one row per document: the raw text and the name of
# the category directory it was read from.

# Category sub-directories of `data_path`. os.listdir returns entries in
# arbitrary order, so sort them to make the row order of `df` reproducible.
categories = sorted(
    d for d in os.listdir(data_path)
    if os.path.isdir(os.path.join(data_path, d))
)

# parallel lists: texts[i] is the content of a file, labels[i] its category
texts = []
labels = []

for category in categories:
    category_path = os.path.join(data_path, category)

    # glob order depends on the file system as well; sort it, and skip anything
    # that is not a regular file so that open() cannot hit a nested directory
    file_paths = sorted(
        p for p in glob.glob(os.path.join(category_path, '*'))
        if os.path.isfile(p)
    )

    for file_path in file_paths:
        # errors='ignore' silently drops bytes that are not valid UTF-8
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            texts.append(file.read())
        labels.append(category)

# create data frame
df = pd.DataFrame({'text': texts, 'category': labels})

In [None]:
# Generate one embedding per document and store the results in df['embedding'].
#
# NOTE(review): the original called a bare `get_embedding`, but only the module
# `embedding_functions` is imported, so the call is qualified here. This assumes
# the helper lives in that module under this name - confirm against ../src.
embeddings = []
failed_indices = []  # positions whose embedding could not be generated

for idx, text in enumerate(df['text']):
    try:
        embedding = embedding_functions.get_embedding(text)
    except Exception as e:
        print(f"Error generating embedding for index {idx}: {e}")
        # Record an explicit gap. Without this, `embedding` would be undefined
        # on a first-row failure or would still hold the previous row's vector.
        embedding = None
        failed_indices.append(idx)

    embeddings.append(embedding)

    # if necessary ... short delay to avoid API rate limits (requires `import time`)
    # time.sleep(0.1)

if failed_indices:
    # rows with a None embedding must be retried or dropped before any numeric processing
    print(f"{len(failed_indices)} of {len(embeddings)} embeddings failed")

# add embeddings to data frame
df['embedding'] = embeddings

#### Re-Import Processed Data

In [None]:
# Optional shortcut: uncomment to reload the data frame pickled by the export cell
# further down instead of rebuilding it. Only unpickle files you created yourself,
# since loading a pickle can execute arbitrary code.
# df = pd.read_pickle('./../data/processed/train_with_embeddings.pkl')

#### Conduct Principal Component Analysis

First, convert the embeddings to a NumPy array.

In [19]:
# collect the per-row embedding vectors into one array (one entry per data-frame row)
embedding_array = np.asarray(df['embedding'].to_list())

Next, initialize and fit PCA to reduce to 2 dimensions.

In [20]:
# Project the embeddings onto their first two principal components.
# random_state is fixed because, with the default svd_solver='auto', scikit-learn
# may choose the randomized solver for large inputs; the seed makes re-runs give
# the same projection. It has no effect when an exact solver is selected.
pca_2d = PCA(n_components=2, random_state=42)

# one row per input row, one column per component
components = pca_2d.fit_transform(embedding_array)

Add the new components as X and Y columns in the data frame.

In [21]:
# transpose so the two component columns unpack as rows: first -> x, second -> y
df['x_2d'], df['y_2d'] = components.T

Next, repeat PCA but for 3 dimensions, and add those to the data frame.

In [22]:
# Project the embeddings onto their first three principal components.
# random_state is fixed because, with the default svd_solver='auto', scikit-learn
# may choose the randomized solver for large inputs; the seed makes re-runs give
# the same projection. It has no effect when an exact solver is selected.
pca_3d = PCA(n_components=3, random_state=42)

# NOTE: this rebinds `components`, which previously held the 2D projection
components = pca_3d.fit_transform(embedding_array)

# store each component as its own coordinate column
df['x_3d'] = components[:, 0]
df['y_3d'] = components[:, 1]
df['z_3d'] = components[:, 2]

Export the final data frame.

In [23]:
# write the data frame, including the PCA coordinate columns, to the 'processed' folder
processed_pickle_path = './../data/processed/train_with_embeddings.pkl'
df.to_pickle(processed_pickle_path)

#### Create Overlap Matrix

##### Generate Kernel Density Estimates

Group the data by category, stack each group's 2D coordinates into a 2 x n array, and fit a Gaussian kernel density estimate (KDE) to each group.

In [24]:
# distinct category labels, in order of first appearance in the data frame
categories = df['category'].unique()

# category label -> Gaussian KDE fitted on that category's 2D coordinates
kdes = {}

for cls in categories:
    in_class = df['category'] == cls

    # gaussian_kde expects shape (n_dims, n_points): row 0 holds x, row 1 holds y
    xs = df.loc[in_class, 'x_2d']
    ys = df.loc[in_class, 'y_2d']

    kdes[cls] = gaussian_kde(np.vstack([xs, ys]))

Define a grid over the embedding space using the `x_2d` and `y_2d` columns.

In [25]:
grid_size = 100  # number of grid points along each axis

# Bounding box of the 2D projection, widened by 1 on every side.
# NOTE(review): the fixed margin presumably assumes coordinates on the order of
# unity - confirm it is sensible for the actual scale of x_2d / y_2d.
xmin = df['x_2d'].min() - 1
xmax = df['x_2d'].max() + 1
ymin = df['y_2d'].min() - 1
ymax = df['y_2d'].max() + 1

# evenly spaced axis values and the resulting 2D mesh
xgrid = np.linspace(xmin, xmax, num=grid_size)
ygrid = np.linspace(ymin, ymax, num=grid_size)
X, Y = np.meshgrid(xgrid, ygrid)

# flatten the mesh into a list of points, shape: (2, grid_size^2)
grid_coords = np.stack((X.ravel(), Y.ravel()))

Evaluate each class's KDE on the grid.

In [26]:
# category label -> density values on the grid, reshaped to the mesh shape of X
density = {
    label: estimator(grid_coords).reshape(X.shape)
    for label, estimator in kdes.items()
}

##### Compute KDE Overlap Coefficient

Determine the area element (dx * dy):

In [27]:
# spacing between neighbouring grid values along each axis (uniform, from np.linspace)
dx, dy = xgrid[1] - xgrid[0], ygrid[1] - ygrid[0]

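Loop over all pairs of classes using sorted category labels. For each pair, approximate the overlap coefficient as the integral of the pointwise minimum of the two densities (sum over the grid times dx * dy), then collect the results in an upper-triangular matrix.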

In [29]:
# --- pairwise overlap coefficients -------------------------------------------
classes = sorted(density.keys())

# (class_a, class_b) -> overlap coefficient, for every unordered pair with a < b
overlap_results = {}

for i, cls1 in enumerate(classes):
    for cls2 in classes[i + 1:]:
        # wherever the two densities overlap, the smaller one is the shared mass
        shared = np.minimum(density[cls1], density[cls2])

        # Riemann-sum approximation of the integral over the grid
        overlap_results[(cls1, cls2)] = np.sum(shared) * dx * dy


# --- square matrix for the heat map ------------------------------------------
classes = sorted(df['category'].unique())
n_classes = len(classes)

# start from all-NaN; only the upper triangle is filled below
overlap_matrix = np.full((n_classes, n_classes), np.nan)

for (cls1, cls2), coef in overlap_results.items():
    # row / column positions follow the sorted label order
    row, col = classes.index(cls1), classes.index(cls2)
    overlap_matrix[row, col] = coef

# The mirrored entry [col, row] and the diagonal are intentionally left as NaN:
# the matrix stays upper-triangular, and a diagonal of 1s would dominate the plot.

# labelled data frame for the heat map
tick_labels = [f"{c}" for c in classes]
df_overlap = pd.DataFrame(overlap_matrix, index=tick_labels, columns=tick_labels)

Export Overlap Matrix

In [30]:
# write the labelled overlap matrix to the 'processed' folder as CSV
overlap_csv_path = './../data/processed/df_overlap.csv'
df_overlap.to_csv(overlap_csv_path)