In [41]:
import os
import glob
import numpy as np
import pandas as pd

import sys
sys.path.append('../src')

from embedding_functions import *

from sklearn.decomposition import PCA
from scipy.stats import gaussian_kde

In [None]:
# data import
# Root folder of the training split; the loading cell below treats every
# sub-directory of this path as one category.
data_path = './../data/raw/20news-bydate/20news-bydate-train'

In [None]:
# Build the raw data frame: one row per document, labelled with the name of
# the category directory it was read from.

# get list of category directories
# (sorted, because os.listdir returns entries in arbitrary order and the row
# order of the frame should be reproducible across machines and runs)
categories = sorted(
    d for d in os.listdir(data_path)
    if os.path.isdir(os.path.join(data_path, d))
)

# create lists to store texts and corresponding categories
texts = []
labels = []

# Loop through each category directory to read the text files and assign the appropriate label.
for category in categories:
    category_path = os.path.join(data_path, category)

    # import all files in the current category folder; sorted for a stable
    # order, and restricted to regular files so that a stray sub-directory
    # does not make open() fail
    file_paths = sorted(
        p for p in glob.glob(os.path.join(category_path, '*'))
        if os.path.isfile(p)
    )

    for file_path in file_paths:
        # errors='ignore' silently drops bytes that are not valid UTF-8
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            texts.append(file.read())
        labels.append(category)

# create data frame
df = pd.DataFrame({'text': texts, 'category': labels})

In [None]:
# Generate one embedding per document.
# `get_embedding` is presumably provided by the star-import from
# `embedding_functions` -- verify against that module.
embeddings = []
failed_indices = []

for idx, text in enumerate(df['text']):
    try:
        embedding = get_embedding(text)
    except Exception as e:
        print(f"Error generating embedding for index {idx}: {e}")
        # Store an explicit placeholder. Without this reset the append below
        # would either raise NameError (failure on the first document) or
        # silently reuse the previous document's embedding for this row.
        embedding = None
        failed_indices.append(idx)

    embeddings.append(embedding)

    # if necessary ... short delay to avoid API rate limits
    # time.sleep(0.1)

# add embeddings to data frame
df['embedding'] = embeddings

# Rows without an embedding cannot be used by the PCA / KDE steps that follow,
# so drop them (loudly) instead of letting them fail or distort later cells.
if failed_indices:
    print(f"Dropping {len(failed_indices)} document(s) without an embedding: {failed_indices}")
    df = df.dropna(subset=['embedding']).reset_index(drop=True)

#### Re-Import Processed Data

In [None]:
# Optional shortcut: uncomment to reload the frame written by the export cell
# further down instead of regenerating the embeddings.
# df = pd.read_pickle('./../data/processed/train_with_embeddings.pkl')

#### Conduct Principal Component Analysis

First, convert the embeddings to a NumPy array.

In [19]:
# Collect the per-document embedding vectors and hand them to NumPy as one
# array (one row per document, assuming all embeddings have equal length).
embedding_rows = df['embedding'].tolist()
embedding_array = np.array(embedding_rows)

Next, initialize and fit PCA to reduce to 2 dimensions.

In [20]:
# Project the embeddings onto their first two principal components.
# Depending on the matrix shape and the scikit-learn version, the default
# svd_solver='auto' can select the randomized SVD solver; pinning
# random_state keeps the projection reproducible from run to run.
pca_2d = PCA(n_components=2, random_state=42)

components = pca_2d.fit_transform(embedding_array)

Add the new components as X and Y columns in the data frame.

In [21]:
# Store the first and second principal component as the 2D plot coordinates.
for axis, column in enumerate(('x_2d', 'y_2d')):
    df[column] = components[:, axis]

Next, repeat PCA but for 3 dimensions, and add those to the data frame.

In [22]:
# Same projection as above, but keeping three principal components.
# random_state is pinned because the default svd_solver='auto' may choose the
# randomized SVD solver, which would otherwise give run-to-run variation.
pca_3d = PCA(n_components=3, random_state=42)

components = pca_3d.fit_transform(embedding_array)

# one data frame column per retained component
df['x_3d'] = components[:, 0]
df['y_3d'] = components[:, 1]
df['z_3d'] = components[:, 2]

Export the final data frame.

In [23]:
# Persist the enriched data frame as a pickle in the 'processed' folder.
processed_pickle_path = './../data/processed/train_with_embeddings.pkl'
df.to_pickle(processed_pickle_path)

#### Create Overlap Matrix

##### Generate Kernel Density Estimates

Group the data by category and convert the 2D coordinates to a 2 x n array for each group.

In [24]:
# One Gaussian KDE per category, fitted on that category's 2D coordinates.
categories = df['category'].unique()

kdes = {
    label: gaussian_kde(
        # gaussian_kde expects shape (n_dims, n_points), hence the transpose
        df.loc[df['category'] == label, ['x_2d', 'y_2d']].to_numpy().T
    )
    for label in categories
}

Define a grid over the embedding space using the `x_2d` and `y_2d` columns.

In [25]:
# Regular evaluation grid covering all 2D points plus a margin of 1 on each side.
grid_size = 100  # resolution of the grid (points per axis)

x_values, y_values = df['x_2d'], df['y_2d']
xmin, xmax = x_values.min() - 1, x_values.max() + 1
ymin, ymax = y_values.min() - 1, y_values.max() + 1

xgrid, ygrid = (
    np.linspace(low, high, grid_size)
    for low, high in ((xmin, xmax), (ymin, ymax))
)
X, Y = np.meshgrid(xgrid, ygrid)

# flatten the mesh into a list of coordinates, shape: (2, grid_size^2)
grid_coords = np.vstack((X.reshape(-1), Y.reshape(-1)))

Evaluate each class's KDE on the grid.

In [26]:
# Evaluate every class KDE at all grid coordinates and fold the flat result
# back into the shape of the mesh.
density = {
    label: estimator.evaluate(grid_coords).reshape(X.shape)
    for label, estimator in kdes.items()
}

##### Compute KDE Overlap Coefficient

Determine the area element (dx * dy):

In [27]:
# Grid spacing along each axis; dx * dy is the area of one grid cell.
dx, dy = (axis[1] - axis[0] for axis in (xgrid, ygrid))

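Loop over all unique pairs of classes (in sorted label order), approximate each pair's overlap coefficient $\iint \min(f_i, f_j)\,dx\,dy$ by summing the pointwise minimum of the two density grids times the area element, and collect the results in an upper-triangular matrix.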

In [32]:
# Pairwise KDE overlap: for every unordered pair of classes, sum the pointwise
# minimum of the two density grids and scale by the cell area (dx * dy).
classes = sorted(df['category'].unique())
n_classes = len(classes)

overlap_results = {}

# Only the upper triangle is filled. The lower triangle and the diagonal stay
# NaN on purpose: mirroring is optional, and a diagonal of 1 grabs too much
# attention in the heat map.
overlap_matrix = np.full((n_classes, n_classes), np.nan)

for i, cls1 in enumerate(classes):
    for j in range(i + 1, n_classes):
        cls2 = classes[j]

        # pointwise minimum of the two densities, summed over the grid
        shared_mass = np.minimum(density[cls1], density[cls2]).sum()
        overlap_coef = shared_mass * dx * dy

        overlap_results[(cls1, cls2)] = overlap_coef
        overlap_matrix[i, j] = overlap_coef

# create DF with labels for the heat map
class_labels = [str(c) for c in classes]
df_overlap = pd.DataFrame(overlap_matrix, index=class_labels, columns=list(class_labels))

Export Overlap Matrix

In [34]:
# Write the labelled overlap matrix to the 'processed' folder as CSV.
overlap_csv_path = './../data/processed/df_overlap.csv'
df_overlap.to_csv(overlap_csv_path)

In [33]:
# Display the overlap matrix; cells that were never filled (diagonal and lower triangle) are NaN.
df_overlap

Unnamed: 0,alt.atheism,comp.graphics,comp.os.ms-windows.misc,comp.sys.ibm.pc.hardware,comp.sys.mac.hardware,comp.windows.x,misc.forsale,rec.autos,rec.motorcycles,rec.sport.baseball,rec.sport.hockey,sci.crypt,sci.electronics,sci.med,sci.space,soc.religion.christian,talk.politics.guns,talk.politics.mideast,talk.politics.misc,talk.religion.misc
alt.atheism,,0.009499,0.003026,0.000998,0.00272,0.008735,0.014343,0.087514,0.052432,0.002376,0.000302,0.188272,0.025248,0.142451,0.071542,0.681024,0.530566,0.631426,0.405994,0.780711
comp.graphics,,,0.7582,0.827906,0.832623,0.586995,0.363519,0.084485,0.075652,0.006135,0.011652,0.073598,0.729393,0.206431,0.186172,0.059276,0.024649,0.027679,0.027303,0.035288
comp.os.ms-windows.misc,,,,0.793333,0.755274,0.738008,0.30195,0.04366,0.038589,0.002044,0.008246,0.078903,0.638372,0.110876,0.107298,0.028132,0.009065,0.00808,0.009421,0.018003
comp.sys.ibm.pc.hardware,,,,,0.842571,0.639562,0.350449,0.038119,0.032079,0.003082,0.007702,0.049068,0.613764,0.105162,0.098363,0.024605,0.009145,0.010062,0.009313,0.011341
comp.sys.mac.hardware,,,,,,0.543305,0.393937,0.052412,0.0461,0.003326,0.009649,0.036186,0.629677,0.123557,0.1207,0.023265,0.011829,0.010965,0.012233,0.010648
comp.windows.x,,,,,,,0.175418,0.040766,0.039899,0.003598,0.007737,0.077755,0.502891,0.098817,0.087703,0.039692,0.017205,0.018104,0.019498,0.023434
misc.forsale,,,,,,,,0.26946,0.224251,0.019526,0.019202,0.033076,0.373095,0.155256,0.200973,0.045494,0.028905,0.031439,0.036786,0.022719
rec.autos,,,,,,,,,0.769229,0.018277,0.017279,0.033169,0.186974,0.37235,0.534378,0.100815,0.13352,0.126046,0.232715,0.069419
rec.motorcycles,,,,,,,,,,0.017237,0.014447,0.019588,0.174603,0.382149,0.54478,0.066288,0.11777,0.099922,0.281557,0.0309
rec.sport.baseball,,,,,,,,,,,0.740164,0.002779,0.010579,0.013527,0.010035,0.006324,0.007534,0.004373,0.019158,0.00326
