In [18]:
import os
import glob
import numpy as np
import pandas as pd

import sys
sys.path.append('../src')

import embedding_functions

from sklearn.decomposition import PCA
from scipy.stats import gaussian_kde

In [None]:
# data import
# Root directory of the raw training documents, relative to the notebook's working
# directory (presumably the 20 Newsgroups "bydate" training split, judging by the
# path — confirm). The loading cell expects one sub-directory per category here.
data_path = './../data/raw/20news-bydate/20news-bydate-train'

In [None]:
# Build a (text, category) data frame from a directory tree in which every
# sub-directory of `data_path` is one category and every file inside it is one document.

# get list of category directories
# sorted() because os.listdir returns entries in arbitrary order; sorting keeps the
# row order of the resulting data frame reproducible across machines and runs
categories = sorted(
    d for d in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, d))
)

# create lists to store texts and corresponding categories
texts = []
labels = []

# Loop through each category directory to read the text files and assign the appropriate label.
for category in categories:
    category_path = os.path.join(data_path, category)

    # import all text files in the current category folder
    # glob order is also unspecified, so sort it; skip anything that is not a regular
    # file, because open() would fail on a nested directory
    file_paths = sorted(
        p for p in glob.glob(os.path.join(category_path, '*')) if os.path.isfile(p)
    )

    for file_path in file_paths:
        # NOTE(review): errors='ignore' silently drops bytes that are not valid UTF-8 —
        # confirm the corpus encoding if exact text fidelity matters.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            text = file.read()
            texts.append(text)
            labels.append(category)

# create data frame (one row per document, in sorted category/file order)
df = pd.DataFrame({'text': texts, 'category': labels})

In [None]:
# Generate one embedding per document, keeping `embeddings` aligned row-for-row with df.

# Resolve the embedding function before the loop so that a missing name fails
# immediately instead of being caught (and merely printed) by the per-row handler below.
# NOTE(review): this notebook only imports the `embedding_functions` module; presumably
# `get_embedding` is meant to come from there — confirm and import it explicitly.
embed = get_embedding

embeddings = []

for idx, text in enumerate(df['text']):
    try:
        embedding = embed(text)
    except Exception as e:
        print(f"Error generating embedding for index {idx}: {e}")
        # Record the failure explicitly. Without this, the previous row's embedding
        # would be appended again for this row (or a NameError raised on the first
        # row), silently misaligning embeddings and texts.
        embedding = None

    embeddings.append(embedding)

    # if necessary ... short delay to avoid API rate limits
    # time.sleep(0.1)

# add embeddings to data frame
# rows whose embedding failed hold None and must be retried or dropped before PCA
df['embedding'] = embeddings

#### Re-Import Processed Data

In [15]:
# Load the processed data frame from disk; it is expected to contain the 'embedding'
# column that the PCA cells below read.
# This rebinds `df`, replacing any data frame built by the cells above.
# The final export cell writes back to this same path.
# NOTE(review): nothing visible in this notebook writes the pickle before this point, so on
# a fresh checkout this cell presumably relies on a previous run's export — confirm.
# Unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle('./../data/processed/train_with_embeddings.pkl')

#### Conduct Principal Component Analysis

First, convert the embeddings to a NumPy array.

In [19]:
# Collect the 'embedding' column into a single NumPy array, one entry per data-frame row.
# assumes every embedding is a sequence of the same length, so the result is 2-D — TODO confirm
embedding_array = np.array(df['embedding'].to_list())

Next, initialize and fit PCA to reduce to 2 dimensions.

In [20]:
# Project the embeddings onto their first two principal components.
# random_state is fixed because PCA's default svd_solver='auto' may select a randomized
# solver for large inputs, which would otherwise let the projection vary between runs.
pca_2d = PCA(n_components=2, random_state=42)

# projected coordinates: one row per row of embedding_array, one column per component
components = pca_2d.fit_transform(embedding_array)

Add the new components as X and Y columns in the data frame.

In [21]:
# Store the first and second projected coordinate of every row as the 2-D plot axes.
for axis, column in enumerate(('x_2d', 'y_2d')):
    df[column] = components[:, axis]

Next, repeat the PCA with 3 components and add the resulting X, Y and Z columns to the data frame.

In [22]:
# Project the embeddings onto their first three principal components.
# random_state is fixed because PCA's default svd_solver='auto' may select a randomized
# solver for large inputs, which would otherwise let the projection vary between runs.
pca_3d = PCA(n_components=3, random_state=42)

# NOTE: this rebinds `components` (previously the 2-D projection), so the cell that
# fills x_2d / y_2d should not be re-run after this one without refitting pca_2d first.
components = pca_3d.fit_transform(embedding_array)

# one data-frame column per principal component
df['x_3d'] = components[:, 0]
df['y_3d'] = components[:, 1]
df['z_3d'] = components[:, 2]

Export the final data frame.

In [23]:
# save data frame as pickle file in 'processed' folder
# This is the same path the re-import cell reads from, so the export overwrites that
# file with the data frame that now also carries the x_2d/y_2d and x_3d/y_3d/z_3d columns.
df.to_pickle('./../data/processed/train_with_embeddings.pkl')