## Imports and collecting pre-processed data
We will review the cleaned metadata, focusing primarily on the product descriptions.

In [None]:
import sys
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

# Import utils from subfolder of project, works for immediate subfolders of PROJECT_ROOT
# PROJECT_ROOT is resolved as the parent of the current working directory, so the
# notebook must be started from a folder that sits directly inside the project.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "..")) # adjust relative import as necessary
# NOTE: re-running this cell appends PROJECT_ROOT to sys.path again (a harmless duplicate entry).
sys.path.append(PROJECT_ROOT)
# Category name passed to get_metadata() later in the notebook.
SELECTED_CATEGORY = 'Video_Games'
# Must stay below the sys.path change above, otherwise `utils` cannot be resolved.
from utils.data_processing import get_metadata

In [25]:
def write_array(data: np.ndarray, num_slices: int = 5, folder: str = "description-embeddings") -> None:
    """
    Creates a set of pickle files to break down a numpy array

    The array is split along its first axis into ``num_slices`` consecutive
    chunks of ``ceil(rows / num_slices)`` rows each, and chunk ``i`` is pickled
    to ``{folder}/slice-{i}.pkl``. When the row count is not a multiple of
    ``num_slices`` the trailing chunks are shorter, and may be empty.

    Args:
        data: Array to persist; it is sliced along axis 0.
        num_slices: Number of files to write. Must be at least 1.
        folder: Output directory. It is created if it does not exist yet.

    Raises:
        ValueError: If ``num_slices`` is less than 1.
    """
    if num_slices < 1:
        raise ValueError(f"num_slices must be a positive integer, got {num_slices}")
    # On a fresh checkout the folder is missing and open() below would fail.
    os.makedirs(folder, exist_ok=True)
    rows_per_slice = math.ceil(data.shape[0] / num_slices)
    for i in range(num_slices):
        with open(f"{folder}/slice-{i}.pkl", "wb") as f:
            pickle.dump(data[rows_per_slice*i:rows_per_slice*(i+1)], f)

def read_array(num_slices: int = 5, folder: str = "description-embeddings") -> np.ndarray:
    """
    Creates a numpy array from a set of sliced pickle files

    Reads ``{folder}/slice-0.pkl`` through ``{folder}/slice-{num_slices-1}.pkl``
    in order and stacks their contents vertically.

    Args:
        num_slices: Number of slice files to read.
        folder: Directory that contains the ``slice-{i}.pkl`` files.

    Returns:
        The vertically stacked array. With exactly one slice the unpickled
        object is returned as-is (no stacking); with ``num_slices < 1`` nothing
        is read and ``None`` is returned.

    Raises:
        FileNotFoundError: If one of the expected slice files is missing.
    """
    parts = []
    for i in range(num_slices):
        with open(f"{folder}/slice-{i}.pkl", "rb") as f:
            # SECURITY: pickle.load can execute arbitrary code; only point this
            # at slice files you produced yourself.
            parts.append(pickle.load(f))
    if not parts:
        return None
    if len(parts) == 1:
        return parts[0]
    # Stack once at the end; stacking inside the loop re-copies every row
    # already accumulated on each iteration.
    return np.vstack(parts)

In [None]:
# Column-name groups by feature type (presumably columns of `meta` — verify against get_metadata).
categorical = ['store', 'categories']
continuous = ['average_rating', 'rating_number', 'price']
text_based = ['title', 'features', 'description']

# X_train, y_train, X_val, y_val, X_test, y_test = get_filtered_review_data(SELECTED_CATEGORY)
meta = get_metadata(SELECTED_CATEGORY)

# Load the cached description embeddings from the default slice files.
embeddings_array = read_array()
# The generation cell encodes `meta['description']` in row order, so the cached rows
# are expected to correspond to `meta` rows by POSITION. Fail loudly on a stale cache.
assert len(embeddings_array) == len(meta), (
    f"embedding rows ({len(embeddings_array)}) != metadata rows ({len(meta)}); "
    "regenerate the description embeddings"
)
embeddings = pd.DataFrame(embeddings_array)
# Assign by position: a plain Series assignment aligns on index labels and would
# silently produce NaN / mismatched ids if `meta` does not have a default 0..n-1 index.
embeddings['parent_asin'] = meta['parent_asin'].to_numpy()
embeddings.head()

Loading metadata from data/Video_Games_metadata.pkl


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,375,376,377,378,379,380,381,382,383,parent_asin
0,-0.005333,0.007172,-0.048593,-0.036893,-0.044139,0.030708,0.02079,0.124238,-0.079286,0.038286,...,0.022913,-0.010473,-0.112838,0.023826,0.025811,0.049128,-0.008654,-0.006006,0.027029,B000FH0MHO
1,-0.118838,0.048299,-0.002548,-0.011011,0.051951,0.010292,0.115433,0.000701,-0.085925,-0.070654,...,0.011428,0.013367,-0.012747,0.061454,0.035641,0.158746,0.126409,0.046549,-0.015717,B00069EVOG
2,-0.035121,0.052389,-0.006503,-0.148511,0.003149,-0.006528,-0.022535,-0.038059,0.031254,0.09664,...,-0.000769,-0.043425,0.06195,0.030464,-0.07627,0.143601,-0.035178,-0.015723,0.026499,B00Z9TLVK0
3,-0.118838,0.048299,-0.002548,-0.011011,0.051951,0.010292,0.115433,0.000701,-0.085925,-0.070654,...,0.011428,0.013367,-0.012747,0.061454,0.035641,0.158746,0.126409,0.046549,-0.015717,B07SZJZV88
4,-0.080119,-0.035443,-0.041519,-0.050098,0.013094,0.065544,0.045988,0.096835,-0.03141,0.050701,...,0.10498,-0.023695,-0.04914,0.01395,-0.01162,-0.039259,-0.11561,-0.000405,0.007943,B002WH4ZJG


In [None]:
# Code to generate the description embeddings from scratch.
# This writes the pickle slices that read_array() loads by default, so it only
# needs to be run when those files are missing or the metadata has changed.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

# One description per metadata row, taken in row order.
descriptions = meta.loc[:, 'description'].values
# NOTE(review): presumably every description is a plain string — confirm there are
# no NaN or list-valued entries before encoding.
desc_embeddings = model.encode(descriptions)

# Persist with write_array's default slice count and folder.
write_array(desc_embeddings)

## Tangent - Using OpenAI's text embedding models via API
This is a more expensive but potentially more accurate way to generate embeddings.
The option is available, but we'll first explore free models, which also tend to have better documentation.

In [None]:
from openai import OpenAI
import numpy as np
import os
from dotenv import load_dotenv

# Credentials are read from environment variables (load_dotenv() can populate them
# from a local .env file), so no secret is hard-coded in the notebook.
# os.getenv returns None for any variable that is not set.
load_dotenv()
API_KEY = os.getenv("API_KEY")
ORGANIZATION = os.getenv("ORGANIZATION")
PROJECT = os.getenv("PROJECT")

client = OpenAI(
  organization=ORGANIZATION,
  project=PROJECT,
  api_key=API_KEY
)

# Request an embedding for a single sample string.
response = client.embeddings.create(
    model="text-embedding-3-small", input="example string", encoding_format="float", 
)

# Keep only the first 256 components of the returned vector; the truncated vector
# is passed to normalize_l2 in the following cell.
cut_dim = response.data[0].embedding[:256]
print(cut_dim)

In [None]:
def normalize_l2(x):
    """
    Scales a vector, or each row of a multi-dimensional array, to unit L2 norm.

    Args:
        x: Array-like input. A 1-D input is treated as a single vector;
            otherwise the norm is taken along axis 1.

    Returns:
        np.ndarray with the same shape as ``x``. Vectors/rows whose norm is
        zero are left unchanged instead of being divided by zero.
    """
    x = np.array(x)
    if x.ndim == 1:
        norm = np.linalg.norm(x)
        if norm == 0:
            return x
        return x / norm
    norm = np.linalg.norm(x, 2, axis=1, keepdims=True)
    # Swap zero norms for 1 BEFORE dividing: a zero-norm row is all zeros, so
    # dividing by 1 returns it unchanged and no 0/0 RuntimeWarning is raised.
    # `norm` is a fresh array returned by np.linalg.norm, so mutating it is safe.
    norm[norm == 0] = 1
    return x / norm

# Rescale the truncated embedding (cut_dim) to unit L2 length and show the result.
norm_dim = normalize_l2(cut_dim)
print(norm_dim)

In [None]:
# Use tokenizer to assert that the API will not be rate limited
# NOTE(review): the lines visible here only load encodings and check an encode/decode
# round trip; no token counting against a rate limit is shown — confirm it follows.
import tiktoken
enc = tiktoken.get_encoding("o200k_base")
# Sanity check: encoding and then decoding returns the original text.
assert enc.decode(enc.encode("hello world")) == "hello world"

# Rebind `enc` to the encoding tiktoken associates with the embedding model requested earlier.
enc = tiktoken.encoding_for_model("text-embedding-3-small")