In [1]:
%pip install pandas chromadb openai tiktoken  sentence-transformers
import openai
import chromadb
import pandas as pd
from chromadb.utils import embedding_functions
from tiktoken import encoding_for_model
import os

Note: you may need to restart the kernel to use updated packages.


In [2]:
from dotenv import load_dotenv
import os

#  load environment variables from .env file (this is for security reasons)
load_dotenv()

# get API key from the environment variable
openai.api_key = os.getenv('OPENAI_API_KEY')

In [3]:
# import time
# # define embedding function using OpenAI text-embedding-ada-002 model
# def get_embeddings(texts):
#     try:
#         response = openai.embeddings.create(
#             model='text-embedding-ada-002',
#             input=texts
#         )
#         return [item['embedding'] for item in response['data']]
#     except openai.error.RateLimitError:
#         print("Rate limit exceeded. Waiting for 60 seconds before retrying...")
#         time.sleep(60)
#         return get_embeddings(texts)

In [4]:
# initialize the ChromaDB client
client = chromadb.Client()
# get 'text_embeddings' collection. if you don't have a collection created yet, create it
collection_name = 'text_embeddings'
if collection_name in [col.name for col in client.list_collections()]:
    collection = client.get_collection(collection_name)
else:
    collection = client.create_collection(collection_name)

###
load all the files we want to do vector embeddings for

In [5]:
data = []

In [6]:
organized_blow_molding_basics_csv = r'organized_blow_molding_basics.csv'
organized_blow_molding_basics_df = pd.read_csv(organized_blow_molding_basics_csv)

organized_blow_molding_basics_df.head()
data.append(organized_blow_molding_basics_df)
print(organized_blow_molding_basics_df)


                                   section  \
0                             Introduction   
1                                   Resins   
2                                Processes   
3                  Injection Blow Moulding   
4              Injection Blow-Mold Tooling   
5   Troubleshooting Injection Blow Molding   
6         Setup for Injection Blow Tooling   
7                   Extrusion Blow-Molding   
8                                 Formulas   
9                         Die/Weight Swell   
10                    Heat Extraction Load   
11               Metals Used in Blow Molds   
12                               Shrinkage   
13                                 Venting   
14                    Stretch Blow Molding   
15           Other Blow-Molding Operations   
16                            BIBLIOGRAPHY   
17                      GENERAL REFERENCES   

                                                 text  
0   Blow molding is defined as a plastic process w...  
1   Most ther

In [7]:
organized_pet_bottle_faults_csv = r'organized_pet_bottle_faults.csv'
organized_pet_bottle_faults_df = pd.read_csv(organized_pet_bottle_faults_csv)

organized_pet_bottle_faults_df.head()
data.append(organized_pet_bottle_faults_df)
print(organized_pet_bottle_faults_df)


                                        contents  \
0                                   BLOWN FINISH   
1   SEALING SURFACE AND NECK SUPPORT RING DAMAGE   
2                              NECK CONSTRICTION   
3                               PERPENDICULARITY   
4                                  HARD SHOULDER   
5                                  THIN SHOULDER   
6                          OPALESCENCE (TOO WARM   
7                      EXCESSIVE FILL POINT DROP   
8                        PEARLESCENCE (TOO COLD)   
9                                     FLAT SIDES   
10                          PROMINENT MOULD SEAM   
11                                  CHOKED BODYV   
12                             HEAVY BASE WEIGHT   
13                             LIGHT BASE WEIGHT   
14                         FEET NOT FULLY FORMED   
15                       HOT SIDES (TOO WARM) on   
16                                  LOW TOP LOAD   
17                     LOW BURST PRESSURE (BODY)   
18          

In [8]:
organized_stressesinpebottles_csv = r'organized_stressesinpebottles.csv'
organized_stressesinpebottles_df = pd.read_csv(organized_stressesinpebottles_csv)

organized_stressesinpebottles_df.head()
data.append(organized_stressesinpebottles_df)
print(organized_stressesinpebottles_df)


                  section                                               text
0                ABSTRACT  A commercial blow-molding grade, high-density ...
1            INTRODUCTION  Commercial bottles must satisfy performance cr...
2            EXPERIMENTAL  A commercial blow-molding grade, high-density\...
3  RESULTS AND DISCUSSION  A typical three-dimensional thickness distribu...
4             CONCLUSIONS  The results of this study suggest that, in the...
5                 FIGURES  Fig. 1. Shear viscosity behavior of Resin D.\r...
6              REFERENCES  1. H. Frank and W. Wengler, Verpack Runds, 22,...


In [9]:
dupont_table_csv = r'Dupont_Table7.csv'
dupont_table_df = pd.read_csv(dupont_table_csv)

dupont_table_df.head()

data.append(dupont_table_df)
print(dupont_table_df)

                                                Fault  \
0                                     Bubbles in melt   
1                                     Bubbles in melt   
2                                  Poor melt strength   
3                                  Poor melt strength   
4   Excessive melt temperature (significantly abov...   
5   Excessive melt temperature (significantly abov...   
6              Unmelt or "cold" appearance in parison   
7              Unmelt or "cold" appearance in parison   
8              Unmelt or "cold" appearance in parison   
9              Unmelt or "cold" appearance in parison   
10             Unmelt or "cold" appearance in parison   
11  Inside surface of parison is rough (outside ma...   
12  Inside surface of parison is rough (outside ma...   
13  Outside surface of parison is rough (inside ma...   
14  Outside surface of parison is rough (inside ma...   
15                                    Poor pinch weld   
16                             

In [10]:
print(data)
print("length", len(data))

[                                   section  \
0                             Introduction   
1                                   Resins   
2                                Processes   
3                  Injection Blow Moulding   
4              Injection Blow-Mold Tooling   
5   Troubleshooting Injection Blow Molding   
6         Setup for Injection Blow Tooling   
7                   Extrusion Blow-Molding   
8                                 Formulas   
9                         Die/Weight Swell   
10                    Heat Extraction Load   
11               Metals Used in Blow Molds   
12                               Shrinkage   
13                                 Venting   
14                    Stretch Blow Molding   
15           Other Blow-Molding Operations   
16                            BIBLIOGRAPHY   
17                      GENERAL REFERENCES   

                                                 text  
0   Blow molding is defined as a plastic process w...  
1   Most the

In [11]:
def string_df(df):
    for name, obj in globals().items():
        if obj is df:
            return name  
    # if no dataframe is found, return an error message
    return "Error: no dataframe found"

In [12]:

from sentence_transformers import SentenceTransformer

# iniatilize the model
model = SentenceTransformer('all-MiniLM-L6-v2')


# columns to check and embed
columns_to_embed = ['text', 'Fault', 'Cause', 'Solution', 'cause', 'solution', 'description']

# store embeddings and metadata
embeddings_list = []
metadata_list = []

# loop through each df in the list
for dataframe in data:
    for column in columns_to_embed:
        if column in dataframe.columns:
            # list of text data from the column
            texts = dataframe[column].dropna().tolist()  # drop NaNs to avoid errors
            
            # embeddings for the list of texts
            embeddings = model.encode(texts)
            
            # store each embedding and associated metadata
            for i, embedding in enumerate(embeddings):
                metadata = {
                    'source': string_df(dataframe),
                    'column': column,
                    'original_text': texts[i]
                }
                embeddings_list.append(embedding)
                metadata_list.append(metadata)

# turn results into a DataFrame for better visualization 
embeddings_df = pd.DataFrame(metadata_list)
embeddings_df['embedding'] = embeddings_list

# ceck results
print("Number of embeddings:", len(embeddings_list))
print("Embeddings DataFrame:\n", embeddings_df.head())



  from .autonotebook import tqdm as notebook_tqdm


Number of embeddings: 187
Embeddings DataFrame:
                              source column  \
0  organized_blow_molding_basics_df   text   
1  organized_blow_molding_basics_df   text   
2  organized_blow_molding_basics_df   text   
3  organized_blow_molding_basics_df   text   
4  organized_blow_molding_basics_df   text   

                                       original_text  \
0  Blow molding is defined as a plastic process w...   
1  Most thermoplastic resins in use in the plasti...   
2  There are three main processes used by the blo...   
3  In injection blow molding, melted plastic resi...   
4  Injection blow-molding requires two molds: one...   

                                           embedding  
0  [-0.028532026, -0.01720508, -0.01807146, -0.02...  
1  [0.012472105, -0.024527453, 0.03558379, -0.057...  
2  [0.006172009, 0.051811796, -0.01949948, -0.049...  
3  [-0.042652518, 0.0039179395, -0.049307454, -0....  
4  [0.013091768, -0.00049367774, -0.03917164, -0....  


In [13]:
collection.add(
    documents=[metadata['original_text'] for metadata in metadata_list], 
    embeddings=embeddings_list,  
    metadatas=metadata_list, 
    ids=[f"doc_{i}" for i in range(len(embeddings_list))]  # generate unique IDs based on embeddings length
)


In [14]:
# define query text, can be question or statement
query_text = "how to make chicken"

# model encodes the text to an embedding for the query
query_embedding = model.encode([query_text])

# search for items in our vector database with similarity
results = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=2  # chose 2 results of similar documents to retrieve to balance relevance vs efficiency
)

# Display the results
for doc, score, meta in zip(results['documents'], results['distances'], results['metadatas']):
    print(f"Document: {doc}\nScore: {score}\nMetadata: {meta}\n")


Document: ['1. Warm preforms rubbing against cooling shields in oven.\n2. Oven chain is too tight.', '1. Cooling shields to be adjusted to give adequate clearance.\n2. Reduce tension on oven chain.']
Score: [1.4601730108261108, 1.4629051685333252]
Metadata: [{'column': 'cause', 'original_text': '1. Warm preforms rubbing against cooling shields in oven.\n2. Oven chain is too tight.', 'source': 'organized_pet_bottle_faults_df'}, {'column': 'solution', 'original_text': '1. Cooling shields to be adjusted to give adequate clearance.\n2. Reduce tension on oven chain.', 'source': 'organized_pet_bottle_faults_df'}]

