In [1]:

# sample from comments-energy.csv file to test
from pyprojroot import here
from pathlib import Path
import pandas as pd

data_dir = Path(here("Data"))

file_name = "comments-energy.csv"
file_path = data_dir / file_name
print(file_path)
df = pd.read_csv(file_path, dtype={'ProductID': str})
df.head()


/Users/arashkhajeh/GitHub/LLM-for-Materials/Data/comments-energy.csv


Unnamed: 0,ProductID,Structures,Article Title,General Comments
0,10359,,,Decomposes at 1750 C
1,10503,,,
2,10579,,,
3,10628,,,Transition to calcite at 520. Antacid
4,10646,,A new method of X-Ray crystal analysis,


In [2]:
# sample 10 lines randomly
df_sampled = df.sample(n=10, random_state=42)
file_name = data_dir / "comments-energy-sampled.csv"
df_sampled.to_csv(file_name, index=False)


In [3]:
df_sampled

Unnamed: 0,ProductID,Structures,Article Title,General Comments
16802,40125867,,ETUDE PAR DIFFRACTION NEUTRONIQUE DES SOLUTION...,
4489,730016,,,
9809,10908296,,,
14029,40065764,,X-ray diffraction data for graphite to 20 GPa,
4509,730037,,,Long GC comments (too long) also AN comment
9651,10903422,,Antiferromagnetic Kondo lattice compound Ce2 O...,
18967,40250040,,Structure refinements of superconducting and n...,
7661,10835954,,Neutron diffraction study of Li4 Ti5 O12 at lo...,
11520,40031682,,Transition Element - Rare Earth Compounds with...,
7527,10820157,,Crystal structure of the tetragonal supercondu...,


In [None]:

import sys
from pyprojroot import here
from timeit import default_timer as Timer

# Add the root of the project (where `src/` lives) to sys.path
sys.path.append(str(here()))
sys.path.append('../..')

from src.utils.prepare_vectordb_from_csv_xlsx import PrepareVectorDBFromTabularData

start = Timer()
# prepare the vector database from the sampled CSV file
from pyprojroot import here
# Specify the path to your CSV file directory below
comments_dir = here("data/comments-energy-sampled.csv")
# Create an instance of the PrepareVectorDBFromTabularData class with the file directory
data_prep_instance = PrepareVectorDBFromTabularData(file_directory=comments_dir)
# Run the pipeline to prepare and inject the data into the vector database
data_prep_instance.run_pipeline()
end = Timer()

print(f"time/vector: {(end - start)/10:.2f}")

Environment variables are loaded: True
comments-energy-sampled.csv


Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s]

Data is stored in ChromaDB.
Number of vectors in vectordb: 10
time/vector: 0.09





In [5]:
from openai import OpenAI
import os
import chromadb
# Set the OpenAI API key
model_name = "gpt-35-turbo"
openai_api_key = os.environ["OPENAI_API_KEY"]

# instantiate the OpenAI client
client = OpenAI()

In [6]:
from src.utils.load_config import LoadConfig
cfg = LoadConfig()
print("tenant:", getattr(cfg.chroma_client, "tenant", None))
print("database:", getattr(cfg.chroma_client, "database", None))

collections = cfg.chroma_client.list_collections()
print("collections:", collections)
print("Available collections:")

tenant: default_tenant
database: default_database
collections: ['comments_energy']
Available collections:


In [7]:
from src.utils.load_config import LoadConfig

cfg = LoadConfig()

# Correct way to access the collection
vectordb = cfg.chroma_client.get_collection(name=cfg.collection_name)
print("Vector count:", vectordb.count())


Vector count: 10


In [8]:
# Define the query text and get the embeddings
query_texts = "which ProductID is related to transion elements and rare element compunds?"
response = client.embeddings.create(
        input = query_texts,
        model= "text-embedding-3-small"
    )
query_embeddings = response.data[0].embedding

In [9]:
query_embeddings

[0.00964705552905798,
 -0.008603058755397797,
 0.03028911165893078,
 0.026641732081770897,
 0.02398548647761345,
 -0.0006966859218664467,
 0.038614653050899506,
 0.017311839386820793,
 -0.01026155985891819,
 -0.056402236223220825,
 0.04051763191819191,
 0.004245365038514137,
 0.01884479634463787,
 -0.038508929312229156,
 0.004869780968874693,
 -0.010691052302718163,
 0.02500305324792862,
 -0.01182094682008028,
 -0.054129231721162796,
 0.04781239107251167,
 0.010466394014656544,
 0.0007970384322106838,
 0.005210070870816708,
 -0.006406041327863932,
 -0.005157209932804108,
 0.003402899717912078,
 -0.03322287276387215,
 0.02300756610929966,
 0.00934971496462822,
 0.006306927651166916,
 0.08077096194028854,
 -0.03782174363732338,
 0.012038996443152428,
 0.0007441778434440494,
 -0.03005123883485794,
 -0.0047574518248438835,
 0.026602085679769516,
 0.06163543462753296,
 -0.023932626470923424,
 0.007268329616636038,
 0.005973245482891798,
 0.01943947747349739,
 -0.008675741963088512,
 -0.0242

In [10]:
cfg.chroma_client.list_collections()

['comments_energy']

In [11]:
results = vectordb.query(
    query_embeddings = query_embeddings,
    n_results=1 #top_k
)

results

{'ids': [['id8']],
 'embeddings': None,
 'documents': [['ProductID: 040031682,\nStructures: nan,\nArticle Title: Transition Element - Rare Earth Compounds with the Cu5Ca Structure,\nGeneral Comments: nan']],
 'uris': None,
 'data': None,
 'metadatas': [[{'source': 'comments-energy-sampled'}]],
 'distances': [[0.7565889914030881]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [17]:
import pprint
pprint.pprint(results)

{'data': None,
 'distances': [[0.7565889914030881]],
 'documents': [['ProductID: 040031682,\n'
                'Structures: nan,\n'
                'Article Title: Transition Element - Rare Earth Compounds with '
                'the Cu5Ca Structure,\n'
                'General Comments: nan']],
 'embeddings': None,
 'ids': [['id8']],
 'included': [<IncludeEnum.distances: 'distances'>,
              <IncludeEnum.documents: 'documents'>,
              <IncludeEnum.metadatas: 'metadatas'>],
 'metadatas': [[{'source': 'comments-energy-sampled'}]],
 'uris': None}


In [18]:
### Pass the results to the OpenAI LLM
system_role = "You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
prompt = f"User's question: {query_texts} \n\n Search results:\n {results}"

messages = [
    {"role": "system", "content": str(
        system_role
        )},
    {"role": "user", "content": prompt}
]

In [19]:
response = client.chat.completions.create(
    model=os.getenv("gpt_deployment_name"),
    messages=messages
)

In [20]:
response.choices[0].message.content

'The ProductID related to transition elements and rare earth compounds is ProductID: 040031682. This product is associated with compounds with the Cu5Ca structure.'

In [21]:
df_sampled[df_sampled['ProductID'] == '040031682']

Unnamed: 0,ProductID,Structures,Article Title,General Comments
11520,40031682,,Transition Element - Rare Earth Compounds with...,


In [22]:
df_sampled[df_sampled['ProductID'] == '040031682'].to_dict(orient="records")

[{'ProductID': '040031682',
  'Structures': nan,
  'Article Title': 'Transition Element - Rare Earth Compounds with the Cu5Ca Structure',
  'General Comments': nan}]

### Make vector db for the whole dataset

In [None]:
import sys
from pyprojroot import here
from timeit import default_timer as Timer

# Add the root of the project (where `src/` lives) to sys.path
sys.path.append(str(here()))
sys.path.append('../..')
from src.utils.load_config import LoadConfig

cfg = LoadConfig()
import sys
from pyprojroot import here
from timeit import default_timer as Timer

from src.utils.prepare_vectordb_from_csv_xlsx import PrepareVectorDBFromTabularData

start = Timer()
# prepare the vector database from the whole comments CSV file
from pyprojroot import here
# Specify the path to your CSV file directory below
comments_dir = here("data/comments-energy.csv")
# Create an instance of the PrepareVectorDBFromTabularData class with the file directory
data_prep_instance = PrepareVectorDBFromTabularData(file_directory=comments_dir)
# Run the pipeline to prepare and inject the data into the vector database
data_prep_instance.run_pipeline()
end = Timer()

print(f"time/vector: {(end - start)/10:.2f}")

comments-energy.csv


Generating embeddings: 100%|██████████| 197/197 [03:01<00:00,  1.09it/s]


Data is stored in ChromaDB.
Number of vectors in vectordb: 19695
time/vector: 19.32


In [88]:
cfg.collection_name

'comments_energy'

In [89]:
from openai import OpenAI
import os
import chromadb
# Set the OpenAI API key
model_name = "gpt-35-turbo"
openai_api_key = os.environ["OPENAI_API_KEY"]

# instantiate the OpenAI client
client = OpenAI()

In [90]:
from src.utils.load_config import LoadConfig
cfg = LoadConfig()
print("tenant:", getattr(cfg.chroma_client, "tenant", None))
print("database:", getattr(cfg.chroma_client, "database", None))

collections = cfg.chroma_client.list_collections()
print("collections:", collections)
print("Available collections:")

cfg.chroma_client.list_collections()

tenant: default_tenant
database: default_database
collections: ['comments_energy']
Available collections:


['comments_energy']

In [91]:
import sys
from pyprojroot import here
# Add the root of the project (where `src/` lives) to sys.path
sys.path.append(str(here()))
from src.utils.load_config import LoadConfig

cfg = LoadConfig()

# Correct way to access the collection
vectordb = cfg.chroma_client.get_collection(name=cfg.collection_name)
print("Vector count:", vectordb.count())

Vector count: 19695


In [92]:
# Define the query text and get the embeddings
query_texts = "which ProductID is related to transion elements and rare element compunds?"
response = client.embeddings.create(
        input = query_texts,
        model= "text-embedding-3-small"
    )
query_embeddings = response.data[0].embedding
print("query_embeddings:", query_embeddings)

query_embeddings: [0.009640364907681942, -0.008556732907891273, 0.030288850888609886, 0.02666793391108513, 0.02397206611931324, -0.0007020485354587436, 0.038587890565395355, 0.01732490584254265, -0.01025486458092928, -0.056401751935482025, 0.040490854531526566, 0.004242024850100279, 0.018831418827176094, -0.038482170552015305, 0.004846612922847271, -0.010710783302783966, 0.025002839043736458, -0.01183406077325344, -0.05412876978516579, 0.04778555408120155, 0.01046630460768938, 0.0007677107350900769, 0.005206722300499678, -0.006429112516343594, -0.005110913421958685, 0.0034160856157541275, -0.033222589641809464, 0.02303379960358143, 0.009316597133874893, 0.006310177501291037, 0.08077026903629303, -0.037794988602399826, 0.01200585626065731, 0.0007321953307837248, -0.030050981789827347, -0.0048036640509963036, 0.02660185843706131, 0.06168776750564575, -0.023945637047290802, 0.007294697221368551, 0.005963282659649849, 0.019465740770101547, -0.00870209839195013, -0.024223152548074722, -0.04

In [107]:
results = vectordb.query(
    query_embeddings = query_embeddings,
    n_results=100 #top_k
)

results

{'ids': [['id11520',
   'id11523',
   'id11522',
   'id11521',
   'id12379',
   'id12380',
   'id12378',
   'id13233',
   'id11516',
   'id10783',
   'id10207',
   'id10552',
   'id10233',
   'id12398',
   'id12439',
   'id12440',
   'id10551',
   'id10234',
   'id10553',
   'id10550',
   'id14158',
   'id10547',
   'id10222',
   'id12314',
   'id10548',
   'id10549',
   'id12305',
   'id13545',
   'id12397',
   'id12310',
   'id11412',
   'id13547',
   'id10300',
   'id12312',
   'id12313',
   'id13548',
   'id12308',
   'id12316',
   'id13551',
   'id13549',
   'id12307',
   'id13546',
   'id12319',
   'id14156',
   'id12315',
   'id12311',
   'id12304',
   'id12320',
   'id10158',
   'id12509',
   'id12120',
   'id12512',
   'id12318',
   'id12309',
   'id12317',
   'id8475',
   'id12306',
   'id12303',
   'id12511',
   'id12508',
   'id8476',
   'id12510',
   'id14684',
   'id11833',
   'id14162',
   'id14157',
   'id11832',
   'id14161',
   'id11416',
   'id11834',
   'id14159',
 

In [108]:
results["distances"]

[[0.7563989758491516,
  0.7598292231559753,
  0.7666701078414917,
  0.7706685662269592,
  0.8059663772583008,
  0.8067013025283813,
  0.8106697797775269,
  0.8178216218948364,
  0.8291753530502319,
  0.8344771862030029,
  0.8389734625816345,
  0.8444183468818665,
  0.8458280563354492,
  0.8462835550308228,
  0.8463038206100464,
  0.8470557928085327,
  0.8477517366409302,
  0.8491440415382385,
  0.8502905368804932,
  0.8506864905357361,
  0.8512499332427979,
  0.8515778183937073,
  0.8517701625823975,
  0.8527829051017761,
  0.8530617952346802,
  0.8532881140708923,
  0.853346586227417,
  0.8545611500740051,
  0.855506181716919,
  0.8568996787071228,
  0.8574472665786743,
  0.8583368062973022,
  0.8591820597648621,
  0.8593083620071411,
  0.8593249320983887,
  0.859732985496521,
  0.859915554523468,
  0.8601982593536377,
  0.860630452632904,
  0.8606844544410706,
  0.8609050512313843,
  0.8615474104881287,
  0.8616602420806885,
  0.8620018362998962,
  0.8621731996536255,
  0.86218392848

In [109]:
# For example, let's assume a cutoff similarity score of 0.8
SIMILARITY_CUTOFF = 0.8

# If results["distances"] are similarity scores:
filtered = [
    (doc, score)
    for doc, score in zip(results["documents"][0], results["distances"][0])
    if score >= SIMILARITY_CUTOFF
]

# Access the filtered docs and their scores
for doc, score in filtered:
    print(f"Score: {score}\nContent: {doc}\n---")

Score: 0.8059663772583008
Content: ProductID: 040040587,
Structures: nan,
Article Title: RARE-EARTH COBALT INTERMETALLIC COMPOUNDS,
General Comments: nan
---
Score: 0.8067013025283813
Content: ProductID: 040040592,
Structures: nan,
Article Title: RARE-EARTH COBALT INTERMETALLIC COMPOUNDS,
General Comments: nan
---
Score: 0.8106697797775269
Content: ProductID: 040040584,
Structures: nan,
Article Title: RARE-EARTH COBALT INTERMETALLIC COMPOUNDS,
General Comments: nan
---
Score: 0.8178216218948364
Content: ProductID: 040055049,
Structures: nan,
Article Title: Rare-Earth (and Yttrium)-Iridium and -Platinum Compounds with the Fe3C Structure Type,
General Comments: nan
---
Score: 0.8291753530502319
Content: ProductID: 040031599,
Structures: nan,
Article Title: Rare Earth Cobalt Compounds with the AB3 Structure,
General Comments: nan
---
Score: 0.8344771862030029
Content: ProductID: 040017233,
Structures: nan,
Article Title: THE PALLADIUM-RUTHENIUM SYSTEM,
General Comments: nan
---
Score: 0.8

In [110]:
### Pass the results to the OpenAI LLM
system_role = "You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
prompt = f"User's question: {query_texts} \n\n Search results:\n {results}"

messages = [
    {"role": "system", "content": str(
        system_role
        )},
    {"role": "user", "content": prompt}
]

response = client.chat.completions.create(
    model=os.getenv("gpt_deployment_name"),
    messages=messages
)

print(response.choices[0].message.content)

The ProductIDs related to transition elements and rare earth compounds are:
- ProductID: 040031682
- ProductID: 040031685
- ProductID: 040031684
- ProductID: 040031683

These ProductIDs are associated with Transition Element - Rare Earth Compounds with the Cu5Ca Structure.


In [111]:
### Pass the results to the OpenAI LLM
system_role = "You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
prompt = f"User's question: {query_texts} \n\n Search results:\n {filtered}"

messages = [
    {"role": "system", "content": str(
        system_role
        )},
    {"role": "user", "content": prompt}
]

response = client.chat.completions.create(
    model="gpt-4",
    messages=messages
)

print(response.choices[0].message.content)

The ProductIDs related to transition elements and rare element compounds are: 040040587, 040040592, 040040584, 040055049, 040031599, 040017233, 040010051, 040013463, 040010430, 040040672, 040041550, 040041551, 040013455, 040010435, 040013465, 040013441, 040064364, 040013392, 040010213, 040043255, 040013400, 040013440, 040043217, 040061607, 040040652, 040043240, 040030435, 040061609, 040011650, 040043247, 040043252, 040061611, 040043227, 040043257, 040061619, 040061612, 040043225, 040061608, 040043279, 040064362, 040043256, 040043241, 040043216, 040043281, 030655942, 040045970, 040040559, 040045974, 040043273, 040043239, 040043266, 010854568, 040043221, 040043215, 040045972, 040045967, 010854587, 040045971, 040071038, 040035729, 040064369, 040035727, 040064365, 040064366, 040030565, 040035736, 040064363, 040030635, 040040522, 040025185, 010715811, 040030805, 040052398, 040052400, 040052393, 040010849, 040133543, 040040549, 040052395, 040052394, 040052396, 040013212, 040040527, 040040531

In [112]:
### Pass the results to the OpenAI LLM / with gpt-4 model
system_role = "You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
prompt = f"User's question: {query_texts} \n\n Search results:\n {results}"

messages = [
    {"role": "system", "content": str(
        system_role
        )},
    {"role": "user", "content": prompt}
]

response = client.chat.completions.create(
    model="gpt-4",
    messages=messages
)

print(response.choices[0].message.content)

The ProductIDs related to transition elements and rare element compounds are 040031682, 040031685, 040031684, and 040031683.


In [31]:
# exploring a few random samples
from pathlib import Path
import pandas as pd

data_dir = Path(here("data"))

file_path = data_dir / "comments-energy.csv"
print(file_path)
# Read the CSV file into a DataFrame
if file_path.is_file():
    df = pd.read_csv(file_path, dtype={'ProductID': str})
    df.sample(100, random_state=42)
else:
    print(f"File {file_path} does not exist.")
    df = pd.DataFrame()

/Users/arashkhajeh/GitHub/LLM-for-Materials/data/comments-energy.csv


In [34]:
# sample from the rows with general comments
df_sampled = df[df['General Comments'].notna()].sample(100, random_state=42).to_dict(orient="records")
df_sampled

[{'ProductID': '000700138',
  'Structures': nan,
  'Article Title': 'New oxide-ion conductor Sr Yb In O4 with partially cation-disordered Ca Fe2 O4-type structure',
  'General Comments': 'Ceramic oxide ion conductors have attracted attention owing to their many applications in solid oxide fuel cells, batteries, catalysts, gas sensors and oxygen separation membrances'},
 {'ProductID': '000240053',
  'Structures': nan,
  'Article Title': nan,
  'General Comments': 'Reported from Chile by Groth, Z. Kristallogr., 6 195 (1881)'},
 {'ProductID': '000630174',
  'Structures': nan,
  'Article Title': 'Lithium distribution in aluminum-free cubic Li7 La3 Zr2 O12',
  'General Comments': 'Investigated for use in Li-ion batteries'},
 {'ProductID': '000360909',
  'Structures': nan,
  'Article Title': nan,
  'General Comments': 'Data were taken from a single-crystal, which was grown by directional solidification'},
 {'ProductID': '000050570',
  'Structures': nan,
  'Article Title': nan,
  'General Com

# RAG tool design using LangChain

In [73]:
from langchain_core.tools import tool
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

@tool
def look_up_comments_table(query:str):
    """
    Look up the comments table for a specific query. Input should be a search query.
    """
    vectordb = Chroma(
    collection_name="comments_energy",
    embedding_function=OpenAIEmbeddings(model="text-embedding-3-small"),
    persist_directory=str(here(data_dir) / "chroma"))

    docs = vectordb.similarity_search(query, k=5)
    return "\n\n".join([doc.page_content for doc in docs])

In [75]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

query_text = "which ProductID is related to transition elements and rare element compounds?"
res = look_up_comments_table(query_text)
print(res)


ProductID: 040031682,
Structures: nan,
Article Title: Transition Element - Rare Earth Compounds with the Cu5Ca Structure,
General Comments: nan

ProductID: 040031684,
Structures: nan,
Article Title: Transition Element - Rare Earth Compounds with the Cu5Ca Structure,
General Comments: nan

ProductID: 040031685,
Structures: nan,
Article Title: Transition Element - Rare Earth Compounds with the Cu5Ca Structure,
General Comments: nan

ProductID: 040031683,
Structures: nan,
Article Title: Transition Element - Rare Earth Compounds with the Cu5Ca Structure,
General Comments: nan

ProductID: 040041551,
Structures: nan,
Article Title: Laves Phases of the Rare Earths with Transition Elements,
General Comments: nan


In [84]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

query_text = "which entries have the transformation temperatures?"
res = look_up_comments_table(query_text)
print(res)


ProductID: 040035869,
Structures: nan,
Article Title: SUPERCONDUCTING TRANSITION TEMPERATURES OF THE SYSTEM V1-xTixHy,
General Comments: nan

ProductID: 000060612,
Structures: High temperature structure,
Article Title: nan,
General Comments: Transformation to hexagonal low temperature phase (Mg Zn2 type) occurs at 900-994° (Rostocker, J. Met., 5 304 (1953)

ProductID: 040060288,
Structures: nan,
Article Title: TEMPERATURES OF TRANSITION TO THE SUPERCONDUCTING STATE OF V3Ge, V3Al, AND SOLID SOLUTIONS OF V3GexAl1-x AND V3GexGa1-x,
General Comments: nan

ProductID: 010892924,
Structures: nan,
Article Title: High temperature allotropy and thermal expansion of the rare earth metals,
General Comments: Stable up to 1535 K

ProductID: 000430218,
Structures: nan,
Article Title: nan,
General Comments: High temperature modification. The temperature of β to α transition is near 1223 K


In [85]:
results = vectordb.query(
    query_embeddings = query_embeddings,
    n_results=100 #top_k
)

results

AttributeError: 'Chroma' object has no attribute 'query'