In [2]:
import os
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings
from dotenv import load_dotenv

# Load the environment variables from the specified .env file
dotenv_path = '/home/zihan/Desktop/Manufacturing_QA/Experimental_and_Test/GraphRag/.env'
load_dotenv(dotenv_path)

# Load the OpenAI API key from the environment variable
api_key = os.getenv('GRAPHRAG_API_KEY')
assert api_key is not None, "OpenAI API key not found in environment variables."

In [2]:
required_exts = [".txt"]

documents = SimpleDirectoryReader("/home/zihan/Desktop/Manufacturing_QA/Experimental_and_Test/GraphRag/input",
                                    required_exts=required_exts,
                                    recursive=True).load_data()
print(f"Loaded {len(documents)} docs")

Loaded 35 docs


Since this is a single file uploaded, so we define the chunk size to make files smaller

In [4]:
os.environ["OPENAI_API_KEY"] = api_key

# # Build this from documents to create the index
index = VectorStoreIndex.from_documents(documents, chunk_size=600, chunk_overlap=100)
index.storage_context.persist() # Save the index to disk
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7a9963007290>

In [3]:
# ## To reload you can run 
from llama_index.core import StorageContext, load_index_from_storage

os.environ["OPENAI_API_KEY"] = api_key

# rebuild storage context
storage_context = StorageContext.from_defaults(persist_dir="./storage")
# load index
index = load_index_from_storage(storage_context)

## Single result test

In [4]:
retriever = index.as_retriever(similarity_top_k=10)

In [5]:
results = retriever.retrieve("noncrystalline structure")
for result in results:
    print("#" * 100)
    print(result.text)

####################################################################################################
An amorphous material exhibits quite different behavior than that of a pure metal when it changes from solid to liquid, as shown in Figure 2.15. The process is again reversible, but observe the behavior of the amorphous material during cooling from the liquid state, rather than during melting from the solid, as before. Glass (silica, SiO2) is used to illustrate. At high temperatures, glass is a true liquid, and the molecules are free to move about as in the usual definition of a liquid. As the glass cools, it gradually transforms into the solid state, going through a transition phase, called a supercooled liquid, before finally becoming rigid. It does not show the sudden volumetric change that is characteristic of crystalline materials; instead it passes through its melting temperature Tm without a change in its thermal expansion slope. In this supercooled liquid region, the material be

In [14]:
llm = OpenAI(model="gpt-4o")
query_engine = index.as_query_engine(llm=llm, similarity_top_k=5)

In [25]:
from llama_index.core import PromptTemplate


new_summary_tmpl_str = (
    """You are an top student in manufacturing major. You are in an exam and you need to answering the following MCQ based on the context provided and what you know regarding manufacturing process and materials correctly. 
Please think step by step then provide the choose the correct answer(s) carefully as yout finnal answer, remember you need to provide the letter choice. for the context choice will result a zero even it is correct:\n

You must provide answer in the following format:

"Explanation": "The capital of France is Paris, which is a major European city and a global center for art, fashion, and culture.", "YourChoice": "A"

{context_str}\n
Answer the following question:
Query: {query_str}\n
Answer: """
)

new_summary_tmpl = PromptTemplate(new_summary_tmpl_str)

query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": new_summary_tmpl}
)

prompts_dict = query_engine.get_prompts()
print(prompts_dict['response_synthesizer:text_qa_template'].template)

You are an top student in manufacturing major. You are in an exam and you need to answering the following MCQ based on the context provided and what you know regarding manufacturing process and materials correctly. 
Please think step by step then provide the choose the correct answer(s) carefully as yout finnal answer, remember you need to provide the letter choice. for the context choice will result a zero even it is correct:


You must provide answer in the following format:

"Explanation": "The capital of France is Paris, which is a major European city and a global center for art, fashion, and culture.", "YourChoice": "A"

{context_str}

Answer the following question:
Query: {query_str}

Answer: 


In [26]:
## Single Test
result = query_engine.query("Approximately how many different elements have been identified (one answer)? (a) 10, (b) 50, (c) 100, (d) 200, or (e) 500.")
print(result.response)

"Explanation": "The text states that there are slightly more than 100 elements, not counting a few extras that have been artificially synthesized. This indicates that the number of identified elements is around 100.", "YourChoice": "c"


In [37]:
import pandas as pd
import glob

# Define the file path pattern to match all CSV files ending with -MCQ.csv
csv_file_pattern = '/home/zihan/Desktop/Manufacturing_QA/QA_Pairs/*-MCQ.csv'
# csv_file_pattern = '/home/zihan/Desktop/Manufacturing_QA/QA_Pairs/ch02-MCQ.csv'

# Use glob to find all files matching the pattern
csv_files = glob.glob(csv_file_pattern)

# Read all matching CSV files into a list of DataFrames
dfs = [pd.read_csv(file) for file in csv_files]

# Concatenate all DataFrames into one
df = pd.concat(dfs, ignore_index=True)

# Display the combined DataFrame
df


Unnamed: 0,Question,Answer
0,Reasons why workparts must be cleaned include ...,"Answer. (a), (c), (d), and (e)."
1,Which of the following chemicals is associated...,"Answer. (a), (b)."
2,Shot peening is a mechanical cleaning method u...,Answer. (b) Principal function is to cold work...
3,"In sand blasting, which one of the following a...",Answer. (e)
4,"The abrasive media used in mass finishing, suc...","Answer. (a), (b), (c), (d), and (e)."
...,...,...
457,Steel cutting grades of cemented carbide are t...,Answer. (c) and (d).
458,If you had to select a cemented carbide for an...,Answer. (d)
459,Which of the following processes are used to p...,Answer. (a) and (c).
460,Which of the following materials has the highe...,Answer. (b)


In [38]:
questions = df['Question'].tolist()
results = []
for question in questions:
    result = query_engine.query(question)
    print(result)
    results.append(result.response)

Outputs_df = pd.DataFrame(results, columns=['llamaindex'])
Outputs_df

"Explanation": "Based on the provided text, the reasons why workparts must be cleaned include: (a) to prepare the surface for subsequent industrial processing, such as a coating application or adhesive bonding; (b) to improve hygiene conditions for workers and customers; (c) to remove contaminants that might chemically react with the surface; and (d) to enhance appearance and performance of the product.", "YourChoice": "A, C, D, E"
"Explanation": "Alkaline cleaning solutions consist of low-cost, water-soluble salts such as sodium and potassium hydroxide (NaOH, KOH), sodium carbonate (Na2CO3), borax (Na2B4O7), phosphates and silicates of sodium and potassium. Sulfuric acid and trichlorethylene are not used in alkaline cleaning; sulfuric acid is used in acid cleaning, and trichlorethylene is used in solvent cleaning.", "YourChoice": "A, B"
"Explanation": "Shot peening is a mechanical process that involves bombarding a metallic surface with small spherical media (shot) to induce compressi

Unnamed: 0,llamaindex
0,"""Explanation"": ""Based on the provided text, th..."
1,"""Explanation"": ""Alkaline cleaning solutions co..."
2,"""Explanation"": ""Shot peening is a mechanical p..."
3,"""Explanation"": ""Sand blasting, also known as a..."
4,"""Explanation"": ""The abrasive media used in mas..."
...,...
457,"""Explanation"": ""Steel-cutting grades of cement..."
458,"""Explanation"": ""For finish turning of steel, a..."
459,"""Explanation"": ""The text mentions that both ch..."
460,"""Explanation"": ""Based on the provided context ..."


In [39]:
results

['"Explanation": "Based on the provided text, the reasons why workparts must be cleaned include: (a) to prepare the surface for subsequent industrial processing, such as a coating application or adhesive bonding; (b) to improve hygiene conditions for workers and customers; (c) to remove contaminants that might chemically react with the surface; and (d) to enhance appearance and performance of the product.", "YourChoice": "A, C, D, E"',
 '"Explanation": "Alkaline cleaning solutions consist of low-cost, water-soluble salts such as sodium and potassium hydroxide (NaOH, KOH), sodium carbonate (Na2CO3), borax (Na2B4O7), phosphates and silicates of sodium and potassium. Sulfuric acid and trichlorethylene are not used in alkaline cleaning; sulfuric acid is used in acid cleaning, and trichlorethylene is used in solvent cleaning.", "YourChoice": "A, B"',
 '"Explanation": "Shot peening is a mechanical process that involves bombarding a metallic surface with small spherical media (shot) to induce

In [44]:
import re
df = Outputs_df

# Function to parse the Explanation and YourChoice
def parse_row(row):
    explanation_match = re.search(r'"Explanation": "(.*?)"', row)
    choice_match = re.search(r'"YourChoice": "(.*?)"', row)
    
    explanation = explanation_match.group(1) if explanation_match else ""
    choice = choice_match.group(1) if choice_match else ""
    
    return pd.Series([explanation, choice])

# Apply the parsing function to each row
df[['Explanation', 'YourChoice']] = df['llamaindex'].apply(parse_row)

In [45]:
df

Unnamed: 0,llamaindex,Explanation,YourChoice
0,"""Explanation"": ""Based on the provided text, th...","Based on the provided text, the reasons why wo...","A, C, D, E"
1,"""Explanation"": ""Alkaline cleaning solutions co...",Alkaline cleaning solutions consist of low-cos...,"A, B"
2,"""Explanation"": ""Shot peening is a mechanical p...",Shot peening is a mechanical process that invo...,B
3,"""Explanation"": ""Sand blasting, also known as a...","Sand blasting, also known as abrasive blasting...",E
4,"""Explanation"": ""The abrasive media used in mas...","The abrasive media used in mass finishing, suc...","A, B, D, E"
...,...,...,...
457,"""Explanation"": ""Steel-cutting grades of cement...",Steel-cutting grades of cemented carbide are t...,"A, C, D, E"
458,"""Explanation"": ""For finish turning of steel, a...","For finish turning of steel, a cemented carbid...",D
459,"""Explanation"": ""The text mentions that both ch...",The text mentions that both chemical vapor dep...,"A, C"
460,"""Explanation"": ""Based on the provided context ...",Based on the provided context and general know...,B


In [46]:
csv_file_path = '/home/zihan/Desktop/Manufacturing_QA/Test/llamaindex.csv'
df.to_csv(csv_file_path, index=False)

In [None]:
# Parse the results manually
parsed_results = []
for item in results:
    # Split the string into "Explanation" and "YourChoice" parts
    explanation_part = item.split('"YourChoice":')[0].strip().replace('"Explanation":', '').strip().strip('"')
    your_choice_part = item.split('"YourChoice":')[1].strip().strip().strip('"')
    
    # Reconstruct the dictionary
    parsed_dict = {
        "Explanation": explanation_part,
        "YourChoice": your_choice_part
    }
    parsed_results.append(parsed_dict)

# Convert the parsed results into a DataFrame
Outputs_df = pd.DataFrame(parsed_results)

# Display the DataFrame
print(Outputs_df)

In [42]:
df = pd.concat([df, Outputs_df], axis=1)
df

Unnamed: 0,Question,Answer,llamaindex
0,Reasons why workparts must be cleaned include ...,"Answer. (a), (c), (d), and (e).","""Explanation"": ""Based on the provided text, th..."
1,Which of the following chemicals is associated...,"Answer. (a), (b).","""Explanation"": ""Alkaline cleaning solutions co..."
2,Shot peening is a mechanical cleaning method u...,Answer. (b) Principal function is to cold work...,"""Explanation"": ""Shot peening is a mechanical p..."
3,"In sand blasting, which one of the following a...",Answer. (e),"""Explanation"": ""Sand blasting, also known as a..."
4,"The abrasive media used in mass finishing, suc...","Answer. (a), (b), (c), (d), and (e).","""Explanation"": ""The abrasive media used in mas..."
...,...,...,...
457,Steel cutting grades of cemented carbide are t...,Answer. (c) and (d).,"""Explanation"": ""Steel-cutting grades of cement..."
458,If you had to select a cemented carbide for an...,Answer. (d),"""Explanation"": ""For finish turning of steel, a..."
459,Which of the following processes are used to p...,Answer. (a) and (c).,"""Explanation"": ""The text mentions that both ch..."
460,Which of the following materials has the highe...,Answer. (b),"""Explanation"": ""Based on the provided context ..."


In [43]:
csv_file_path = '/home/zihan/Desktop/Manufacturing_QA/Test/all.csv'
df.to_csv(csv_file_path, index=False)