# RAG  based on mistral 7b

## Loading the model

In [1]:
!pip install -q torch datasets
!pip install -q accelerate==0.21.0 \
                peft==0.4.0 \
                bitsandbytes==0.43.1\
                transformers==4.40.1\
                bitsandbytes==0.22.2
                trl==0.4.7



In [1]:
import transformers
import huggingface_hub
import bitsandbytes

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
#################################################################
# Tokenizer（pip install --user transformers==4.34.0）
#################################################################
model_name='mistralai/Mistral-7B-Instruct-v0.1'

model_config = transformers.AutoConfig.from_pretrained(
    model_name,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [4]:
#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

In [5]:
from transformers import BitsAndBytesConfig
import torch
#################################################################
# Set up quantization config
#################################################################
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

Your GPU supports bfloat16: accelerate training with bf16=True


In [6]:
import torch

print("Torch version:",torch.__version__)

print("Is CUDA enabled?",torch.cuda.is_available())

Torch version: 2.1.0+cu121
Is CUDA enabled? True


In [7]:
from transformers import AutoModelForCausalLM
#################################################################
# Load pre-trained config
#################################################################
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## RAG based on mistral

#### Create LLM Chain

In [8]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.chains import LLMChain
import transformers
text_generation_pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.1,
    repetition_penalty=1.2,
    return_full_text=True,
    max_new_tokens=700,
)
prompt_template = """
### [INST] 
Instruction: 
Extract and answer using key information from context. Ensure the response is concise, without duplicates, focusing solely on crucial details. 
             
           

### Example
Context: The sun is a star in the center of our solar system.
Question: What is the sun?
Answer: A star at the center of the solar system.

Context: Neil Armstrong was the first person to walk on the moon.
Question: Who was the first person to walk on the moon?
Answer: Neil Armstrong.



### CONTEXT
{context}

### QUESTION:
{question} 

[/INST]
 """

mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Create prompt from prompt template 
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Create llm chain 
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

In [9]:
import pandas as pd

# Assume llm_chain is already defined

def process_dataframe(file_path, output_file_path):
    # Load data from Excel file
    df = pd.read_excel(file_path)

    # Iterate through each row of the DataFrame
    for index, row in df.iterrows():
        # Extract context and question from each row, ensuring they are of string type
        context = str(row['Context'])
        question = str(row['Competency Question'])

        # Call the llm_chain method to get the result
        result = llm_chain.invoke({"context": context, "question": question})

        # Save the result in the 'Predicted Answer' column
        df.at[index, 'Predicted Answer'] = result

        # Print the obtained answer
        print(f"Answer for row {index + 1}: {result}")

        # Print the current row being processed to provide progress feedback
        print(f"Processed {index + 1} / {len(df)} rows")

    # Write the modified DataFrame back to a new Excel file
    df.to_excel(output_file_path, index=False)



# Processing the file
process_dataframe('C:/Users/crown/Desktop/RAG mistral 7b v1/train_dataset.xlsx', 
                  'C:/Users/crown/Desktop/RAG mistral 7b v1/v1_training_result.xlsx')  
### These are the input file and out put file




Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 1 的答案: {'context': 'Materials Design Ontology (MDO) has related Project Open Databases Integration for Materials Design (OPTIMADE); Materials Design Ontology (MDO) has documentation w3id.org/mdo/full/1.0/; Materials Design Ontology (MDO) has description "MDO is an ontology for materials design field, representing the domain knowledge specifically related to solid-state physics and computational materials science."; Open Materials Database has description "The Open Materials Database is a publicly available, searchable database for information on the properties of materials created by people who work in computational materials design research. "; PD Dr. habil.  Thomas Hammerschmidt has expertise in computational materials science; Dr Sarath Menon has expertise in computational materials science; Calphy has discipline computational materials science; Melting temperature computational workflow has discipline computational materials science; Pyscal has discipline computational materials 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 2 的答案: {'context': 'Elemental Multiperspective Material Ontology (EMMO) has related Project EMMC-CSA (2016-2019); Elemental Multiperspective Material Ontology (EMMO) has related Project SimDOME (2019-2023); Elemental Multiperspective Material Ontology (EMMO) has related Project MarketPlace (2018-2022); Elemental Multiperspective Material Ontology (EMMO) has related Project VIMMP (2018-2021); Elemental Multiperspective Material Ontology (EMMO) has related Project OntoTrans (2020-2024); Elemental Multiperspective Material Ontology (EMMO) has related Project ReaxPro (2019-2023); Elemental Multiperspective Material Ontology (EMMO) has related Project OntoCommons (2020-2023); Elemental Multiperspective Material Ontology (EMMO) has related Project OYSTER (2017-2021); Elemental Multiperspective Material Ontology (EMMO) has related Project NanoMECommons (2021-2025); Elemental Multiperspective Material Ontology (EMMO) has related Project OpenModel (2021-2025); Elemental Multiperspective Mater

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 3 的答案: {'context': 'datasets has contributor Fernández; datasets has contributor Prof. Felix Fritzen; datasets has contributor Oliver Weeger; datasets has contributor Frederik Larsson; datasets has description "This dataset was used in the work "Material modeling for parametric, anisotropic finite hyperelasticity based on machine learning with application in optimization of metamaterials" by M. Fernández, F. Fritzen and O. Weeger"; datasets has description "This dataset was used in the work "On-the-Fly Adaptivity for Nonlinear Twoscale Simulations Using Artificial Neural Networks and Reduced Order Modeling" by F. Fritzen, M. Fernández,  and F. Larsson"', 'question': 'Who are the contributors of the data "datasets"?', 'text': ' Contributors of the data "datasets": Fernández, Prof. Felix Fritzen, Oliver Weeger, Frederik Larsson'}
已处理 3 / 38 行


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 4 的答案: {'context': 'Thomas Pardoen has work package Institute of Mechanics; Thomas Pardoen has expertise in Institute of Mechanics; Thomas Pardoen has funding project  Institute of Mechanics; Thomas Pardoen related participant project is Institute of Mechanics; Thomas Pardoen has related Project Institute of Mechanics; Thomas Pardoen has discipline Institute of Mechanics; Thomas Pardoen has parent organisation  Institute of Mechanics; Thomas Pardoen has related task area Institute of Mechanics; Thomas Pardoen has documentation Institute of Mechanics; Thomas Pardoen has description Institute of Mechanics; Thomas Pardoen has  description Institute of Mechanics; Ebrahim Norouzi has work package FIZ Karlsruhe - Leibniz Institute for Information Infrastructure; Dr. Jörg Waitelonis has work package FIZ Karlsruhe - Leibniz Institute for Information Infrastructure; Dr.  Heike Fliegl has work package FIZ Karlsruhe - Leibniz Institute for Information Infrastructure; Prof. Dr. Harald Sack has w

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 5 的答案: {'context': 'data portal has comment "An online platform that provides access to data collections and datasets."; ParaView has website https://www.paraview.org/; ParaView has contact point support@kitware.com; ParaView has repository https://gitlab.kitware.com/paraview/paraview; ParaView has  description "ParaView is an open-source, multi-platform data analysis and visualization application based on Visualization Toolkit (VTK)."; dateCreated has comment "The date on which the CreativeWork was created or the item was added to a DataFeed."; dateCreated has domain resource; dataset has comment "A dataset is associated with a structured information about a resource."; about material has domain dataset; has creator has domain dataset', 'question': 'who is the email address of "ParaView"?', 'text': " ParaView's email address is not provided in the given context."}
已处理 5 / 38 行


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 6 的答案: {'context': 'Dr.  Volker Hofmann has affiliation with Forschungszentrum Jülich; PD Dr. Steffen Brinckmann has affiliation with Forschungszentrum Jülich; Dr.-Ing Abril Azocar Guzman has affiliation with Forschungszentrum Jülich; Prof. Dr. Ruth Schwaiger has affiliation with Forschungszentrum Jülich; Said Fathalla has affiliation with Forschungszentrum Jülich; Ahmad Zainul Ihsan has affiliation with Forschungszentrum Jülich; Dr.-Ing. Hanna Tsybenko has affiliation with Forschungszentrum Jülich; Prof. Dr. Stefan Sandfeld has affiliation with Forschungszentrum Jülich; PD Dr. Steffen Brinckmann has curation status Forschungszentrum Jülich; Dr.-Ing Abril Azocar Guzman has curation status Forschungszentrum Jülich; Prof. Dr. Ruth Schwaiger has curation status Forschungszentrum Jülich; Said Fathalla has curation status Forschungszentrum Jülich; Ahmad Zainul Ihsan has curation status Forschungszentrum Jülich; Dr.-Ing. Hanna Tsybenko has curation status Forschungszentrum Jülich; Prof. Dr

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 7 的答案: {'context': 'PubChem has citation Kim S. Exploring Chemical Information in PubChem. Curr. Protoc.; 2021 Aug 9; 1(8):e217. doi: https://doi.org/10.1002/cpz1.217.; PubChem has description "PubChem is the world\'s largest collection of freely accessible chemical information. Search chemicals by name, molecular formula, structure, and other identifiers. Find chemical and physical properties, biological activities, safety and toxicity information, patents, literature citations and more."; Computational Materials Repository (CMR) has related resource ASE; Computational Materials Repository (CMR) has description "Each CMR project consists of an ASE-database and a project page describing the data and showing examples of how to work with the data using Python and ASE."; Visual Molecular Dynamics has documentation https://www.ks.uiuc.edu/Research/vmd/current/ug/; Visual Molecular Dynamics has  description "molecular modelling and visualization computer program"', 'question': 'What is "M

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 8 的答案: {'context': 'OVITO basic has documentation https://www.ovito.org/docs/current/; OVITO basic has operating system  Linux; OVITO basic has operating system  Mac OS; OVITO basic has operating system  Microsoft Windows; OVITO basic has  description "visualization and analysis for data generated in MD, atomistic Monte-Carlo and other particle-based simulations"; Crystallography Open Database (COD) has description "Open-access collection of crystal structures of organic, inorganic, metal-organic compounds and minerals, excluding biopolymers."; AML has  description "Python package to automatically build the reference set for the training of Neural Network Potentials (NNPs), and eventually other machine-learned potentials, in an automated, data-driven fashion."', 'question': 'What are pre- and post-processing tools for MD simulations?', 'text': ' Pre-processing tools for MD simulations include OVITO basic, which has descriptions such as visualization and analysis for data generated in

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 9 的答案: {'context': 'Simmate has  description "The Simulated Materials Ecosystem (Simmate) is a toolbox and framework for computational materials research. It lets you explore various crystal databases, predict new materials, and quickly calculate properties."; matminer has  description "Python library for data mining the properties of materials."', 'question': 'What are some workflow environments for computational materials science?', 'text': '1. Simulate Materials Ecosystem (Simmate): This is a toolbox and framework that allows researchers to explore various crystal databases, predict new materials, and quickly calculate properties.\n2. Python library for data mining the properties of materials (matminer): This is another environment specifically designed for data mining the properties of materials.'}
已处理 9 / 38 行


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 10 的答案: {'context': 'Pyiron has  description "pyiron is an integrated development environment for implementing, testing, and running simulations in computational materials science."; Pyiron has citation title = {pyiron: An integrated development environment for computational materials science},_x000D_\n  journal = {Computational Materials Science},_x000D_\n  volume = {163},_x000D_\n  pages = {24 - 36},_x000D_\n  year = {2019},_x000D_\n  issn = {0927-0256},_x000D_\n  doi = {https://doi.org/10.1016/j.commatsci.2018.07.043},_x000D_\n  url = {http://www.sciencedirect.com/science/article/pii/S0927025618304786},_x000D_\n  author = {Jan Janssen and Sudarsan Surendralal and Yury Lysogorskiy and Mira Todorova and Tilmann Hickel and Ralf Drautz and Jörg Neugebauer},_x000D_\n  keywords = {Modelling workflow, Integrated development environment, Complex simulation protocols},; Pyrho has  description "Pyrho is a real-space DFT code written in Python. It is not built to be super-fast or scalable- i

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 11 的答案: {'context': 'Image based prediction of the heat conduction tensor has  description "An open-source code providing a graphical user interface to predict the effective heat conductivity of microstructures and requires only the image data as input. The deployed methods are proposed and validated in the related paper, doi: 10.3390/mca24020057."; Image based prediction of the heat conduction tensor has citation Data-Driven Microstructure Property Relations; Workshop: Workflows for atomistic simulations has description "Workshop lectures and videos from fitting of interatomic potentials to validating them."', 'question': 'Where can I find a list of interatomic potentials?', 'text': '\nThe provided context does not mention any specific location or source where you can find a list of interatomic potentials.'}
已处理 11 / 38 行


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 12 的答案: {'context': 'Materials Cloud has repository https://github.com/materialscloud-org; Materials Cloud Youtube channel required tool Materials Cloud; Materials Cloud has description "Materials Cloud is built to enable the seamless sharing and dissemination of resources in computational materials science, offering educational, research, and archiving tools; simulation software and services; and curated and raw data. These underpin published results and empower data-based discovery, compliant with data management plans and the FAIR principles."; Data Processing for Engineers and Scientists required tool python; Selected Chapters in Data Processing: Microstructure Analysis and Synthesis required tool python; Information Service Engineering required tool python; Materials Commons has repository https://github.com/materials-commons/materialscommons.org; Materials Commons has description "A site for Materials Scientists to collaborate, store and publish research. The Materials Commons 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 13 的答案: {'context': 'Laboratory Instrument has type nan; Atomic Simulation Environment has type software; Atomic Simulation Environment has type processing software; Atomic Simulation Environment has type research software; Atomic Simulation Environment has type visualization software; Atomic Simulation Environment uses method atomistic simulations; Atomic Simulation Environment has  description "set of tools and Python modules for setting up, manipulating, running, visualizing and analyzing atomistic simulations"; electronic lab notebook has type nan; Jupyter notebooks has type object type; Datasets for the analysis of dislocations at grain boundaries and during vein formation in cyclically deformed Ni micropillars has object type Jupyter notebooks; Experimental Workflow has type nan', 'question': 'What are the electronic lab notebooks available?', 'text': '\nThe electronic lab notebooks available are not specified in the given context.'}
已处理 13 / 38 行


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 14 的答案: {'context': 'dynasor has website https://dynasor.materialsmodeling.org/; dynasor has  description "dynasor is a simple tool for calculating total and partial dynamical structure factors as well as current correlation functions from molecular dynamics (MD) simulations."; Crystallography Open Database (COD) has website http://www.crystallography.net/cod/; Crystallography Open Database (COD) has description "Open-access collection of crystal structures of organic, inorganic, metal-organic compounds and minerals, excluding biopolymers."; Yet another force field has website https://molmod.ugent.be/software/yaff; Yet another force field has programming Language  python; Yet another force field has programming Language  C; Yet another force field has  description " pythonic force-field (FF) code used at the Center for Molecular Modeling (CMM) to test-drive new FF models"; Visual Molecular Dynamics has operating system  Linux; Visual Molecular Dynamics has operating system  Mac OS; V

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 15 的答案: {'context': 'Materials Design Ontology (MDO) has description "MDO is an ontology for materials design field, representing the domain knowledge specifically related to solid-state physics and computational materials science."; Materials Design Ontology (MDO) has semantic expressivity nan; MaterialsMine (MM) has description "A materials ontology to support data publication involving nanomaterials and metamaterials."; MaterialsMine (MM) has semantic expressivity nan', 'question': 'What are the ontologies in nanomaterials domain?', 'text': '1. Materials Design Ontology (MDO): Semantically expressive with focus on solid-state physics and computational materials science.\n2. MaterialsMine (MM): Semantically expressive with specific emphasis on nanomaterials and metamaterials.'}
已处理 15 / 38 行


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 16 的答案: {'context': 'Novel Materials Discovery (NOMAD) has type data portal; Novel Materials Discovery (NOMAD) has description "The Novel Materials Discovery (NOMAD) Laboratory maintains the largest worldwide repository for input and output files of all important computational materials science computer programs."; Elk has type software; Elk has type research software; Elk uses method Density functional theory; Elk has  description "An all-electron full-potential linearised augmented-plane wave (LAPW) code with many advanced features. Written originally at Karl-Franzens-Universität Graz as a milestone of the EXCITING EU Research and Training Network, the code is designed to be as simple as possible so that new developments in the field of density functional theory (DFT) can be added quickly and reliably. "; NOMAD has type funding project; Dusseldorf Advanced Material Simulation Kit has type software; Dusseldorf Advanced Material Simulation Kit has type research software; Dusseldorf A

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 17 的答案: {'context': 'Materials Design Ontology (MDO) has repository https://github.com/LiUSemWeb/Materials-Design-Ontology; Materials Design Ontology (MDO) has description "MDO is an ontology for materials design field, representing the domain knowledge specifically related to solid-state physics and computational materials science."; Computational Material Sample Ontology has repository https://github.com/Materials-Data-Science-and-Informatics/cmso-ontology; Computational Material Sample Ontology has contact point Dr.-Ing Abril Azocar Guzman; Computational Material Sample Ontology has description "CMSO is an ontology that aims to describe computational materials science samples (or structures), including crystalline defects."; MatPortal has website https://matportal.org/; MatPortal has description "Ontology repository for materials science."; Development of coupled ontologies and workflows for thermochemical treatments has main task area  Ontologies for Materials Science; Alignment 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 18 的答案: {'context': 'Crystallography Open Database (COD) has citation Gražulis, S., Daškevič, A., Merkys, A., Chateigner, D., Lutterotti, L., Quirós, M., Serebryanaya, N. R., Moeck, P., Downs, R. T. & LeBail, A. (2012). Crystallography Open Database (COD): an open-access collection of crystal structures and platform for world-wide collaboration. Nucleic Acids Research, 40, D420-D427. doi: 10.1093/nar/gkr900 (BibTeX, EndNote/Refer, plain text)\n\nGrazulis, S., Chateigner, D., Downs, R. T., Yokochi, A. T., Quiros, M., Lutterotti, L., Manakova, E., Butkus, J., Moeck, P. & Le Bail, A. (2009). Crystallography Open Database – an open-access collection of crystal structures. Journal of Applied Crystallography, 42, 726-729. doi: 10.1107/S0021889809016690 (BibTeX, EndNote/Refer, plain text); Crystallography Open Database (COD) has description "Open-access collection of crystal structures of organic, inorganic, metal-organic compounds and minerals, excluding biopolymers."; :n11f452d15b2c4c7492

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 19 的答案: {'context': 'nan', 'question': 'In which institution can I find tomography equipment?', 'text': '\n### ANSWER:\nTomography equipment can be found in medical institutions such as hospitals or clinics that offer diagnostic imaging services like CT scans, MRI scans, etc.'}
已处理 19 / 38 行


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 20 的答案: {'context': 'The MatWerk ontology has  description "The MatWerk ontology represents research data and related activities of the MSE community. A first version of the ontology will be simplified, focusing on (i) community structure: researchers, research projects, universities, and institutions; (ii) infrastructure: software, workflows, ontologies, schemas, APIs, instruments, facilities, educational resources; and (iii) data: repositories, databases, scientific publications, published datasets and reference data."; Dr. Amir Laadhar has expertise in ontology matching; Ontologies for Materials Science has description "This TA develops Ontologies and metadata schemes in close cooperation with the stakeholders, tailored to their respective needs."', 'question': 'What are the educational resources for Ontology?', 'text': '\nThere is no specific mention of educational resources for Ontology in the provided context.'}
已处理 20 / 38 行


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 21 的答案: {'context': 'HybriD³ materials database has website https://materials.hybrid3.duke.edu/; HybriD³ materials database has description "The HybriD³ materials database provides a comprehensive collection of experimental and computational materials data for crystalline organic-inorganic compounds, predominantly based on the perovskite paradigm."; Materials Cloud has repository https://github.com/materialscloud-org; Materials Cloud has website https://www.materialscloud.org/discover/mc3d/dashboard/ptable; Materials Cloud has description "Materials Cloud is built to enable the seamless sharing and dissemination of resources in computational materials science, offering educational, research, and archiving tools; simulation software and services; and curated and raw data. These underpin published results and empower data-based discovery, compliant with data management plans and the FAIR principles."; Python Materials Genomics has repository https://github.com/materialsproject/pymatgen

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 22 的答案: {'context': 'py4vasp has  description "Python interface to extract data from VASP calculations"; software has definition source "James Malone"; software has definition source "Modified in parts from https://en.wikipedia.org/wiki/Software"; software has definition source "Robert Stevens"; OpenPathSampling has  description "Python library to facilitate path sampling algorithms."; OpenPathSampling has citation David W.H. Swenson, Jan-Hendrik Prinz, Frank Noé, John D. Chodera, and Peter G. Bolhuis. “OpenPathSampling: A flexible, open framework for path sampling simulations. 1. Basics.” J. Chem. Theory Comput. 15, 813 (2019). https://doi.org/10.1021/acs.jctc.8b00626      David W.H. Swenson, Jan-Hendrik Prinz, Frank Noé, John D. Chodera, and Peter G. Bolhuis. “OpenPathSampling: A flexible, open framework for path sampling simulations. 2. Building and Customizing Path Ensembles and Sample Schemes.” J. Chem. Theory Comput. 15, 837 (2019). https://doi.org/10.1021/acs.jctc.8b00627 ',

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 23 的答案: {'context': 'Materials Platform for Data Science has description "Online materials database (known as PAULING FILE project) with nearly 2 million entries: physical properties, crystal structures, phase diagrams, available via API, ready for modern data-intensive applications. The source of these entries are about 300,000 peer-reviewed publications in materials science, processed during the last 16 years by an international team of PhD editors. The results are presented online with a quick search interface. The basic access is provided for free."; MatDB Online has documentation https://publications.jrc.ec.europa.eu/repository/handle/JRC75978; MatDB Online has citation Austin T, Over H. MATDB ONLINE—A STANDARDS-BASED SYSTEM FOR PRESERVING, MANAGING, AND EXCHANGING ENGINEERING MATERIALS TEST DATA. DATA SCIENCE JOURNAL 11; 2012. JRC75978; MatDB Online has description "MatDB Online facility is a Standards-based system for preserving, managing, and exchanging engineering materials 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 24 的答案: {'context': 'https://compositionspace.readthedocs.io/en/latest/ has type documentation; https://rdf.pyscal.org has type documentation; https://fairmat-experimental.github.io/nexus-fairmat-proposal/9636feecb79bb32b828b1a9804269573256d7696/index.html has type documentation; https://workflow-gallery.github.io/calphy/README.html has type documentation; https://github.com/DataAnalyticsEngineering/EQ2PC#readme has type documentation; http://feap.berkeley.edu/wiki/index.php?title=FEAP_Wiki_Main_Page has type documentation; https://www.paraview.org/documentation/ has type documentation; https://www.vasp.at/wiki/index.php/The_VASP_Manual has type documentation; http://www.castep.org/CASTEP/Documentation has type documentation; https://www.cpmd.org/wordpress/index.php/documentation/ has type documentation; https://docs.abinit.org/ has type documentation; https://www.quantum-espresso.org/documentation/ has type documentation; http://jdftx.org/Using.html has type documentation; https://m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 25 的答案: {'context': 'Materials Cloud has conforms to specification OPTIMADE; Materials Cloud Youtube channel required tool Materials Cloud; Materials Cloud has description "Materials Cloud is built to enable the seamless sharing and dissemination of resources in computational materials science, offering educational, research, and archiving tools; simulation software and services; and curated and raw data. These underpin published results and empower data-based discovery, compliant with data management plans and the FAIR principles."; Materials Commons has description "A site for Materials Scientists to collaborate, store and publish research. The Materials Commons is supported by the U.S. Department of Energy, Office of Basic Energy Sciences, Division of Materials Sciences and Engineering."; Materials Data Repository has description "MDR is a data repository to collect and store papers, presentation materials, and related materials data to accumulate and release them in a form suitab

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 26 的答案: {'context': 'Framework for curation and distribution of reference datasets related participant project is Bundesanstalt für Materialforschung und -prüfung (BAM); Ontologies for defects in crystals related participant project is Bundesanstalt für Materialforschung und -prüfung (BAM); NOMAD Metainfo has description "The NOMAD Metainfo stores descriptive and structured information about materials-science data contained in the NOMAD Archive"', 'question': 'What are datasets produced by the BAM organization?', 'text': ' Datasets produced by the BAM organization include reference datasets related to participant projects and ontologies for defects in crystals.'}
已处理 26 / 38 行


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 27 的答案: {'context': 'MatDat has description "Over 1000 detailed, fully referenced and verified datasets for steels, aluminium and titanium alloys, cast irons/steels, weld metals. Materials can be searched according to a number of different criteria. Initial search results are presented in the form of a table from which they can be selected for presentation in form of detailed report or for comparison overview (up to 5 materials). In addition to material information and values of properties/parameters, images of microstructure, specimens and those of stress-strain, stress- and strain-life curves (if available) can be reviewed as well."; Elastic Constant Demo has related resource Elastic Constant Demo Data; Elastic Constant Demo Data has related resource Elastic Constant Demo; Elastic Constant Demo Data has repository http://hdl.handle.net/21.11102/7f78d0ed-4855-4fdf-b764-0bea725f8821; Elastic Constant Demo Data has description "Data containing elastic constants of Al alloy"', 'questio

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 28 的答案: {'context': 'Atomic Simulation Recipes has related resource ASE; Atomic Simulation Recipes has  description "Recipes for Atomic Scale Materials Research. Collection of python recipes for common (and not so common)\ntasks perfomed in atomic scale materials research. These tasks include\nrelaxation of structures, calculating ground states, calculating band\nstructures, calculating dielectric functions and so on."', 'question': 'What are datasets related to "Transmission electron microscopy"?', 'text': '\nThere is no specific mention of datasets related to "Transmission electron microscopy" in the provided context.'}
已处理 28 / 38 行


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 29 的答案: {'context': 'NeXus Data Format has license LGPL-2.1 license; NeXus Data Format has documentation https://www.nexusformat.org/; NeXus Data Format has description "NeXus is a common data format for neutron, x-ray, and muon science. It is being developed as an international standard by scientists and programmers representing major scientific facilities in order to facilitate greater cooperation in the analysis and visualization of neutron, x-ray, and muon data."; Metadata schemes for materials science data has license Creative Commons Attribution 4.0 International; Metadata schemes for materials science data has description "Metadata schemes for materials science data in JSON representation as implemented in TTL representation in the application profiles in the version from 01.05.2022 of the research data management platform CoScInE. The schema are actively developed in the SFB1394 with the aim to construct defect phase diagrams in an automated fashion using all data from advanc

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 30 的答案: {'context': 'Carolina Materials Database has description "Welcome to Carolina Materials Database created in Jianjun Hu\'s and Ming Hu\'s groups at University of South Carolina. A freely, globally accessible database of 38,716 inorganic material compounds with over 80,000 calculated properties, and growing."', 'question': 'What is the repository for "BAM reference data"?', 'text': ' The repository for "BAM reference data" is not mentioned in the provided context.'}
已处理 30 / 38 行


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 31 的答案: {'context': 'Carolina Materials Database has description "Welcome to Carolina Materials Database created in Jianjun Hu\'s and Ming Hu\'s groups at University of South Carolina. A freely, globally accessible database of 38,716 inorganic material compounds with over 80,000 calculated properties, and growing."', 'question': 'What are the different data formats in the "BAM reference data"?', 'text': '\nThe BAM reference data format includes two types of files:.bam and.pdbq. These files contain information about the crystal structures of various materials, including their atomic coordinates, bond lengths, angles, and other relevant properties.'}
已处理 31 / 38 行


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 32 的答案: {'context': 'Qe-tools has programming Language  python; Qe-tools has  description "A set of useful tools for Quantum ESPRESSO"; eLabFTW has Version "4.3.9"; eLabFTW has  description "eLabFTW is an electronic lab notebook manager for research teams. It lets you store and organize your research experiments easily. It also features a database where any kind of objects (such as antibodies, plasmids, cell lines, boxes, etc.) can be stored. It is accessed via the browser"; pacemaker has Version "0.2.7"; pacemaker has  description "a tool for fitting of interatomic potentials in a general nonlinear Atomic Cluster Expansion (ACE) form. "', 'question': 'What is the software version of "pacemaker"?', 'text': '\nThe software version of "pacemaker" is "0.2.7".'}
已处理 32 / 38 行


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 33 的答案: {'context': ':n11f452d15b2c4c7492b23cc31220e483b21 has first element educational resource; :n11f452d15b2c4c7492b23cc31220e483b29 has first element educational resource; :n11f452d15b2c4c7492b23cc31220e483b33 has first element educational resource; Phonon database at Kyoto university has contact point Togo; Phonon database at Kyoto university has description "It’s getting to be a mainstream idea collecting a big set of ab-initio calculations and analyzing them by informatics approach. For scientific purpose, we plan to do it for phonon properties. As an attempt, we have run a set of ab-initio phonon calculations and left the raw data on this static web site. The aim of this web site is just leaving the data to distribute."; Dr. Rainer Stotzka has expertise in Research data infrastructure; Dr. Rossella Aversa has expertise in Research data infrastructure; Carolina Materials Database has description "Welcome to Carolina Materials Database created in Jianjun Hu\'s and Ming Hu\'s g

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 34 的答案: {'context': 'Carolina Materials Database has description "Welcome to Carolina Materials Database created in Jianjun Hu\'s and Ming Hu\'s groups at University of South Carolina. A freely, globally accessible database of 38,716 inorganic material compounds with over 80,000 calculated properties, and growing."', 'question': 'What is the description of the "BAM reference data"?', 'text': '\nThe BAM (Binary Accelerated Model) reference data is not mentioned in the provided context or question.'}
已处理 34 / 38 行


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 35 的答案: {'context': 'nan', 'question': 'What are the datasets produced in 2022?', 'text': "\nThere isn't enough context provided for me to accurately determine what specific datasets were produced in 2022. Can you please provide more information or clarify your question?"}
已处理 35 / 38 行


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 36 的答案: {'context': 'Electronic Structure Common Data Format (ESCDF) has description "This project gathers the working documents used to design the specifications\nof the Electronic Structure Common Data Format (ESCDF), an initiative of the\nElectronic Structure Library (ESL) Contributors.\n"; Carolina Materials Database has description "Welcome to Carolina Materials Database created in Jianjun Hu\'s and Ming Hu\'s groups at University of South Carolina. A freely, globally accessible database of 38,716 inorganic material compounds with over 80,000 calculated properties, and growing."', 'question': 'Who is the creators of the "BAM reference data"?', 'text': '\nThe provided text does not mention any creator for the "BAM reference data".'}
已处理 36 / 38 行


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


行 37 的答案: {'context': 'Framework for curation and distribution of reference datasets related participant project is Bundesanstalt für Materialforschung und -prüfung (BAM); Ontologies for defects in crystals related participant project is Bundesanstalt für Materialforschung und -prüfung (BAM); NOMAD Metainfo has documentation https://nomad-lab.eu/services/metainfo; NOMAD Metainfo has description "The NOMAD Metainfo stores descriptive and structured information about materials-science data contained in the NOMAD Archive"', 'question': 'What are the datasets published by "BAM"?', 'text': ' Datasets published by BAM include those related to their participant projects on frameworks for curation and distribution of reference datasets, as well as ontologies for defects in crystals.'}
已处理 37 / 38 行
行 38 的答案: {'context': 'nan', 'question': 'Average', 'text': '\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\

In [10]:
df=pd.read_excel('C:/Users/crown/Desktop/RAG mistral 7b v1/v1_training_result.xlsx')

In [11]:
df['Predicted_Answer'] = df['Predicted Answer'].astype(str)

In [12]:
import re

# Add a new column 'answer_text'
df['answer_text'] = ''

# Use regular expression to match the content after 'text':
text_pattern = re.compile(r"'text':(.+?)}", re.DOTALL)

# Iterate through DataFrame and extract content
for index, row in df.iterrows():
    predicted_answer = row['Predicted_Answer']
    match = text_pattern.search(predicted_answer)
    if match:
        # Extract matched content
        answer_text = match.group(1)
    else:
        answer_text = "no answer"  # Set default value if no match is found

    # Store the extracted content in the new column
    df.at[index, 'answer_text'] = answer_text


In [13]:

df['answer_text'] = df['answer_text'].str.replace('\n', ' ', regex=False)

df['answer_text'] = df['answer_text'].str.replace(r'\*\s*', ' ', regex=True)

df['answer_text'] = df['answer_text'].str.replace(r'\\n', ' ', regex=True)

df['answer_text'] = df['answer_text'].str.replace("'", "", regex=False)

df.to_excel('C:/Users/crown/Desktop/RAG mistral 7b v1/v1_training_result.xlsx')

#### Evaluation

In [48]:
import pandas as pd
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu
from sentence_transformers import SentenceTransformer, util

# Define paths
excel_file_path = 'C:/Users/crown/Desktop/RAG mistral 7b v1/v1_training_result.xlsx'
output_file_path = 'C:/Users/crown/Desktop/RAG mistral 7b v1/v1_training_result_with_score.xlsx'

# Read Excel file
df = pd.read_excel(excel_file_path, usecols=['Competency Question', 'Context', 'Ground Truth', 'answer_text'])

# Initialize scorers
rouge = Rouge()
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Prepare a dictionary to store scores
scores_dict = {
    "ROUGE-1-f": [], "ROUGE-1-p": [], "ROUGE-1-r": [],
    "ROUGE-2-f": [], "ROUGE-2-p": [], "ROUGE-2-r": [],
    "ROUGE-L-f": [], "ROUGE-L-p": [], "ROUGE-L-r": [],
    "BLEU": [],
    "SBERT-Similarity": []
}

# Calculate scores
for _, row in df.iterrows():
    ground_truth = str(row['Ground Truth']) if not pd.isnull(row['Ground Truth']) else ""
    predicted_answer = str(row['answer_text']) if not pd.isnull(row['answer_text']) else ""
    
    # Calculate ROUGE scores
    if ground_truth and predicted_answer:
        rouge_scores = rouge.get_scores(predicted_answer, ground_truth)[0]
        for key in scores_dict:
            if key.startswith("ROUGE"):
                rouge_type, rouge_metric = key.split('-')[1], key.split('-')[2]
                scores_dict[key].append(rouge_scores['rouge-{}'.format(rouge_type.lower())][rouge_metric])
    else:
        for key in scores_dict:
            if key.startswith("ROUGE"):
                scores_dict[key].append(None)
    
    # Calculate BLEU score
    reference = ground_truth.split()
    candidate = predicted_answer.split()
    bleu_score = sentence_bleu([reference], candidate, weights=(0.25, 0.25, 0.25, 0.25)) if ground_truth and predicted_answer else None
    scores_dict["BLEU"].append(bleu_score)
    
    # Calculate SBERT-Similarity score
    if ground_truth and predicted_answer:
        embeddings1 = sbert_model.encode([ground_truth], convert_to_tensor=True)
        embeddings2 = sbert_model.encode([predicted_answer], convert_to_tensor=True)
        cos_sim = util.pytorch_cos_sim(embeddings1, embeddings2)
        scores_dict["SBERT-Similarity"].append(cos_sim.item())
    else:
        scores_dict["SBERT-Similarity"].append(None)

# Add scores to DataFrame
for key, value in scores_dict.items():
    df[key] = value

# Save to Excel file
df.to_excel(output_file_path, index=False)

print(f"Scores have been calculated and the results are saved to: {output_file_path}")


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{'context': [Document(page_content='Averaging 45.3 receiving yards per game this season, Alvin Kamara has been a\nfocal point of the aerial attack with Derek Carr at quarterback. Kamara has\naveraged 42.8 receiving yards per game over his career but saw a dip in\nconsecutive years with Andy Dalton at quarterback. Now armed with an upgrade\nunder Center and multiple weapons at wideout to keep the defensive attention\noff him, Kamara is back to elite receiving numbers for a running back. Kamara\nhas recorded 36 or more receiving yards in four of his last five games and has\ntopped 33 or more in five of six games this season. He is being peppered with\ntargets regardless of opponent and should be leaned on again in Week 10. Tap\nthe More on Kamara for Week 10.', metadata={'source': 'https://www.fantasypros.com/2023/11/nfl-week-10-sleeper-picks-player-predictions-2023/', 'title': 'NFL Week 10 Sleeper Picks Player Predictions (2023) | FantasyPros', 'description': 'Welcome to Sleeper Picks. 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load Excel file
output_file_path = 'C:/Users/crown/Desktop/RAG mistral 7b v1/v1_training_result_with_score.xlsx'
df = pd.read_excel(output_file_path)

# Specify columns to compute mean
columns_to_mean = [
    'BLEU', 'SBERT-Similarity',
    'ROUGE-1-f', 'ROUGE-1-p', 'ROUGE-1-r',
    'ROUGE-2-f', 'ROUGE-2-p', 'ROUGE-2-r',
    'ROUGE-L-f', 'ROUGE-L-p', 'ROUGE-L-r'
]

# Compute mean
means = df[columns_to_mean].mean()

# Visualization
ax = means.plot(kind='bar', figsize=(10, 6), color='skyblue', fontsize=13)

plt.title('Average Scores')
plt.xlabel('Metric')
plt.ylabel('Average Score')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for readability
plt.tight_layout()  # Automatically adjust subplot parameters to give specified padding

# Add values above each bar
for i in ax.patches:
    ax.text(i.get_x() + i.get_width() / 2, i.get_height(), f'{i.get_height():.2f}', ha='center', va='bottom')

plt.show()
