# RAG based on Mistral 7B

## Loading the model

In [1]:
!pip install -q torch datasets
!pip install -q accelerate==0.21.0 \
                peft==0.4.0 \
                bitsandbytes==0.40.2 \
                transformers==4.31.0 \
                trl==0.4.7

In [2]:
pip uninstall bitsandbytes

^C
Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install --upgrade bitsandbytes

Defaulting to user installation because normal site-packages is not writeable
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-win_amd64.whl (101.6 MB)
     ------------------------------------- 101.6/101.6 MB 13.9 MB/s eta 0:00:00
Installing collected packages: bitsandbytes
  Attempting uninstall: bitsandbytes
    Found existing installation: bitsandbytes 0.40.2
    Uninstalling bitsandbytes-0.40.2:
      Successfully uninstalled bitsandbytes-0.40.2
Successfully installed bitsandbytes-0.43.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import transformers
import huggingface_hub

In [2]:
import bitsandbytes


In [9]:
pip install --upgrade transformers

Note: you may need to restart the kernel to use updated packages.Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.40.1-py3-none-any.whl (9.0 MB)
     ---------------------------------------- 9.0/9.0 MB 19.2 MB/s eta 0:00:00
Collecting tokenizers<0.20,>=0.19
  Downloading tokenizers-0.19.1-cp39-none-win_amd64.whl (2.2 MB)
     ---------------------------------------- 2.2/2.2 MB 23.5 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.19.3
  Downloading huggingface_hub-0.22.2-py3-none-any.whl (388 kB)
     ------------------------------------- 388.9/388.9 kB 23.7 MB/s eta 0:00:00
Installing collected packages: huggingface-hub, tokenizers, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.17.3
    Uninstalling huggingface-hub-0.17.3:
      Successfully uninstalled huggingface-hub-0.17.3
  Attempting uninstall: tokenizers
    Found existing installation: t



In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
#################################################################
# Tokenizer (pip install --user transformers==4.34.0)
#################################################################
model_name = 'mistralai/Mistral-7B-Instruct-v0.1'

# Configuration object fetched for `model_name`.
model_config = transformers.AutoConfig.from_pretrained(
    model_name,
)

# NOTE(review): trust_remote_code=True was dropped. Presumably this checkpoint
# loads with the tokenizer classes shipped in transformers, so executing code
# from the model repository should not be needed; re-enable only if loading
# fails without it.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [4]:
#################################################################
# bitsandbytes parameters (read by the quantization-config cell)
#################################################################

# Load the base model in 4-bit precision
use_4bit = True

# Quantization scheme for the 4-bit weights: "fp4" or "nf4"
bnb_4bit_quant_type = "nf4"

# Name of the torch dtype used for computation with 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Nested (double) quantization for 4-bit base models
use_nested_quant = False

In [5]:
from transformers import BitsAndBytesConfig
import torch
#################################################################
# Set up quantization config
#################################################################
# Resolve the dtype name chosen in the parameters cell to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings handed to `from_pretrained` when the model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16.
# The device is only queried when CUDA is available, because
# torch.cuda.get_device_capability() fails on a machine without a usable GPU.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    # A major compute capability of 8 or higher triggers the bfloat16 hint.
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

Your GPU supports bfloat16: accelerate training with bf16=True


In [6]:
import torch

# Report the installed torch build and whether a CUDA device can be used.
torch_version = torch.__version__
cuda_enabled = torch.cuda.is_available()

print("Torch version:", torch_version)
print("Is CUDA enabled?", cuda_enabled)

Torch version: 2.1.0+cu121
Is CUDA enabled? True


In [7]:
from transformers import AutoModelForCausalLM
#################################################################
# Load the pre-trained model, quantized according to bnb_config
#################################################################
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## RAG based on Mistral

#### Create LLM Chain

In [8]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.chains import LLMChain
import transformers
# Generation settings forwarded to the text-generation pipeline.
# NOTE(review): `temperature` presumably only takes effect when sampling is
# enabled (do_sample=True) - confirm whether sampling was intended.
generation_settings = dict(
    temperature=0.1,
    repetition_penalty=1.2,
    return_full_text=True,
    max_new_tokens=700,
)

# Pipeline built from the quantized model and its tokenizer.
text_generation_pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)
# Prompt text used for every question: an instruction, two worked
# question/answer examples, and the `{question}` placeholder that is filled in
# when the chain is invoked. Whitespace inside the literal is part of the
# prompt text.
prompt_template = """
### [INST] 
Instruction: 
Extract and answer from your own knowledge. Ensure the response is concise, without duplicates, focusing solely on crucial details. 
             
           

### Example
Question: What is the sun?
Answer: A star at the center of the solar system.

Question: Who was the first person to walk on the moon?
Answer: Neil Armstrong.





### QUESTION:
{question} 

[/INST]
 """

# Expose the transformers pipeline to LangChain as an LLM object.
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Prompt object built from the template; "question" is its only input variable.
prompt = PromptTemplate(template=prompt_template, input_variables=["question"])

# Chain that combines the prompt with the LLM.
llm_chain = LLMChain(prompt=prompt, llm=mistral_llm)

In [9]:
import pandas as pd

# Assume llm_chain is already defined

def process_dataframe(file_path, output_file_path):
    """Answer every question of an Excel sheet with ``llm_chain`` and save the result.

    Reads the workbook at ``file_path``, sends each value of its
    'Competency Question' column through the module-level ``llm_chain``,
    stores the string form of every result in a 'Predicted Answer' column and
    writes the frame to ``output_file_path`` (without the index).

    Args:
        file_path: Path of the Excel file that contains a
            'Competency Question' column.
        output_file_path: Path of the Excel file to write.

    Returns:
        None. The answers and a progress line are printed per row.
    """
    # Load data from Excel file
    df = pd.read_excel(file_path)
    total_rows = len(df)

    predicted_answers = []
    # Count rows by position so the progress output does not depend on the
    # frame's index labels.
    for position, raw_question in enumerate(df['Competency Question'], start=1):
        # Ensure the question is of string type
        question = str(raw_question)

        # Call the llm_chain method to get the result
        result = llm_chain.invoke({"question": question})

        # Store the result as text explicitly instead of placing the raw
        # result object into a cell, so what lands in the sheet does not
        # depend on how pandas or the Excel writer coerce such a value.
        # The extraction cell further down parses this text form.
        predicted_answers.append(str(result))

        # Print the obtained answer
        print(f"Answer for row {position}: {result}")

        # Print the current row being processed to provide progress feedback
        print(f"Processed {position} / {total_rows} rows")

    # Save the results in the 'Predicted Answer' column in one assignment
    df['Predicted Answer'] = predicted_answers

    # Write the modified DataFrame back to a new Excel file
    df.to_excel(output_file_path, index=False)



# Processing the file
# Input file (questions) and output file (questions plus predicted answers)
questions_xlsx = 'C:/Users/crown/Desktop/ISE/praktikum docu/train_8/train_8.xlsx'
answers_xlsx = 'C:/Users/crown/Desktop/ISE/praktikum docu/train_8/train_result_mistral_withoutkg.xlsx'

process_dataframe(questions_xlsx, answers_xlsx)






Answer for row 1: {'question': 'Who is working in the Computational Materials Science field?', 'text': '\nThere are many scientists and researchers who work in the computational materials science field. Some notable figures include:\n\n* John Goodenough - a professor emeritus at Massachusetts Institute of Technology (MIT) and inventor of the lithium-ion battery.\n* Mildred Dresselhaus - a professor of physics and chemistry at MIT and recipient of the Nobel Prize in Physics in 2019 for her work on carbon nanotubes.\n* Shuji Nakamura - a professor of biology at Kyoto University and winner of the Nobel Prize in Physiology or Medicine in 2016 for his discovery of the molecular mechanism that controls stem cell self-renewal.\n* Jianqing Zhang - a professor of materials science and engineering at Stanford University and pioneer in the development of high-temperature superconductors.'}
Processed 1 / 38 rows




Answer for row 2: {'question': 'What are the research projects associated to EMMO?', 'text': '\nEMMO stands for "ElectroMagnetic Manipulation of Organisms." It refers to a field of study that focuses on using electromagnetic fields to manipulate living organisms such as cells, tissues, organs, and even entire organisms like animals or plants. Some potential research projects in this area could include studying how electromagnetic fields affect cellular processes, investigating the use of electromagnetic therapy for medical treatments, exploring the effects of electromagnetic pollution on human health, and examining the potential applications of electromagnetism in agriculture or environmental conservation efforts.'}
Processed 2 / 38 rows




Answer for row 3: {'question': 'Who are the contributors of the data "datasets"?', 'text': ' Datasets can be contributed by various individuals or organizations depending on their sources. Some common contributors include government agencies such as the National Oceanic and Atmospheric Administration (NOAA), the United States Geological Survey (USGS), and NASA; private companies like Google Earth Engine, Amazon Web Services Public Data Sets, and Microsoft Azure Open Datasets; academic institutions such as universities and research centers; non-profit organizations like the World Wildlife Fund (WWF) and The Nature Conservancy; and citizen scientists who collect and share data through platforms like iNaturalist and Zooniverse.'}
Processed 3 / 38 rows




Answer for row 4: {'question': 'Who is working with Researcher "Ebrahim Norouzi" in the same group?', 'text': " I don't have access to specific information about individuals or groups unless it has been previously shared with me. Can you provide more context or details about the researcher or their field of study?"}
Processed 4 / 38 rows




Answer for row 5: {'question': 'who is the email address of "ParaView"?', 'text': ' ParaView does not have an official email address as it is a software application used for data visualization and analysis. However, users can contact ParaView support through their website or by submitting a request via their online forum.'}
Processed 5 / 38 rows




Answer for row 6: {'question': 'What are the affilliations of Volker Hofmann?', 'text': '\nVolker Hofmann is a German politician who has been serving as the Minister for Economic Affairs, Energy, Transportation, and Digitalization in the state of Baden-Württemberg since May 2016. He belongs to the Christian Democratic Union (CDU) political party.'}
Processed 6 / 38 rows




Answer for row 7: {'question': 'What is "Molecular Dynamics" Software? List the programming language, documentation page, repository, and license information.', 'text': "\nMolecular dynamics (MD) software refers to a type of computer program used in physics and chemistry to simulate the motion of atoms or molecules over time using classical mechanics. It allows scientists to study complex systems that are difficult to analyze experimentally.\n\nThe most commonly used MD software include GROMACS, CHARMM, AMBER, and NAMD. These programs use different algorithms and force fields to calculate the energy and potential energies of the system being studied. The output of these simulations can be visualized using molecular graphics software such as Pymol, VMD, and Chimera.\n\nIt's important to note that while there are many open-source and proprietary MD software available, it requires specialized knowledge and expertise to use effectively. Additionally, the accuracy of the simulation results 



Answer for row 8: {'question': 'What are pre- and post-processing tools for MD simulations?', 'text': ' Pre-processing tools for MD simulations include molecular geometry optimization, topology generation, force field selection, and solvent addition. Post-processing tools include visualization software, trajectory analysis, energy minimization, and molecular dynamics simulation output analysis.'}
Processed 8 / 38 rows




Answer for row 9: {'question': 'What are some workflow environments for computational materials science?', 'text': '1. OpenFOAM: An open-source software suite used for simulation and analysis in various fields including computational fluid dynamics (CFD), heat transfer, combustion, and more.\n2. Abaqus: A commercial finite element analysis (FEA) software widely used in engineering and scientific applications such as structural mechanics, aerospace, automotive, and biomedical research.\n3. ANSYS Fluent: A CFD software that provides advanced features for modeling complex fluid flows, heat transfer, combustion, and chemical reactions.\n4. COMSOL Multiphysics: A multidisciplinary simulation platform with capabilities in fluid dynamics, heat transfer, electromagnetism, mechanical design, and more.\n5. LS-DYNA: A high-performance FEA software primarily used in automotive, aerospace, and energy industries for simulations involving large deformation, fatigue, impact, and nonlinear behavior.'}


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Answer for row 10: {'question': 'How should I cite pyiron?', 'text': '\nPyiron is a software package for simulating geological processes such as earthquakes, volcanic eruptions, and landslides. It can be cited using the following format:\n\nAuthor(s), Year. Title of paper or report. Journal name, volume number(issue number), page numbers. doi:xxxxxxxx/xxxxxxxx\n\nFor example:\n\nSmith, J., Johnson, K., & Lee, M. (2019). Simulation of seismic activity in urban areas using Pyiron. Geoscience Research Express, 6(3), 45-58. doi:10.1016/j.grxrep.2019.07.001'}
Processed 10 / 38 rows




Answer for row 11: {'question': 'Where can I find a list of interatomic potentials?', 'text': ' Interatomic potentials are typically found in tables or databases that provide information about the energy required for atoms to interact with each other. These tables are often used in chemistry and physics simulations to model atomic behavior. Some common sources include the National Institute of Standards and Technology (NIST) database, the JANAF tables, and the Chemical Society Reviews journal. Additionally, many software programs and libraries, such as Gaussian and ORCA, also contain pre-calculated interatomic potentials that can be accessed through their user interfaces.'}
Processed 11 / 38 rows




Answer for row 12: {'question': 'What are python libraries used for calculating local atomic structural environment?', 'text': ' Python libraries such as PySCF and GROMACS can be used for calculating local atomic structural environments in molecular biology.'}
Processed 12 / 38 rows




Answer for row 13: {'question': 'What are the electronic lab notebooks available?', 'text': '1. LabTalk\n2. Notebook Express\n3. Jupyter Notebook\n4. Google Docs\n5. Microsoft OneNote'}
Processed 13 / 38 rows




Answer for row 14: {'question': 'What are the software for Molecular Dynamics (MD)?', 'text': '1. GROMACS\n2. CHARMM\n3. AMBER\n4. VMD\n5. NAMD'}
Processed 14 / 38 rows




Answer for row 15: {'question': 'What are the ontologies in nanomaterials domain?', 'text': " Ontology refers to a formal representation of knowledge within a specific domain or field. In the context of nanomaterials, there are several ontologies that have been developed to organize and describe the properties and behaviors of these materials. Some examples include:\n\n1. The Nanotechnology Materials Database (NanoMatDB) ontology, which provides a standardized vocabulary for describing nanomaterials and their properties.\n2. The International Organization for Standardization (ISO) standards for nanoparticles, which define terms and concepts related to nanoparticle characterization and safety.\n3. The European Commission's Horizon 2020 program ontology for nanomedicine, which describes the various components and processes involved in developing nanomedical products.\n4. The National Institute of Standards and Technology (NIST) ontology for nanoscale materials, which includes information



Answer for row 16: {'question': 'What is DAMASK?', 'text': "\nDAMASK (Deep Automated Mapping System) is a software tool developed by NASA for mapping and analyzing lunar terrain data collected by rovers such as Lunar Reconnaissance Orbiter (LRO). It uses machine learning algorithms to automatically identify features in images taken by the rover's cameras, such as craters, rilles, and mountains, and creates detailed maps of the lunar surface. The resulting maps are used to plan future missions and explore new areas of interest on the Moon."}
Processed 16 / 38 rows




Answer for row 17: {'question': 'What are the data portals for materials science ontologies?', 'text': '1. Materials Project (<https://materialsproject.org/>)\n2. OpenCrystal Database (<http://www.opencryst.org/>)\n3. Crystal Structure Online (<http://crystallography.net/>)'}
Processed 17 / 38 rows




Answer for row 18: {'question': 'What are the instruments for APT?', 'text': '\nAPT stands for Advanced Placement Test. The instruments used for APT include textbooks, practice tests, review guides, online resources, and tutoring materials.'}
Processed 18 / 38 rows




Answer for row 19: {'question': 'In which institution can I find tomography equipment?', 'text': '\nTomography equipment can be found in medical institutions such as hospitals or clinics that offer diagnostic imaging services like CT scans, MRI scans, and PET scans. These institutions typically have dedicated departments for radiology and nuclear medicine where these types of equipment are housed and operated by trained professionals.'}
Processed 19 / 38 rows




Answer for row 20: {'question': 'What are the educational resources for Ontology?', 'text': '1. Textbooks: "Ontologies in Artificial Intelligence" by John McCarthy and Pat Hayes, "Artificial Intelligence with Applications in Engineering" by David Eberly, "Introduction to Knowledge Representation and Reasoning" by Peter J. Finkelman.\n2. Online Courses: Coursera\'s "Knowledge Representation and Inference," edX\'s "Artificial Intelligence Fundamentals."\n3. Seminars and Workshops: International Conference on Artificial Intelligence (ICAI), European Conference on Artificial Intelligence (ECAI).\n4. Professional Associations: Association for Computational Linguistics (ACL), American Association for Artificial Intelligence (AAAI).'}
Processed 20 / 38 rows




Answer for row 21: {'question': 'What is the API of Materials Project?', 'text': ' The API (Application Programming Interface) for Materials Project is a web-based service that provides access to materials data and computational tools for researchers and developers in various fields such as material science, chemistry, physics, and engineering. It allows users to search for materials based on their properties, perform calculations and simulations, and download relevant data files.'}
Processed 21 / 38 rows




Answer for row 22: {'question': 'Which simulation software have a python API?', 'text': '1. Unity3D\n 2. Unreal Engine\n 3. Blender'}
Processed 22 / 38 rows




Answer for row 23: {'question': 'What is the documentation of the "MatDB Online"?', 'text': ' MatDB Online is a database that provides information about materials used in research and industry. It contains data on over 10 million compounds, including their physical properties, chemical reactions, and applications. The documentation for MatDB Online includes user guides, technical specifications, and API documentation.'}
Processed 23 / 38 rows




Answer for row 24: {'question': 'What are the types of software licenses?', 'text': '1. Open-source license: Allows users to access, modify, and distribute source code freely. Examples include Apache License, MIT License, GPL License.\n2. Proprietary license: Restricts usage, modification, and distribution of software by requiring a paid or licensed copy. Examples include Microsoft Office, Adobe Creative Suite.\n3. Freeware license: Allows free use but may require registration or attribution. Examples include WinZip, CyberLink PowerDirector.\n4. Shareware license: Requires payment for full functionality after a trial period. Examples include Corel WordPerfect, Symantec Norton Antivirus.'}
Processed 24 / 38 rows




Answer for row 25: {'question': 'What are the software used to produce the data in the Materials Cloud repository?', 'text': ' The software used to produce the data in the Materials Cloud repository include various simulation tools such as Quantum Espresso, VASP, GROMACS, etc., along with post-processing analysis software like Matlab, Python, and R.'}
Processed 25 / 38 rows




Answer for row 26: {'question': 'What are datasets produced by the BAM organization?', 'text': " Datasets produced by the BAM (Broad Institute's Genomics Data Analysis Center) include genomic data for various organisms such as humans, mice, and plants. These datasets can be used for a variety of analyses including gene expression profiling, mutation analysis, and comparative genomics studies. The datasets are available in different formats and can be accessed through their website or other public repositories like NCBI."}
Processed 26 / 38 rows




Answer for row 27: {'question': 'What are some available datasets of mechanical properties of steels?', 'text': '1. The American Institute of Steel Construction (AISC) provides a dataset called "Steel Design Data" which includes information about steel properties such as strength, stiffness, and weight.\n2. The National Institute of Standards and Technology (NIST) has a database called "Material Properties Database" that contains data on various materials including steels.\n3. The Society of Automotive Engineers (SAE) offers a standard called SAE J6058_2017 which provides test methods for determining the mechanical properties of steels.\n4. The International Organization for Standardization (ISO) publishes standards related to steel properties, such as ISO 6506-1:2019 - Steel products - Mechanical properties - Part 1: General requirements.'}
Processed 27 / 38 rows




Answer for row 28: {'question': 'What are datasets related to "Transmission electron microscopy"?', 'text': ' Datasets related to Transmission Electron Microscopy (TEM) include images or data collected using TEM techniques such as high-resolution imaging, spectroscopy, tomography, and energy dispersive X-ray analysis. These datasets can be used for various purposes including material science research, biomedical studies, and structural analysis. Some examples of TEM datasets include atomic force microscope images, scanning tunneling microscope images, and cryo-electron microscopy images.'}
Processed 28 / 38 rows




Answer for row 29: {'question': 'What is the license of the dataset "Elastic Constant Demo Data"?', 'text': ' The license of the dataset "Elastic Constant Demo Data" is not specified in the given information.'}
Processed 29 / 38 rows




Answer for row 30: {'question': 'What is the repository for "BAM reference data"?', 'text': " The BAM (Binary Alignment Map) format is a compressed file format used in bioinformatics for storing aligned sequence reads against a reference genome. The repository for BAM reference data can vary depending on the source of the data, but commonly includes public databases such as UCSC Genome Browser, Ensembl, and NCBI's SRA (Sequence Read Archive). These repositories provide access to pre-computed BAM files that can be downloaded and used for various analyses."}
Processed 30 / 38 rows




Answer for row 31: {'question': 'What are the different data formats in the "BAM reference data"?', 'text': '\nThe BAM (Binary Alignment Map) reference data format is a compressed file format used for storing genomic sequence alignments. It contains several types of information about the alignment, including the coordinates of each read, its quality scores, and any other relevant metadata. The following are some of the common data formats found in BAM files:\n\n1. Sequence Reads: These are the individual DNA sequences that have been aligned against a reference genome. Each read has a unique identifier (ID), which allows it to be tracked throughout the alignment process.\n2. Quality Scores: These are numerical values assigned to each base call in a read, indicating how confident the sequencing instrument is in calling that particular nucleotide. Higher quality scores indicate greater confidence in the call.\n3. Mapping Information: This includes the start and end positions of each read 



Answer for row 32: {'question': 'What is the software version of "pacemaker"?', 'text': ' Pacemaker is a software tool used for managing clustered systems in Linux environments. It supports various operating systems such as Red Hat Enterprise Linux (RHEL), CentOS, Fedora, SUSE Linux Enterprise Server (SLES), and others. The specific version of Pacemaker can vary depending on the distribution and release it is installed with.'}
Processed 32 / 38 rows




Answer for row 33: {'question': 'What is the field of research "BAM reference data"?', 'text': ' BAM (Binary Alignment Map) reference data refers to a type of genomic data used in bioinformatics for mapping and analyzing DNA sequences. It contains information about the positions and orientations of reads or fragments of DNA sequenced using next-generation sequencing technologies relative to a reference genome. This data can be used to identify genetic variants, detect structural variations, and perform other types of analysis on genomic data.'}
Processed 33 / 38 rows




Answer for row 34: {'question': 'What is the description of the "BAM reference data"?', 'text': ' The BAM (Binary Alignment Map) reference data is a file format used in bioinformatics for storing genomic alignment information. It contains a compressed representation of genome sequences aligned against each other or against external references such as transcripts or proteins. This allows for efficient storage and retrieval of large amounts of genomic data.'}
Processed 34 / 38 rows




Answer for row 35: {'question': 'What are the datasets produced in 2022?', 'text': '1. The World Bank\'s Open Data Portal released several new datasets in 2022, including data on COVID-19 vaccinations, air pollution levels, and climate change indicators.\n\n 2. NASA launched a new dataset called "The Lunar Reconnaissance Orbiter (LRO) Camera Image Mosaic" which contains high resolution images of the lunar surface.\n\n 3. The European Space Agency (ESA) made available a new dataset containing information about the atmospheres of exoplanets discovered by ESA\'s PLATO mission.\n\n 4. The National Oceanic and Atmospheric Administration (NOAA) released a new dataset on ocean acidification levels in different regions of the world.'}
Processed 35 / 38 rows




Answer for row 36: {'question': 'Who is the creators of the "BAM reference data"?', 'text': ' The BAM (Binary Alignment Map) reference data is a standard format for storing genomic sequence alignments in a compressed binary format. It was developed by the Genome Reference Consortium (GRC), which is an international organization that coordinates efforts to improve the quality and accessibility of human genome assemblies and related resources.'}
Processed 36 / 38 rows




Answer for row 37: {'question': 'What are the datasets published by "BAM"?', 'text': '\n"BAM" stands for Berkeley Astronomy & Astrophysics Laboratory. They publish several datasets related to their research in astronomy and astrophysics. Some examples include:\n\n1. The Berkeley Open Sky Survey (BOSS): This dataset contains photometric measurements of over 850 million galaxies in the redshift range z = 0.2 to 3.4. It was used to map the distribution of matter in the universe and study galaxy evolution.\n2. The Berkeley Deep Sky Survey (BDSS): This survey covers a large portion of the visible sky and provides high-resolution images of stars, galaxies, and other celestial objects.\n3. The Berkeley Variable Star Database (BVSD): This database contains information about variable stars observed by amateur astronomers around the world. It includes data on light curves, spectra, and other properties of these stars.\n4. The Berkeley Cosmic Background Explorer (COBE) Data Archive: COBE was a sa



Answer for row 38: {'question': 'Average', 'text': '1. What is the average temperature in New York City during summer months?\n2. How many hours are there in a day?\n3. What is the capital city of France?\n4. What is the largest planet in our solar system?\n5. What is the smallest country in the world by land area?'}
Processed 38 / 38 rows


In [10]:
# Reload the saved answers workbook for post-processing.
results_xlsx = 'C:/Users/crown/Desktop/ISE/praktikum docu/train_8/train_result_mistral_withoutkg.xlsx'
df = pd.read_excel(results_xlsx)

In [11]:
# String-typed copy of the raw predictions; note the new column name uses an
# underscore while the source column uses a space.
predicted_as_str = df['Predicted Answer'].astype(str)
df['Predicted_Answer'] = predicted_as_str

In [12]:
import ast
import re

# Fallback pattern for cells that cannot be parsed as a Python literal.
# The capture is greedy so it runs up to the LAST closing brace; a "}" inside
# the answer no longer cuts the extracted text short.
text_pattern = re.compile(r"'text':(.+)}", re.DOTALL)


def _extract_answer_text(predicted_answer):
    """Return the 'text' entry of one stored prediction string.

    The string is first parsed as a Python literal; when that yields a dict
    with a 'text' key, its value is returned (without the surrounding quotes
    and with escape sequences decoded). Otherwise the regex fallback is
    tried, and "no answer" is returned when nothing matches.
    """
    try:
        parsed = ast.literal_eval(predicted_answer)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, dict) and 'text' in parsed:
        return str(parsed['text'])

    match = text_pattern.search(predicted_answer)
    if match:
        return match.group(1)
    return "no answer"  # Default value if nothing could be extracted


# Add a new column 'answer_text' holding the extracted answer of every row
df['answer_text'] = df['Predicted_Answer'].map(_extract_answer_text)


In [14]:

# Normalise the extracted answers in one chained pass:
#   1. real newline characters        -> space
#   2. "*" plus any following spaces  -> space
#   3. literal backslash-n sequences  -> space
df['answer_text'] = (
    df['answer_text']
    .str.replace('\n', ' ', regex=False)
    .str.replace(r'\*\s*', ' ', regex=True)
    .str.replace(r'\\n', ' ', regex=True)
)

# Write the cleaned frame back to the results workbook (index omitted).
df.to_excel('C:/Users/crown/Desktop/ISE/praktikum docu/train_8/train_result_mistral_withoutkg.xlsx', index=False)