# **Installation libraries**


In [1]:
!pip install -U langchain-community
!pip install faiss-cpu
!pip install sentence-transformers                                                       

Collecting langchain-community
  Downloading langchain_community-0.3.30-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-core<2.0.0,>=0.3.75 (from langchain-community)
  Downloading langchain_core-0.3.76-py3-none-any.whl.metadata (3.7 kB)
Collecting langchain<2.0.0,>=0.3.27 (from langchain-community)
  Downloading langchain-0.3.27-py3-none-any.whl.metadata (7.8 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting pydantic-settings<3.0.0,>=2.10.1 (from langchain-community)
  Downloading pydantic_settings-2.11.0-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.9 (from langchain<2.0.0,>=0.3.27->langchain-community)
  Downloading langchain_text_splitters-0.3.11-py3-none-any.whl.metadata (1.8 kB)
Collecting python-dotenv>=0.21.0 (from pyd

# **Import libraries**

In [3]:
import pandas as pd
import numpy as np
import os
from typing import List, Dict, Any
# LangChain
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModelForCausalLM

In [4]:
# Load NASA Data
def clean_text(text):
    if not isinstance(text, str):
        text = str(text)

    replacements = {
        'â€™': "'",
        'â€œ': '"',
        'â€': '"',
        'Â°': '°',
        'Â²': '²',
        'Â³': '³'
    }

    for old, new in replacements.items():
        text = text.replace(old, new)

    return text.strip()

# **Load NASA Data**

In [5]:
# Load Data
NASA = pd.read_csv("/kaggle/input/nasadata/NASA DATA.csv")
NASA.head()

# Clean the text columns
NASA['question'] = NASA['question'].apply(clean_text)
NASA['answer'] = NASA['answer'].apply(clean_text)

# Create Text Col
NASA['text'] = NASA['question'] + " " + NASA['answer']

# Select Columns
NASA = NASA[['category','question','answer','source','text']]
NASA

Unnamed: 0,category,question,answer,source,text
0,NASA,What is NASA?,NASA is the United States government agency re...,https://www.nasa.gov,What is NASA? NASA is the United States govern...
1,NASA,When was NASA founded?,"NASA was founded on July 29, 1958, through the...",https://www.nasa.gov,When was NASA founded? NASA was founded on Jul...
2,NASA,What does NASA stand for?,NASA stands for National Aeronautics and Space...,https://www.nasa.gov,What does NASA stand for? NASA stands for Nati...
3,NASA,What are NASA’s main goals?,"NASA’s goals include space exploration, Earth ...",https://www.nasa.gov,What are NASA’s main goals? NASA’s goals inclu...
4,ISS,What is the International Space Station (ISS)?,The ISS is a habitable artificial satellite an...,https://www.nasa.gov/iss,What is the International Space Station (ISS)?...
...,...,...,...,...,...
372,NASA,How are astronaut candidates evaluated during ...,"They are assessed on technical knowledge, team...",https://en.wikipedia.org/wiki/List_of_astronau...,How are astronaut candidates evaluated during ...
373,NASA,What swimming strokes are tested during astron...,"Candidates are tested using freestyle, backstr...",https://en.wikipedia.org/wiki/List_of_astronau...,What swimming strokes are tested during astron...
374,NASA,Do NASA astronauts train for spacewalk emergen...,"Yes, training covers contingencies such as sui...",https://en.wikipedia.org/wiki/List_of_astronau...,Do NASA astronauts train for spacewalk emergen...
375,NASA,Are astronauts trained for fire emergencies on...,"Yes, they undergo emergency training for fire,...",https://en.wikipedia.org/wiki/List_of_astronau...,Are astronauts trained for fire emergencies on...


In [5]:
NASA['text'][230], len(NASA['text'][230])

('How long did the Saffire fire safety experiments run? The Saffire experiments began in 2016 and concluded in 2024, spanning multiple missions.',
 142)

In [6]:
# Data Overview
display("Data size:", NASA.shape)
print('--------------------------------------------')
display("\n columns:", NASA.columns.tolist())
print('--------------------------------------------')
display("\n Data type:", NASA.dtypes)
print('--------------------------------------------')

'Data size:'

(377, 5)

--------------------------------------------


'\n columns:'

['category', 'question', 'answer', 'source', 'text']

--------------------------------------------


'\n Data type:'

category    object
question    object
answer      object
source      object
text        object
dtype: object

--------------------------------------------


In [7]:
NASA['category'].value_counts()

category
ISS         94
Research    90
NASA        79
EVA         71
NBL         43
Name: count, dtype: int64

# **RAG Documents Loading and Chunking**


In [6]:
# Create LangChain Document
LLM_Loader = DataFrameLoader(NASA, page_content_column="answer")
LLM_Data = LLM_Loader.load()
LLM_Data[1].model_dump()

{'id': None,
 'metadata': {'category': 'NASA',
  'question': 'When was NASA founded?',
  'source': 'https://www.nasa.gov',
  'text': 'When was NASA founded? NASA was founded on July 29, 1958, through the National Aeronautics and Space Act.'},
 'page_content': 'NASA was founded on July 29, 1958, through the National Aeronautics and Space Act.',
 'type': 'Document'}

In [9]:
len(LLM_Data)

377

# **Chunking**

In [7]:
# Text Splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,
    separators=["\n", ".", " "]
)
Chunk = text_splitter.split_documents(LLM_Data)
Chunk

[Document(metadata={'category': 'NASA', 'question': 'What is NASA?', 'source': 'https://www.nasa.gov', 'text': 'What is NASA? NASA is the United States government agency responsible for space exploration, aeronautics research, and scientific discovery. It was established in 1958.'}, page_content='NASA is the United States government agency responsible for space exploration, aeronautics research, and scientific discovery. It was established in 1958.'),
 Document(metadata={'category': 'NASA', 'question': 'When was NASA founded?', 'source': 'https://www.nasa.gov', 'text': 'When was NASA founded? NASA was founded on July 29, 1958, through the National Aeronautics and Space Act.'}, page_content='NASA was founded on July 29, 1958, through the National Aeronautics and Space Act.'),
 Document(metadata={'category': 'NASA', 'question': 'What does NASA stand for?', 'source': 'https://www.nasa.gov', 'text': 'What does NASA stand for? NASA stands for National Aeronautics and Space Administration.'}

In [11]:
Chunk[0] ,' -----> ', len(Chunk)

(Document(metadata={'category': 'NASA', 'question': 'What is NASA?', 'answer': 'NASA is the United States government agency responsible for space exploration, aeronautics research, and scientific discovery. It was established in 1958.', 'source': 'https://www.nasa.gov'}, page_content='What is NASA? NASA is the United States government agency responsible for space exploration, aeronautics research, and scientific discovery. It was established in 1958.'),
 ' -----> ',
 377)

In [12]:
Chunk[165].page_content

'What is SAFER in EVA operations? The Simplified Aid for EVA Rescue is a small backpack with propulsion jets that allows astronauts to maneuver back to safety in emergencies.'

In [8]:
DOC_Text = [doc.page_content for doc in Chunk]
DOC_Text[20]

'Astronauts practice repairing ISS components, replacing hardware, installing upgrades, moving along handrails, and using specialized tools underwater.'

# **Embedding and Indexing**

In [9]:
# Create embeddings
Embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2",
                                   model_kwargs={'device': 'cpu'},
                                   encode_kwargs={'normalize_embeddings': True})
Embeddings

  Embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2",


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': True}, multi_process=False, show_progress=False)

In [10]:
# Create Vector Database
Vector_Database = FAISS.from_documents(Chunk, Embeddings)
Vector_Database

<langchain_community.vectorstores.faiss.FAISS at 0x7947a02fc150>

In [11]:
# Test Search
Query_text = 'Can people with allergies apply for NASA astronaut selection'

sim_docs = Vector_Database.similarity_search(Query_text, k=1)
sim_docs

[Document(id='e90a4837-cdfc-4924-af42-150f16a5a66c', metadata={'category': 'NASA', 'question': 'Can people with allergies apply for NASA astronaut selection?', 'source': 'https://en.wikipedia.org/wiki/List_of_astronauts_by_year_of_selection', 'text': 'Can people with allergies apply for NASA astronaut selection? Severe allergies or conditions requiring regular medication may disqualify applicants.'}, page_content='Severe allergies or conditions requiring regular medication may disqualify applicants.')]

# **Create Retriever**

In [12]:
# Retriever
retriever = Vector_Database.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7947a02fc150>, search_kwargs={'k': 3})

# **Large Language Model (LLM)**

In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_name = "HuggingFaceH4/zephyr-7b-beta"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",  
    trust_remote_code=True,
)

# pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.3,
    repetition_penalty=1.1
)


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0


In [14]:
# Hugging Face Pipeline LangChine
LLM = HuggingFacePipeline(pipeline=pipe)
LLM

  LLM = HuggingFacePipeline(pipeline=pipe)


HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7947a2330610>)

In [15]:
# Create retriever
retriever = Vector_Database.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 2}
)
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7947a02fc150>, search_kwargs={'k': 2})

In [16]:
# Prompt Template
template = """
You are a NASA research assistant, specializing in NASA research, the International Space Station (ISS),
the Neutral Buoyancy Laboratory (NBL), and spaceflight.

Your task:
- Use ONLY the context provided to answer the question.
- If multiple pieces of information are found, choose ONLY the one most directly related to the question.
- Answer concisely in one sentence.
- If no relevant information exists, say "I'm not sure".

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:"""

prompt = PromptTemplate(
            input_variables=["context", "question"],
            template=template
        )
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='\nYou are a NASA research assistant, specializing in NASA research, the International Space Station (ISS),\nthe Neutral Buoyancy Laboratory (NBL), and spaceflight.\n\nYour task:\n- Use ONLY the context provided to answer the question.\n- If multiple pieces of information are found, choose ONLY the one most directly related to the question.\n- Answer concisely in one sentence.\n- If no relevant information exists, say "I\'m not sure".\n\nCONTEXT:\n{context}\n\nQUESTION:\n{question}\n\nANSWER:')

In [17]:
# Build RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=LLM,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True
)
qa_chain

RetrievalQA(verbose=False, combine_documents_chain=StuffDocumentsChain(verbose=False, llm_chain=LLMChain(verbose=False, prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='\nYou are a NASA research assistant, specializing in NASA research, the International Space Station (ISS),\nthe Neutral Buoyancy Laboratory (NBL), and spaceflight.\n\nYour task:\n- Use ONLY the context provided to answer the question.\n- If multiple pieces of information are found, choose ONLY the one most directly related to the question.\n- Answer concisely in one sentence.\n- If no relevant information exists, say "I\'m not sure".\n\nCONTEXT:\n{context}\n\nQUESTION:\n{question}\n\nANSWER:'), llm=HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7947a2330610>), output_parser=StrOutputParser(), llm_kwargs={}), document_prompt=PromptTemplate(input_variables=['page_content'], input_types={}, partial_variabl

In [19]:
# Test System
def ask_question(query):
    response = qa_chain.invoke({"query": query})

    answer = response.get("answer") or response.get("result") or response.get("output_text")

    print("Answer:", answer if answer else "⚠️ No answer key found in response.")
    print("\nSources:")

    unique_sources = set()
    for doc in response["source_documents"]:
        source = doc.metadata.get("source", "❓Unknown source")
        category = doc.metadata.get("category", "").strip()

        unique_sources.add((source, category))

    for i, (src, cat) in enumerate(unique_sources, 1):
        print(f"{i}. 🔗 {src} {'| Q: ' + cat if cat else ''}")

In [32]:
# How do astronauts learn to operate robotic systems
# How big is the NBL?
# What does EVA stand for?
# Can people with allergies apply for NASA astronaut selection
# How much professional experience is required for astronaut candidates
# What are the main strengths of NBL training
# How do astronauts move underwater in the NBL
# What are common astronaut tasks practiced underwater?
# What advantages does the ISS orbit provide for Earth observation?
    
###########################################################################################
    
# Tell me more about NASA
# Tell me more about NBL
# Tell me more about ISS
# Tell me more about Space

ask_question("What advantages does the ISS orbit provide for Earth observation?")

Answer: 
You are a NASA research assistant, specializing in NASA research, the International Space Station (ISS),
the Neutral Buoyancy Laboratory (NBL), and spaceflight.

Your task:
- Use ONLY the context provided to answer the question.
- If multiple pieces of information are found, choose ONLY the one most directly related to the question.
- Answer concisely in one sentence.
- If no relevant information exists, say "I'm not sure".

CONTEXT:
At 250 miles altitude, orbiting at 17,500 mph, the ISS provides detailed views of Earth’s features, weather, and disasters from multiple angles in both daylight and darkness.

The ISS provides access to long-term microgravity, exposure to space, a unique orbit, and hands-on operation by crew members.

QUESTION:
What advantages does the ISS orbit provide for Earth observation?

ANSWER:
The ISS's high altitude and fast orbital speed offer detailed views of Earth's features, weather, and disasters from multiple angles in both daylight and darkness fo

In [21]:
docs = retriever.get_relevant_documents("How much professional experience is required for astronaut candidates")
for i, d in enumerate(docs, 1):
    print(f"{i}. {d.page_content[:300]}...")
    print("Source:", d.metadata.get("source"))

1. Astronaut candidates study physics, mathematics, astronomy, orbital mechanics, meteorology, geology, oceanography, life sciences, technology, and engineering. They complete 16 specialized technical courses....
Source: https://www.nasa.gov/wp-content/uploads/2017/05/606877main_fs-2011-11-057-jsc-astro_trng.pdf
2. Astronaut training integrates applied sciences and human adaptation, focusing on precision, decision-making, and engineering execution across all mission stages....
Source: https://www.nasa.gov/


  docs = retriever.get_relevant_documents("How much professional experience is required for astronaut candidates")


# **Save Material**

In [None]:
# Save Vector Database
Vector_Database.save_local("VectorDatabase")

In [40]:
# Save Model
save_dir = "saved_model"
tokenizer.save_pretrained(save_dir)
model.save_pretrained(save_dir)



Saving checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

SafetensorError: Error while serializing: IoError(Os { code: 28, kind: StorageFull, message: "No space left on device" })