In [1]:
import os
import logging
import pandas as pd
import numpy as np
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import torch
import spacy
from transformers import AutoTokenizer, AutoModel, pipeline
from neo4j import GraphDatabase
from typing import Any, Dict
from pydantic import BaseModel, validator, Field
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
from langchain_core.documents import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import Ollama

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load and preprocess the per-airline source data (CSV summaries, text details, PDF reports)
def _read_text_chunks(path):
    """Return the non-blank, blank-line-separated chunks of a text file."""
    with open(path, "r", encoding="utf-8") as file:
        chunks = file.read().split('\n\n')
    # Blank chunks (e.g. from trailing newlines) would otherwise be embedded as empty documents.
    return [chunk for chunk in chunks if chunk.strip()]


def _extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined with newlines."""
    with open(pdf_path, "rb") as file:
        reader = PdfReader(file)
        # Guard against pages with no extractable text so the join below cannot fail.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)


def _load_airline_data(data_dir, prefix):
    """Build the combined text list for one airline from its CSV, TXT and PDF files."""
    dates_locations = pd.read_csv(os.path.join(data_dir, f"{prefix}_dates_locations.csv"))
    # Missing summaries are NaN floats, which the tokenizer cannot embed; drop them.
    summaries = dates_locations['summary'].dropna().astype(str).tolist()
    aircraft_details = _read_text_chunks(os.path.join(data_dir, f"{prefix}_aircraft_details.txt"))
    accident_outcomes = _extract_text_from_pdf(os.path.join(data_dir, f"{prefix}_accident_outcomes.pdf"))
    return summaries + aircraft_details + [accident_outcomes]


def load_data(data_dir="data"):
    """Load the United, Alliance and Air Canada corpora.

    For each airline the result is a list of strings made of, in order: the
    ``summary`` column of ``<prefix>_dates_locations.csv`` (missing values
    dropped), the blank-line-separated chunks of
    ``<prefix>_aircraft_details.txt`` (blank chunks dropped), and the full
    text of ``<prefix>_accident_outcomes.pdf`` as one final entry.

    Args:
        data_dir: Directory containing the source files. Defaults to ``"data"``.

    Returns:
        A tuple ``(united_data, alliance_data, air_canada_data)`` of string lists.

    Raises:
        FileNotFoundError: If any expected file is missing.
        KeyError: If a CSV file has no ``summary`` column.
    """
    united_data = _load_airline_data(data_dir, "united")
    alliance_data = _load_airline_data(data_dir, "alliance")
    air_canada_data = _load_airline_data(data_dir, "air_canada")

    return united_data, alliance_data, air_canada_data

# Load the three per-airline text lists returned by load_data()
# (tuple order: United, Alliance, Air Canada).
united_data, alliance_data, air_canada_data = load_data()

In [3]:
# Configure logging. At WARNING level, INFO messages emitted elsewhere in this
# notebook (via `logger.info`) are not displayed.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

# Load models:
#   nlp           - spaCy pipeline, used for named-entity extraction from queries
#   tokenizer/model - MiniLM encoder pair, used by embed_text() to build vectors
#   rel_extractor - zero-shot classification pipeline (not referenced by the
#                   other cells visible in this notebook)
nlp = spacy.load("en_core_web_sm")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
rel_extractor = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Load Hugging Face local embeddings (same model name as the tokenizer/model pair above)
hugging_face_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Initialize the generative model (served through Ollama)
generative_model = Ollama(model="gemma:7b")

# Define a prompt template. `context` is the only input variable, so anything the
# model should see -- including the user's question -- has to be part of the
# context string supplied by the caller.
prompt_template = PromptTemplate(
    template="""
    You are an AI assistant that specializes in providing detailed information about airline accidents. 
    When given a query about a specific flight, you should:

    1. Identify the flight number and any other relevant details from the query.
    2. Retrieve specific information about the flight from the provided context, including any relevant accidents or incidents.
    3. Summarize the information in a clear and concise manner.
    4. If there are multiple incidents related to the flight, provide details on each incident separately.
    5. Ensure the response is focused on the specific flight mentioned in the query.

    Use the provided context to generate the response and avoid including unrelated information.

    Context:
    {context}
    """,
    input_variables=["context"]
)

  warn_deprecated(


In [4]:
def embed_text(text):
    """Embed text with the module-level `tokenizer` / `model` using masked mean pooling.

    Args:
        text: A string, or a list of strings, to embed. Input longer than the
            tokenizer's maximum length is truncated.

    Returns:
        A NumPy array with one row per input text; each row is the mean of the
        last-hidden-state token vectors of that text.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Weight tokens by the attention mask so padding added for batched input
    # does not dilute the average. For a single string the mask is all ones,
    # so this equals a plain mean over tokens.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).cpu().numpy()

# Minimal in-memory document store. NOTE: this definition shadows the
# `InMemoryDocstore` imported from langchain in the imports cell.
class InMemoryDocstore:
    """Dictionary-backed document store keyed by string document id.

    Attributes:
        documents: Mapping from document id to document object.
    """

    def __init__(self, documents):
        self.documents = documents

    def get_document(self, doc_id):
        """Return the document stored under `doc_id`, or None if it is absent."""
        return self.documents.get(doc_id)

    def search(self, search):
        """Look up a document by id using the LangChain docstore convention.

        The LangChain FAISS wrapper resolves hits through `docstore.search(id)`;
        without this method its similarity-search helpers cannot use this store.

        Returns:
            The stored document, or the string "ID <id> not found." when missing.
        """
        document = self.get_document(search)
        if document is None:
            return f"ID {search} not found."
        return document

In [5]:
def vectorize_and_index(data_list, index_path):
    """Embed every text in `data_list` and write a flat L2 FAISS index to disk.

    Args:
        data_list: Iterable of texts; each is embedded with `embed_text`.
        index_path: Destination file for the serialized index. Missing parent
            directories are created.

    Raises:
        ValueError: If `data_list` contains no texts.
    """
    vectors = [embed_text(text) for text in data_list]
    if not vectors:
        # np.vstack would fail with a less helpful message on an empty list.
        raise ValueError("data_list is empty; nothing to index")

    # FAISS expects a contiguous float32 matrix.
    embeddings = np.ascontiguousarray(np.vstack(vectors), dtype='float32')
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # faiss.write_index does not create directories, so make sure the target exists.
    index_dir = os.path.dirname(index_path)
    if index_dir:
        os.makedirs(index_dir, exist_ok=True)
    faiss.write_index(index, index_path)

# Embed each airline corpus and persist its FAISS index to disk
vectorize_and_index(data_list=united_data, index_path="faiss_indexes/united_faiss.index")
vectorize_and_index(data_list=alliance_data, index_path="faiss_indexes/alliance_faiss.index")
vectorize_and_index(data_list=air_canada_data, index_path="faiss_indexes/air_canada_faiss.index")

# Wrap each corpus in a docstore keyed by the text's position, as a string
united_docstore = InMemoryDocstore(
    dict((str(position), Document(page_content=body)) for position, body in enumerate(united_data))
)
alliance_docstore = InMemoryDocstore(
    dict((str(position), Document(page_content=body)) for position, body in enumerate(alliance_data))
)
air_canada_docstore = InMemoryDocstore(
    dict((str(position), Document(page_content=body)) for position, body in enumerate(air_canada_data))
)

def create_faiss_retriever(index_path, docstore, data_list):
    """Load a saved FAISS index and wrap it, with its docstore, in a FAISS store.

    Args:
        index_path: Path of the serialized FAISS index to read.
        docstore: Document store whose ids are the stringified row positions.
        data_list: The indexed texts; only its length is used, to build the
            position -> docstore-id mapping.

    Returns:
        A `FAISS` object that uses `embed_text` as its embedding function.
    """
    loaded_index = faiss.read_index(index_path)

    # Row n of the index corresponds to docstore id "n".
    id_lookup = {}
    for position in range(len(data_list)):
        id_lookup[position] = str(position)

    return FAISS(
        embedding_function=embed_text,
        index=loaded_index,
        docstore=docstore,
        index_to_docstore_id=id_lookup,
    )

# One FAISS-backed retriever per airline, each loaded from its saved index file
united_faiss_retriever = create_faiss_retriever(
    index_path="faiss_indexes/united_faiss.index",
    docstore=united_docstore,
    data_list=united_data,
)
alliance_faiss_retriever = create_faiss_retriever(
    index_path="faiss_indexes/alliance_faiss.index",
    docstore=alliance_docstore,
    data_list=alliance_data,
)
air_canada_faiss_retriever = create_faiss_retriever(
    index_path="faiss_indexes/air_canada_faiss.index",
    docstore=air_canada_docstore,
    data_list=air_canada_data,
)



In [6]:
class GraphDBClient:
    """Small convenience wrapper around a Neo4j driver.

    Attributes:
        driver: The driver object created by `GraphDatabase.driver`.
    """

    def __init__(self, uri, user, password):
        """Create the underlying driver for `uri` using basic (user, password) auth."""
        credentials = (user, password)
        self.driver = GraphDatabase.driver(uri, auth=credentials)

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def query(self, query, parameters=None):
        """Run `query` in a fresh session and return all resulting records as a list.

        Args:
            query: Query text passed to `session.run`.
            parameters: Optional query parameters passed to `session.run`.
        """
        with self.driver.session() as db_session:
            # Materialize the records while the session is still open.
            records = list(db_session.run(query, parameters))
        return records

In [7]:
def query_knowledge_graph(client, keyword):
    """Find graph relationships whose source or target entity name contains `keyword`.

    The match is a case-insensitive substring test. The keyword is passed as a
    plain query parameter rather than spliced into a regular expression, so
    characters such as ``(``, ``.`` or ``*`` in the keyword are matched
    literally instead of being interpreted as regex syntax.

    Args:
        client: Object exposing ``query(query, parameters=...)``.
        keyword: Text to look for inside entity names.

    Returns:
        A list of records with ``entity``, ``relationship`` and
        ``related_entity`` fields; an empty list if the query fails (the error
        is logged).
    """
    query = """
    MATCH (e:Entity)-[r]->(related:Entity)
    WHERE toLower(e.name) CONTAINS toLower($name)
       OR toLower(related.name) CONTAINS toLower($name)
    RETURN e.name AS entity, type(r) AS relationship, related.name AS related_entity
    """
    try:
        return list(client.query(query, parameters={"name": keyword}))
    except Exception as e:
        # Best-effort lookup: a graph failure should not abort the whole request.
        logger.error(f"Error querying knowledge graph: {e}")
        return []

In [8]:
class CustomMultiRetriever(BaseModel):
    """Runs one query against several per-airline FAISS stores.

    Fields:
        faiss_retrievers: Non-empty mapping of airline name -> FAISS store.
        knowledge_graph_client: Graph client kept alongside the retrievers
            (not used by the methods of this class).
    """
    faiss_retrievers: Dict[str, FAISS] = Field(..., description="Dictionary of FAISS retrievers")
    knowledge_graph_client: Any

    # NOTE(review): Pydantic V2 reports this V1-style `@validator` as deprecated
    # (see the warning in this cell's output); migrate to `@field_validator`
    # once the notebook's pydantic imports are updated.
    @validator('faiss_retrievers')
    def check_faiss_retrievers(cls, value):
        """Reject an empty retriever mapping."""
        if not value:
            raise ValueError("FAISS retrievers cannot be empty")
        return value

    class Config:
        # FAISS is not a pydantic type, so arbitrary field types must be allowed.
        arbitrary_types_allowed = True

    def get_relevant_documents(self, query, k=10):
        """Return the nearest documents to `query` from every retriever.

        Args:
            query: Query text; embedded once with `embed_text`.
            k: Maximum number of neighbours requested per retriever (default 10).

        Returns:
            Dict mapping each airline name to the list of documents found in its
            docstore, nearest first.
        """
        # FAISS expects a contiguous float32 matrix of shape (1, dim).
        query_vector = np.ascontiguousarray(embed_text(query), dtype='float32').reshape(1, -1)
        results = {}
        for airline, retriever in self.faiss_retrievers.items():
            _distances, indices = retriever.index.search(query_vector, k=k)
            docs = []
            for idx in indices[0]:
                # FAISS pads the result with -1 when the index holds fewer than k vectors.
                if idx < 0:
                    continue
                doc = retriever.docstore.get_document(str(idx))
                if doc is not None:
                    docs.append(doc)
            results[airline] = docs
        return results


/tmp/ipykernel_120478/726269644.py:5: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.7/migration/
  @validator('faiss_retrievers')


In [9]:
def integrate_graph_and_docs(graph_results, retrieved_docs):
    """Render graph relationships and retrieved documents as one context string.

    Args:
        graph_results: Records supporting ``.get`` with the keys ``entity``,
            ``relationship`` and ``related_entity``; may be empty or None.
        retrieved_docs: Objects with a ``page_content`` string; may be empty or None.

    Returns:
        Text starting with "Retrieved Context:", followed by one
        "<entity> <relationship> <related_entity>. " fragment per graph record
        and one line per document, with a fallback sentence for each missing part.
    """
    pieces = ["Retrieved Context:\n"]

    if not graph_results:
        pieces.append("No relevant data found in the knowledge graph.\n")
    else:
        for row in graph_results:
            subject = row.get('entity', 'Unknown Entity')
            predicate = row.get('relationship', 'Unknown Relationship')
            target = row.get('related_entity', 'Unknown Related Entity')
            pieces.append(f"{subject} {predicate} {target}. ")

    if not retrieved_docs:
        pieces.append("No relevant data found in the FAISS retrievers.\n")
    else:
        pieces.append("\nFrom FAISS:\n")
        pieces.extend(document.page_content + "\n" for document in retrieved_docs)

    return "".join(pieces)

In [10]:
def extract_entities(query):
    """Return the text of named entities in `query` that have a label of interest.

    The query is run through the module-level spaCy pipeline `nlp`; only
    entities labelled ORG, GPE, NORP, PRODUCT or EVENT are kept, in the order
    they appear.
    """
    wanted_labels = ("ORG", "GPE", "NORP", "PRODUCT", "EVENT")
    keywords = []
    for span in nlp(query).ents:
        if span.label_ in wanted_labels:
            keywords.append(span.text)
    return keywords

In [11]:
def process_query(user_query, custom_multi_retriever, graphdb_client, llm_chain):
    """Answer `user_query` from knowledge-graph facts and FAISS-retrieved documents.

    Steps: extract entities from the query, look each one up in the knowledge
    graph, retrieve nearest documents from every FAISS store, merge everything
    into a single context string, and run the LLM chain on it.

    Args:
        user_query: The user's question.
        custom_multi_retriever: Object exposing ``get_relevant_documents(query)``
            returning a dict of airline name -> list of documents.
        graphdb_client: Graph client handed to `query_knowledge_graph`.
        llm_chain: Callable chain invoked with ``{"context": <str>}``.

    Returns:
        Whatever `llm_chain` returns for the assembled context.

    Raises:
        Exception: Any error raised along the way is logged and re-raised.
    """
    try:
        entities = extract_entities(user_query)
        logger.info(f"Extracted entities: {entities}")

        graph_results = []
        unmatched_notes = []
        for entity in entities:
            entity_results = query_knowledge_graph(graphdb_client, entity)
            if entity_results:
                graph_results.extend(entity_results)
            else:
                unmatched_notes.append(f"No relevant data found in the knowledge graph for entity: {entity}.\n")

        faiss_docs = custom_multi_retriever.get_relevant_documents(user_query)
        retrieved_docs = [doc for docs in faiss_docs.values() for doc in docs]

        # The chain is called with `context` only, so the question itself must
        # be part of the context or the model never sees what was asked.
        # Graph facts and documents are formatted in a single call so the
        # "no FAISS data" fallback appears only when nothing was retrieved.
        final_context = (
            f"User query: {user_query}\n\n"
            + "".join(unmatched_notes)
            + integrate_graph_and_docs(graph_results, retrieved_docs)
        )

        response = llm_chain({"context": final_context})
        return response
    except Exception as e:
        logger.error(f"Error processing query: {e}")
        raise

In [12]:
# Initialize the connection to Neo4j once; credentials come from environment variables.
# The same client is shared by the retriever and by process_query, so only one
# driver is opened.
graphdb_client = GraphDBClient(
    uri=os.getenv('NEO4J_URI'),
    user=os.getenv('NEO4J_USERNAME'),
    password=os.getenv('NEO4J_PASSWORD'),
)

custom_multi_retriever = CustomMultiRetriever(
    faiss_retrievers={
        "United Airlines": united_faiss_retriever,
        "Alliance Airlines": alliance_faiss_retriever,
        "Air Canada": air_canada_faiss_retriever
    },
    knowledge_graph_client=graphdb_client
)

# Create the LLMChain
llm_chain = LLMChain(prompt=prompt_template, llm=generative_model)

# Example query
user_query = "Tell me about what aircraft FLIGHT 624 faced accident"
response = process_query(user_query, custom_multi_retriever, graphdb_client, llm_chain)
print(f"Response: {response['text']}")


  warn_deprecated(
  warn_deprecated(


In [16]:
# Second example query; reuses the retriever, graph client and LLM chain
# created in the previous cell and prints the `text` field of the response.
user_query = "Tell me about what aircraft FLIGHT 328"
response = process_query(user_query, custom_multi_retriever, graphdb_client, llm_chain)
print(f"Response: {response['text']}")

Response: **Query:** Provide information about United Airlines Flight 328.

**Summary:**

United Airlines Flight 328 experienced two incidents:

**1. Right Engine Failure (Accident ID: 3)**

* Shortly after takeoff, the aircraft encountered a right engine failure.
* Debris from the engine fell onto residential areas below.
* The aircraft returned to Denver International Airport for an emergency landing.
* There were no injuries among the passengers or crew.


**2. Emergency Landing (Accident ID: 3)**

* The flight experienced an unspecified engine issue and declared an emergency landing.
* The aircraft returned to Denver International Airport without further incident.
* The incident led to inspections and modifications to similar engines to prevent future.

**Note:** This response provides a comprehensive summary of the incident.


'**Flight 328 Summary:**\n\nUnited Airlines Flight 328 experienced a right engine failure shortly after takeoff from Denver International Airport. Debris from the engine fell onto residential areas below. The aircraft returned to the airport for an emergency landing without injuries among the passengers or crew.\n\n**Specific Incidents:**\n\n**1. Engine Failure:**\n- The right engine malfunctioned shortly after takeoff.\n- Debris fell onto surrounding residential areas.\n\n**2. Emergency Landing:**\n- The flight returned to Denver International Airport for an emergency landing.\n- No injuries were reported among passengers or crew.'