## Extract out the details from the user input through LLMs. 

In [31]:
import json
from pydantic import BaseModel, Field
from typing import List
import os
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List

''' 
This script demonstrates how to use the langchain library to generate a detailed flight data of a random realistic user with a diverse background. The data from which you need to generate the flight data is as follows: I want to go through a flight from Beijing Capital International Airport to San Francisco International Airport  which should have departure time of 11 may 20204, Economy class, should have premium meal service and preferably boeing airplane. If some of the fields are not specified by the user, enter the value as 'Not specified'
'''

class FlightData(BaseModel):
    departure_airport: str = Field(
        description="The airport from which the flight is departing"
    )
    departure_time: str = Field(description="The departure time of the flight")
    arrival_airport: str = Field(
        description="The airport at which the flight is arriving"
    )
    airline: str = Field(description="The airline operating the flight")
    airplane: str = Field(description="The type of airplane for the flight")
    duration: int = Field(description="The duration of the flight in minutes")
    flight_number: str = Field(description="The flight number")
    travel_class: str = Field(description="The class of travel for the flight")
    extensions: List[str] = Field(
        description="Additional features or services available on the flight"
    )


# Instantiate the parser with the new model.
parser = PydanticOutputParser(pydantic_object=FlightData)

# Update the prompt to match the new query and desired format.
prompt = ChatPromptTemplate(
    messages=[
        HumanMessagePromptTemplate.from_template(
            "answer the users question as best as possible.\n{format_instructions}\n{question}"
        )
    ],
    input_variables=["question"],
    partial_variables={
        "format_instructions": parser.get_format_instructions(),
    },
)

chat_model = ChatOpenAI(
    model="gpt-3.5-turbo", openai_api_key=os.getenv("OPENAI_API_KEY"), max_tokens=1000
)

query_data = "I want to go through a flight from Beijing Capital International Airport to San Francisco International Airport  which should have departure time of 11 may 20204, Economy class, should have premium meal service and preferably boeing airplane"

# Generate the input using the updated prompt.
user_query = f"Generate a detailed flight data of a random realistic user with a diverse background, The data from which you need to generate the flight data is as follows: {query_data}. If some of the fields are not specified by the user, enter the value as 'Not specified'"
_input = prompt.format_prompt(question=user_query)

output = chat_model(_input.to_messages())

# Assuming 'output.content' is a JSON string
query = json.loads(output.content)

print(query)

{'departure_airport': 'Beijing Capital International Airport', 'departure_time': '11 May 20204', 'arrival_airport': 'San Francisco International Airport', 'airline': 'Not specified', 'airplane': 'Boeing', 'duration': 720, 'flight_number': 'Not specified', 'travel_class': 'Economy', 'extensions': ['Premium Meal Service']}


## Using google serp API to make requests to get the available flights.

In [32]:
import json
from serpapi import GoogleSearch
serp_api_key = os.getenv("SERP_API_KEY")

# Set the search parameters
params = {
    "type": "2",
    "engine": "google_flights",
    "departure_id": "PEK",
    "arrival_id": "AUS",
    "outbound_date": "2024-05-12",
    "api_key": serp_api_key,
}

# Perform the search
search = GoogleSearch(params)

# Get the results
results = search.get_dict()

# Extract the flight data
extracted_flights = []

# Extract the flight data
for index, flights in enumerate(results["best_flights"]):

    # Extract the flight data
    item = flights["flights"]

    # Add the flight data to the list
    for key, flight in enumerate(item):

        # Add the flight data to the list
        extracted_flights.append(
            {
                "departure_airport": flight["departure_airport"]["name"],
                "departure_time": flight["departure_airport"]["time"],
                "arrival_airport": flight["arrival_airport"]["name"],
                "airline": flight["airline"],
                "airplane": flight["airplane"],
                "duration": flight["duration"],
                "flight_number": flight["flight_number"],
                "travel_class": flight["travel_class"],
                "extensions": flight["extensions"],
            }
        )

# Extract the other flight data
for index, flights in enumerate(results["other_flights"]):

    # Extract the flight data
    item = flights["flights"]

    # Add the flight data to the list
    for key, flight in enumerate(item):

        # Add the flight data to the list
        extracted_flights.append(
            {
                "departure_airport": flight["departure_airport"]["name"],
                "departure_time": flight["departure_airport"]["time"],
                "arrival_airport": flight["arrival_airport"]["name"],
                "airline": flight["airline"],
                "airplane": flight["airplane"],
                "duration": flight["duration"],
                "flight_number": flight["flight_number"],
                "travel_class": flight["travel_class"],
                "extensions": flight["extensions"],
            }
        )

# Save the extracted flight data to a JSON file
with open("flights.json", "w") as json_file:
    json.dump(extracted_flights, json_file)

## Upsert the document into the pinecone vector database.

In [33]:
import openai
from pinecone import Pinecone
import os
import json
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

client = OpenAI()
# Initialize OpenAI API
openai.api_key = os.getenv("OPENAI_API_KEY")

# Initialize Pinecone
pinecone = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pinecone.Index("flight")

# Define your Pinecone index name
index_name = "flight"

# Load your JSON data
with open("flights.json", "r") as f:
    data = json.load(f)

# Create embeddings for the data
embeddings = []
for idx, item in enumerate(data):
    flight_data = {}
    duration = str()

    item = json.loads(json.dumps(item))
 
    
    flight_data["airline"] = item["airline"]
    flight_data["arrival_airport"] = item["arrival_airport"]
    flight_data["departure_airport"] = item["departure_airport"]
    flight_data["departure_time"]=item["departure_time"]
    flight_data["airplane"]= item["airplane"]
    flight_data["travel_class"] = item["travel_class"]
    flight_data["extensions"] = "".join(item["extensions"])
    if "overnight" in item:
        flight_data["overnight"] = "Yes" if item["overnight"] == True else "No"
    if "legreem" in item:
        flight_data["legroom"] = item["legroom"]
    flight_data["duration"] = f"Total duration is: {item["duration"]}"


   
    response = client.embeddings.create(
        input=json.dumps(flight_data), model="text-embedding-3-small"
    )
    # print(response)

    data = {}
    data["id"] = str(idx + 1)
    data["values"] = response.data[0].embedding
    data["metadata"] = {"flight": item["flight_number"], "departure_airport":item["departure_airport"], "arrival_airport":item["arrival_airport"]}

    embeddings.append(data)


# Upsert embeddings into Pinecone
index.upsert(vectors=embeddings)

{'upserted_count': 31}

## Make a similarity search of top k flights

In [34]:
# Query the index with the new data
response = client.embeddings.create(
    input=json.dumps(query), model="text-embedding-3-small"
)

# Get the query vector
query_vector = response.data[0].embedding

# Query the index with the new data
response = index.query(
    vector=query_vector,
    top_k=5,
    include_metadata=True,
    filter={
        "departure_airport": query["departure_airport"],
        "arrival_airport": query["arrival_airport"],
    },
)

print(response)
# Get the score of the matching vector
score = response["matches"][0]["score"]
print(f"Matching vector score: {score}")

{'matches': [{'id': '1',
              'metadata': {'arrival_airport': 'San Francisco International '
                                              'Airport',
                           'departure_airport': 'Beijing Capital International '
                                                'Airport',
                           'flight': 'UA 889'},
              'score': 0.669202209,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}
Matching vector score: 0.669202209


In [35]:
# Sample query using a description
query = {
    "duration": 300,
    "airplane": "Not specific",
    "airline": "Not specific",
    "travel_class": "Economy",
    "flight_number": "Not specified",
    "legroom": "less than 50",
    "extensions": [
        "Average legroom less than 40",
        "Wi-Fi for a fee",
        "Carbon emission less than 500 kg",
    ],
    # "overnight": True,
}

response = client.embeddings.create(
    input=json.dumps(query), model="text-embedding-3-small"
)
query_vector = response.data[0].embedding

# Query the index with the new data
response = index.query(vector=query_vector, top_k=5, include_metadata=True)

print(response)
# Get the score of the matching vector
score = response["matches"][0]["score"]
print(f"Matching vector score: {score}")

{'matches': [{'id': '27',
              'metadata': {'arrival_airport': 'Hartsfield-Jackson Atlanta '
                                              'International Airport',
                           'departure_airport': 'Incheon International Airport',
                           'flight': 'DL 26'},
              'score': 0.823198855,
              'values': []},
             {'id': '25',
              'metadata': {'arrival_airport': 'Austin-Bergstrom International '
                                              'Airport',
                           'departure_airport': 'San Francisco International '
                                                'Airport',
                           'flight': 'UA 1116'},
              'score': 0.821313739,
              'values': []},
             {'id': '20',
              'metadata': {'arrival_airport': 'Austin-Bergstrom International '
                                              'Airport',
                           'departure_airport': 'Los Ang

In [36]:
# Sample query using a description
query = {
    "duration": 300,
    "travel_class": "Economy",
    "legroom": "less than 50",
    "extensions": [
        "Average legroom less than 40",
        "Video on demand",
        "Carbon emission more than 230 kg",
    ],
    # "overnight": True,
}

response = client.embeddings.create(
    input=json.dumps(query), model="text-embedding-3-small"
)
query_vector = response.data[0].embedding

# Query the index with the new data
response = index.query(vector=query_vector, top_k=5, include_metadata=True)

print(response)
# Get the score of the matching vector
score = response["matches"][0]["score"]
print(f"Matching vector score: {score}")

{'matches': [{'id': '27',
              'metadata': {'arrival_airport': 'Hartsfield-Jackson Atlanta '
                                              'International Airport',
                           'departure_airport': 'Incheon International Airport',
                           'flight': 'DL 26'},
              'score': 0.77244,
              'values': []},
             {'id': '28',
              'metadata': {'arrival_airport': 'Austin-Bergstrom International '
                                              'Airport',
                           'departure_airport': 'Hartsfield-Jackson Atlanta '
                                                'International Airport',
                           'flight': 'DL 1345'},
              'score': 0.765375435,
              'values': []},
             {'id': '17',
              'metadata': {'arrival_airport': 'Austin-Bergstrom International '
                                              'Airport',
                           'departure_airport':

In [37]:
# Sample query using a description
query = {
    "duration": 300,
    "travel_class": "Economy",
    "legroom": "less than 50",
    "extensions": [
        "Average legroom less than 40",
        "Internet should be free",
        "Carbon emission should be less than 230 kg",
    ],
    # "overnight": True,
}

response = client.embeddings.create(
    input=json.dumps(query), model="text-embedding-3-small"
)
query_vector = response.data[0].embedding

# Query the index with the new data
response = index.query(vector=query_vector, top_k=1, include_metadata=True)

print(response)
# Get the score of the matching vector
score = response["matches"][0]["score"]
print(f"Matching vector score: {score}")

{'matches': [{'id': '17',
              'metadata': {'arrival_airport': 'Austin-Bergstrom International '
                                              'Airport',
                           'departure_airport': 'Los Angeles International '
                                                'Airport',
                           'flight': 'DL 692'},
              'score': 0.749123693,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}
Matching vector score: 0.749123693
