In [1]:
!pip install langchain-community
!pip install tiktoken
!pip install faiss-cpu

Collecting langchain-community
  Downloading langchain_community-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain-community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.6 (from langchain-community)
  Downloading langchain-0.3.7-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.15 (from langchain-community)
  Downloading langchain_core-0.3.15-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from datac

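Dataset: [Weather Long-term Time Series Forecasting](https://www.kaggle.com/datasets/alistairking/weather-long-term-time-series-forecasting) (Kaggle). The cells below expect the CSV to be available at `/content/weather-forecast.csv`.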

In [2]:
import pandas as pd

# Read the weather CSV into a DataFrame so its columns can be inspected
# before the retrieval pipeline is built.
data = pd.read_csv(filepath_or_buffer='/content/weather-forecast.csv')

In [3]:
# Display the column labels of the loaded frame to see which measurements it holds.
data.columns

Index(['date', 'p', 'T', 'Tpot', 'Tdew', 'rh', 'VPmax', 'VPact', 'VPdef', 'sh',
       'H2OC', 'rho', 'wv', 'max. wv', 'wd', 'rain', 'raining', 'SWDR', 'PAR',
       'max. PAR', 'Tlog'],
      dtype='object')

Vector store - A vector store is like a special filing cabinet for the number patterns (vectors) that an embedding model creates from text. When you later want to find similar text, the vector store quickly finds the stored vectors that are closest to the vector of your query. It's like organizing books by topic, but with numbers instead of labels.

In [5]:
# Import the integrations from `langchain_community` directly; the
# `langchain.document_loaders`, `langchain.embeddings.openai` and
# `langchain.vectorstores` paths are deprecated re-exports of these classes.
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv

# Load environment variables from a local .env file, if one exists.
# Presumably this is where OPENAI_API_KEY comes from, since OpenAIEmbeddings()
# is constructed below without an explicit key -- confirm for your setup.
load_dotenv()

# Load the CSV as one plain-text document, split it on newlines into chunks of
# at most 200 characters, embed each chunk and index the vectors in FAISS.
# NOTE(review): because the file is read as raw text, the header line ends up
# as its own chunk and the data rows carry no column names (see the printed
# chunks further down). A CSV-aware loader may retrieve better -- TODO evaluate.
raw_documents = TextLoader('/content/weather-forecast.csv').load()
text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)
# NOTE(review): this embeds every chunk through the OpenAI API; on a large CSV
# that is slow and billable -- consider sampling rows while experimenting.
db = FAISS.from_documents(documents, OpenAIEmbeddings())

  db = FAISS.from_documents(documents, OpenAIEmbeddings())


In [7]:
# Query the vector store directly (no LLM involved) and print the text of the
# first chunk in the returned list.
query = (
    "Identify periods of high heat stress for crops "
    "based on the temperature and humidity data."
)
docs = db.similarity_search(query=query)
print(docs[0].page_content)

date,p,T,Tpot,Tdew,rh,VPmax,VPact,VPdef,sh,H2OC,rho,wv,max. wv,wd,rain,raining,SWDR,PAR,max. PAR,Tlog


In [8]:
# Print the second chunk (index 1) produced by the splitter, metadata included.
print(documents[1])

page_content='2020-01-01 00:10:00,1008.89,0.71,273.18,-1.33,86.1,6.43,5.54,0.89,3.42,5.49,1280.62,1.02,1.6,224.3,0.0,0.0,0.0,0.0,0.0,11.45' metadata={'source': '/content/weather-forecast.csv'}


Retrievers - A retriever is like a smart librarian. When you ask a question, it goes to the vector store, finds the most relevant pieces of information, and brings them back. In other words, it fetches the stored chunks that are most useful for what you are looking for.

In [9]:
from langchain.chains import RetrievalQA
# `langchain.llms` is a deprecated re-export; the OpenAI wrapper lives in
# langchain_community, which this notebook installs.
from langchain_community.llms import OpenAI

# Wrap the FAISS index in the retriever interface that the QA chain expects.
retriever = db.as_retriever()

In [10]:
# Build a retrieval QA chain. With chain_type="stuff" the retrieved chunks are
# placed together into a single prompt for the LLM.
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)

query = "How does the average daily temperature (T) vary across different time?"
# `Chain.run` is deprecated; `invoke` takes the input dict and returns a dict
# whose "result" entry is the answer string that `run` used to return.
# NOTE(review): the LLM only sees the chunks returned by the retriever, not the
# whole CSV, so the answer describes those rows rather than a true daily average.
qa.invoke({"query": query})["result"]

  qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)
  qa.run(query)


' The average daily temperature varies across different time, with the highest temperature being 23.15 on July 13th at 14:00 and the lowest temperature being 12.64 on August 28th at 06:20.'

In [11]:
query = "Find the lowest Air temperature (°C) recorded in the dataset and the corresponding date."
# `invoke` replaces the deprecated `Chain.run`; "result" holds the answer text.
# NOTE(review): the chain answers from the retrieved chunks only, so a
# dataset-wide minimum is not guaranteed -- cross-check against the DataFrame
# (e.g. the row at data['T'].idxmin()).
qa.invoke({"query": query})["result"]

' The lowest Air temperature recorded in the dataset is -2.39°C on 2020-01-12 01:00:00.'

In [12]:
query = "Find maximum wind speed recorded in column max. wv"
# `invoke` replaces the deprecated `Chain.run`; "result" holds the answer text.
# NOTE(review): only the retrieved chunks reach the LLM, so this is the maximum
# among those rows at best -- cross-check with data['max. wv'].max().
qa.invoke({"query": query})["result"]

' The maximum wind speed recorded in column max. wv is 4.03.'

In [13]:
query = "Which Atmospheric pressure (mbar) will be high?"
# `invoke` replaces the deprecated `Chain.run`; "result" holds the answer text.
# NOTE(review): the answer is drawn from the retrieved chunks only; verify any
# extreme value against the full column (e.g. data['p'].max()).
qa.invoke({"query": query})["result"]

' The atmospheric pressure (mbar) that will be high is 1001.8.'