# Retrieving the data from OpenWeatherMap

In [10]:
from typing import List, Dict
import pandas as pd
import requests
from datetime import datetime , timedelta
def fetch_weather_data(city: str, api_key: str = 'YOUR_API_KEY',
                       timeout: float = 10.0) -> Dict:
    """Fetch weather forecast data from the OpenWeatherMap API.

    Args:
        city: City name, sent as the ``q`` query parameter.
        api_key: OpenWeatherMap API key, sent as ``appid``. The default is
            the placeholder value and must be replaced with a real key.
        timeout: Seconds to wait for the server before giving up. Without
            it ``requests`` would wait indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: If the API answers with a status code other than 200.
    """
    # HTTPS keeps the API key (sent in the query string) off the wire in
    # clear text.
    url = "https://api.openweathermap.org/data/2.5/forecast"
    params = {
        'q': city,
        'appid': api_key,
        'units': 'metric',  # temperatures in Celsius
    }
    response = requests.get(url, params=params, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to fetch weather data: {response.status_code}"
        )

    return response.json()

def process_weather_data(raw_data: Dict) -> pd.DataFrame:
    """Flatten a raw forecast payload into one DataFrame row per entry.

    Args:
        raw_data: Forecast payload. It must provide ``city.name``,
            ``city.country`` and a ``list`` of forecast entries.

    Returns:
        A DataFrame with a fixed set of columns (``datetime``, ``to``,
        ``city``, ``country``, ``temperature``, ``feels_like``,
        ``humidity``, ``weather_desc``, ``wind_speed``, ``wind_direction``,
        ``precipitation``, ``pressure``, ``clouds``). The columns are
        present even when ``list`` is empty, so downstream filters on
        ``datetime``/``to`` do not fail with ``KeyError``.

    Raises:
        KeyError: If a required key is missing from the payload.
    """
    columns = [
        'datetime', 'to', 'city', 'country', 'temperature', 'feels_like',
        'humidity', 'weather_desc', 'wind_speed', 'wind_direction',
        'precipitation', 'pressure', 'clouds',
    ]
    city_name = raw_data['city']['name']
    country = raw_data['city']['country']

    rows = []
    for item in raw_data['list']:
        # fromtimestamp() without a tz argument yields a naive local time.
        start = datetime.fromtimestamp(item['dt'])
        rows.append({
            'datetime': start,
            # Each record is treated as covering a three-hour window.
            'to': start + timedelta(hours=3),
            'city': city_name,
            'country': country,
            'temperature': item['main']['temp'],
            'feels_like': item['main']['feels_like'],
            'humidity': item['main']['humidity'],
            'weather_desc': item['weather'][0]['description'],
            'wind_speed': item['wind']['speed'],
            'wind_direction': item['wind']['deg'],
            # The 'rain' section is optional; a missing one counts as 0.
            'precipitation': item.get('rain', {}).get('3h', 0),
            'pressure': item['main']['pressure'],
            'clouds': item['clouds']['all'],
        })

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)

# Download the forecast for 'Alger' and flatten it into a DataFrame.
# `df` is reused by the cells further down.
data = fetch_weather_data('Alger')
df = process_weather_data(data)
df.head()  # notebook display of the first rows


Unnamed: 0,datetime,to,city,country,temperature,feels_like,humidity,weather_desc,wind_speed,wind_direction,precipitation,pressure,clouds
0,2024-10-25 10:00:00,2024-10-25 13:00:00,Algiers,DZ,18.94,19.16,87,light rain,1.33,278,0.26,1018,48
1,2024-10-25 13:00:00,2024-10-25 16:00:00,Algiers,DZ,21.09,21.19,74,scattered clouds,2.77,333,0.0,1018,45
2,2024-10-25 16:00:00,2024-10-25 19:00:00,Algiers,DZ,21.75,21.78,69,light rain,3.52,349,0.12,1016,20
3,2024-10-25 19:00:00,2024-10-25 22:00:00,Algiers,DZ,20.65,20.68,73,scattered clouds,1.9,78,0.0,1017,46
4,2024-10-25 22:00:00,2024-10-26 01:00:00,Algiers,DZ,20.74,20.7,70,overcast clouds,0.74,57,0.0,1016,88


# The retriever of the filters

In [26]:
from langchain_google_genai import GoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser


def setup_llm(api_key):
    """
    Build the Gemini Pro client.

    ``api_key`` is passed through as the Google API key; the sampling
    temperature is fixed at 0.1.
    """
    return GoogleGenerativeAI(
        google_api_key=api_key,
        model="gemini-pro",
        temperature=0.1,
    )

def create_prompt_template():
    """
    Build the prompt that asks the LLM to extract query filters.

    The template takes a single ``query`` variable; the parser's format
    instructions are bound up front as a partial variable.
    """
    format_instructions = get_output_parser().get_format_instructions()

    prompt_text = """
    You are a weather data assistant. Based on the following date/time query, 
    help me find relevant weather information from a dataset.

    Query: {query}

    The data has the following format:
    - datetime: start time of the weather record
    - to: end time of the weather record
    - city: city name
    - country: country code
    - temperature: temperature in Celsius
    - feels_like: perceived temperature
    - humidity: humidity percentage
    - weather_desc: weather description
    - wind_speed: wind speed
    - wind_direction: wind direction in degrees
    - precipitation: precipitation amount
    - pressure: atmospheric pressure
    - clouds: cloud coverage percentage

    Extract the following information from the query:
    1. Start datetime (if mentioned)
    2. End datetime (if mentioned)
    3. Any specific weather attributes requested

    {format_instructions}
    """

    return PromptTemplate(
        template=prompt_text,
        input_variables=["query"],
        partial_variables={"format_instructions": format_instructions},
    )

def get_output_parser():
    """
    Build the structured parser for the LLM's filter-extraction reply.

    The reply is expected to carry three fields: ``start_datetime``,
    ``end_datetime`` and ``attributes``.
    """
    field_specs = (
        ("start_datetime",
         "The start datetime mentioned in the query, in YYYY-MM-DD HH:mm:ss format"),
        ("end_datetime",
         "The end datetime mentioned in the query, in YYYY-MM-DD HH:mm:ss format"),
        ("attributes",
         "List of specific weather attributes requested"),
    )
    schemas = [
        ResponseSchema(name=field_name, description=field_text)
        for field_name, field_text in field_specs
    ]
    return StructuredOutputParser.from_response_schemas(schemas)

def _normalize_attributes(attributes) -> List[str]:
    """Turn the parsed ``attributes`` value into a clean list of names.

    The LLM may answer with a real list or with a single string such as
    ``"temperature, humidity"`` or ``"['temperature', 'humidity']"``.
    """
    if not attributes:
        return []
    if isinstance(attributes, str):
        attributes = attributes.split(',')
    cleaned = (str(name).strip(" \t\n[]'\"") for name in attributes)
    return [name for name in cleaned if name]


def query_processor(query: str, llm, prompt, parser, df: pd.DataFrame):
    """
    Process a natural language query and retrieve relevant weather data.

    Args:
        query: The user's question.
        llm: Object whose ``invoke(text)`` returns the model's reply.
        prompt: Object whose ``format_prompt(query=...)`` returns a value
            with a ``to_string()`` method.
        parser: Object whose ``parse(reply)`` returns a mapping that may
            contain ``start_datetime``, ``end_datetime`` and ``attributes``.
        df: Weather records with at least ``datetime`` and ``to`` columns.

    Returns:
        The rows of ``df`` that start at or after ``start_datetime`` and end
        at or before ``end_datetime`` (each bound applied only when
        present). When requested attributes match real columns, the result
        is narrowed to ``datetime``, ``to``, ``city`` plus those columns,
        without duplicates. If parsing or filtering fails, an error message
        string is returned instead of a DataFrame.
    """
    # Get LLM response
    _input = prompt.format_prompt(query=query)
    response = llm.invoke(_input.to_string())

    try:
        parsed_output = parser.parse(response)

        # Filter dataframe based on datetime range
        mask = pd.Series(True, index=df.index)

        start_raw = parsed_output.get('start_datetime')
        if start_raw:
            mask &= (df['datetime'] >= pd.to_datetime(start_raw))

        end_raw = parsed_output.get('end_datetime')
        if end_raw:
            mask &= (df['to'] <= pd.to_datetime(end_raw))

        filtered_df = df[mask]

        # Select specific columns if requested. Attributes can arrive as a
        # plain string, so normalize before combining with the context
        # columns.
        requested = [
            name
            for name in _normalize_attributes(parsed_output.get('attributes'))
            if name in df.columns
        ]
        # Only project when something requested actually exists; otherwise
        # keep every column rather than returning bare context columns.
        if requested:
            columns = []
            # Always include datetime and city for context
            for col in ['datetime', 'to', 'city'] + requested:
                if col in df.columns and col not in columns:
                    columns.append(col)
            filtered_df = filtered_df[columns]

        return filtered_df

    except Exception as e:
        print(f"Raw LLM response: {response}")  # Kept for debugging
        return f"Error processing query: {str(e)}"

def create_weather_rag_system(df: pd.DataFrame, api_key: str):
    """
    Wire the LLM, prompt and parser together and hand back a query function.

    The returned callable takes a natural language query and runs it
    through ``query_processor`` against ``df``.
    """
    model = setup_llm(api_key)
    retriever_prompt = create_prompt_template()
    output_parser = get_output_parser()

    def query_weather(query: str):
        # The closure keeps the model, prompt, parser and data frame bound.
        return query_processor(
            query, model, retriever_prompt, output_parser, df
        )

    return query_weather


# Initialize the system
# NOTE: "YOUR_GEMINI_API_KEY" is a placeholder; `api_key` is reused by a
# later cell as well.
api_key = "YOUR_GEMINI_API_KEY"
weather_rag = create_weather_rag_system(df, api_key)

# Example queries
# NOTE: the first entry has no trailing comma. Add one before enabling the
# commented-out entries, otherwise adjacent string literals are concatenated.
queries = [
    "What was the weather like in Algiers on October 25, 2024 between 10 AM and 1 PM?"
    # "Show me temperature and humidity data for October 25, 2024",
    # "Get all weather information for October 25, 2024 morning"
]

# Run each query and print whatever comes back (a DataFrame, or an error
# message string if processing failed).
for query in queries:
    result = weather_rag(query)
    print(f"\nQuery: {query}")
    print("Result:")
    print(result)


Query: What was the weather like in Algiers on October 25, 2024 between 10 AM and 1 PM?
Result:
             datetime                  to     city country  temperature  \
0 2024-10-25 10:00:00 2024-10-25 13:00:00  Algiers      DZ        18.94   

   feels_like  humidity weather_desc  wind_speed  wind_direction  \
0       19.16        87   light rain        1.33             278   

   precipitation  pressure  clouds  
0           0.26      1018      48  


# Putting it all together

In [28]:
class WeatherRAG:
    """Answer weather questions from OpenWeatherMap forecast data.

    The pipeline fetches a forecast, asks the LLM to extract a datetime
    range from the question, filters the forecast to that range, and asks
    the LLM to answer from the remaining rows.

    Both prompts are given the current local date and time so that relative
    expressions such as "tomorrow" or "this weekend" can be resolved.
    """

    # Forecast endpoint; HTTPS keeps the API key out of clear text.
    _FORECAST_URL = "https://api.openweathermap.org/data/2.5/forecast"

    # Fixed column order of the frames built by process_weather_data.
    _FORECAST_COLUMNS = [
        'datetime', 'to', 'city', 'country', 'temperature', 'feels_like',
        'humidity', 'weather_desc', 'wind_speed', 'wind_direction',
        'precipitation', 'pressure', 'clouds',
    ]

    def __init__(self, api_key: str,
                 weather_api_key: str = 'YOUR_API_KEY_OF_OPEN_WEATHER_MAP',
                 request_timeout: float = 10.0):
        """
        Initialize the Weather RAG system.

        Args:
            api_key: Google API key for the Gemini model.
            weather_api_key: OpenWeatherMap API key. The default is the
                placeholder value and must be replaced with a real key.
            request_timeout: Seconds to wait for the OpenWeatherMap server.
        """
        self.api_key = api_key
        self.weather_api_key = weather_api_key
        self.request_timeout = request_timeout
        self.llm = self._setup_llm()
        self.retriever_prompt = self._create_retriever_prompt()
        self.qa_prompt = self._create_qa_prompt()
        self.parser = self._get_output_parser()

    @staticmethod
    def _current_datetime_text() -> str:
        """Return the current local date, time and weekday as text."""
        # Local time matches the naive local timestamps that
        # process_weather_data builds with datetime.fromtimestamp().
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S (%A)")

    def _setup_llm(self):
        """Initialize Gemini Pro LLM"""
        return GoogleGenerativeAI(
            model="gemini-pro",
            google_api_key=self.api_key,
            temperature=0.1
        )

    def _create_retriever_prompt(self):
        """Create the prompt that extracts the datetime range and attributes.

        Variables: ``query`` and ``current_datetime``; the parser's format
        instructions are bound as a partial variable.
        """
        template = """
        You are a weather data assistant. Based on the following date/time query, 
        help me find relevant weather information from a dataset.

        Current date and time: {current_datetime}
        Resolve relative expressions (for example "tomorrow" or "this weekend")
        against the current date and time.

        Query: {query}

        The data has the following format:
        - datetime: start time of the weather record
        - to: end time of the weather record
        - city: city name
        - country: country code
        - temperature: temperature in Celsius
        - feels_like: perceived temperature
        - humidity: humidity percentage
        - weather_desc: weather description
        - wind_speed: wind speed
        - wind_direction: wind direction in degrees
        - precipitation: precipitation amount
        - pressure: atmospheric pressure
        - clouds: cloud coverage percentage

        Extract the following information from the query:
        1. Start datetime (if mentioned)
        2. End datetime (if mentioned)
        3. Any specific weather attributes requested

        {format_instructions}
        """
        return PromptTemplate(
            input_variables=["query", "current_datetime"],
            partial_variables={"format_instructions": self._get_output_parser().get_format_instructions()},
            template=template
        )

    def _create_qa_prompt(self):
        """Create the prompt used to answer from the retrieved rows.

        Variables: ``context``, ``question`` and ``current_datetime``.
        """
        template = """
        You are a helpful weather assistant. Using the provided weather data, answer the user's question.
        
        Current date and time: {current_datetime}
        
        Weather Data:
        {context}
        
        User Question: {question}
        
        Please provide a clear, informative answer that:
        1. Directly addresses the user's question
        2. Includes specific numbers and measurements when relevant
        3. Provides context about the weather conditions
        4. Highlights any notable patterns or changes
        
        Answer:
        """
        return PromptTemplate(
            input_variables=["context", "question", "current_datetime"],
            template=template
        )

    def _get_output_parser(self):
        """Create the structured parser for the retriever reply."""
        response_schemas = [
            ResponseSchema(name="start_datetime",
                           description="The start datetime mentioned in the query, in YYYY-MM-DD HH:mm:ss format"),
            ResponseSchema(name="end_datetime",
                           description="The end datetime mentioned in the query, in YYYY-MM-DD HH:mm:ss format"),
            ResponseSchema(name="attributes",
                           description="List of specific weather attributes requested")
        ]
        return StructuredOutputParser.from_response_schemas(response_schemas)

    def fetch_weather_data(self, city: str) -> Dict:
        """Fetch weather forecast data from the OpenWeatherMap API.

        Args:
            city: City name, sent as the ``q`` query parameter.

        Returns:
            The decoded JSON body of the response.

        Raises:
            RuntimeError: If the API answers with a status other than 200.
        """
        params = {
            'q': city,
            'appid': self.weather_api_key,
            'units': 'metric'
        }
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(
            self._FORECAST_URL, params=params, timeout=self.request_timeout
        )
        if response.status_code != 200:
            raise RuntimeError(
                f"Failed to fetch weather data: {response.status_code}"
            )
        return response.json()

    def process_weather_data(self, raw_data: Dict) -> pd.DataFrame:
        """Flatten a raw forecast payload into one DataFrame row per entry.

        The returned frame always has the columns in ``_FORECAST_COLUMNS``,
        even when the payload's ``list`` is empty, so the datetime filter in
        ``retrieve_relevant_data`` never hits a missing column.

        Raises:
            KeyError: If a required key is missing from the payload.
        """
        city_name = raw_data['city']['name']
        country = raw_data['city']['country']

        rows = []
        for item in raw_data['list']:
            start = datetime.fromtimestamp(item['dt'])
            rows.append({
                'datetime': start,
                # Each record is treated as covering a three-hour window.
                'to': start + timedelta(hours=3),
                'city': city_name,
                'country': country,
                'temperature': item['main']['temp'],
                'feels_like': item['main']['feels_like'],
                'humidity': item['main']['humidity'],
                'weather_desc': item['weather'][0]['description'],
                'wind_speed': item['wind']['speed'],
                'wind_direction': item['wind']['deg'],
                # The 'rain' section is optional; a missing one counts as 0.
                'precipitation': item.get('rain', {}).get('3h', 0),
                'pressure': item['main']['pressure'],
                'clouds': item['clouds']['all']
            })
        return pd.DataFrame(rows, columns=self._FORECAST_COLUMNS)

    def retrieve_relevant_data(self, query: str, df: pd.DataFrame):
        """Retrieve the forecast rows that fall inside the queried range.

        The LLM extracts ``start_datetime`` / ``end_datetime`` from
        ``query``; rows must start at or after the former and end at or
        before the latter (each bound applied only when present). If the
        reply cannot be parsed or applied, the full ``df`` is returned.
        """
        _input = self.retriever_prompt.format_prompt(
            query=query,
            current_datetime=self._current_datetime_text()
        )
        response = self.llm.invoke(_input.to_string())

        try:
            parsed_output = self.parser.parse(response)

            # Filter dataframe based on datetime range
            mask = pd.Series(True, index=df.index)

            if parsed_output.get('start_datetime'):
                start_dt = pd.to_datetime(parsed_output['start_datetime'])
                mask &= (df['datetime'] >= start_dt)

            if parsed_output.get('end_datetime'):
                end_dt = pd.to_datetime(parsed_output['end_datetime'])
                mask &= (df['to'] <= end_dt)

            return df[mask]

        except Exception as e:
            print(f"Retrieval error: {str(e)}")
            print(f"Raw LLM response: {response}")
            return df  # Return full dataset if parsing fails

    def generate_answer(self, question: str, context_df: pd.DataFrame):
        """Generate an answer from the question and the retrieved rows.

        Returns the LLM reply, or an error message string if the call fails.
        """
        context = context_df.to_string()
        _input = self.qa_prompt.format_prompt(
            context=context,
            question=question,
            current_datetime=self._current_datetime_text()
        )

        try:
            return self.llm.invoke(_input.to_string())
        except Exception as e:
            return f"Error generating answer: {str(e)}"

    def query(self, user_question: str, city: str):
        """
        Main query method that orchestrates the entire RAG process.

        Returns the generated answer, or an error message string if any
        step raises.
        """
        try:
            # 1. Fetch fresh weather data
            raw_data = self.fetch_weather_data(city)
            df = self.process_weather_data(raw_data)

            # 2. Retrieve relevant data
            relevant_data = self.retrieve_relevant_data(user_question, df)

            # 3. Generate answer
            return self.generate_answer(user_question, relevant_data)

        except Exception as e:
            return f"Error processing query: {str(e)}"

# Build the pipeline with the Gemini key (`api_key`) defined in an earlier cell.
rag_system = WeatherRAG(api_key)

# Example questions
questions = [
    "What's the weather like in Algiers tomorrow morning?",
    "Will it rain in Algiers this weekend?",
    "What's the temperature trend for the next 24 hours?"
]

# Run queries
# Each call fetches the forecast for "Alger" again before answering.
for question in questions:
    print(f"\nQuestion: {question}")
    answer = rag_system.query(question, city="Alger")
    print(f"Answer: {answer}")


Question: What's the weather like in Algiers tomorrow morning?
Answer: Tomorrow morning in Algiers, the weather is expected to be mostly clear with a temperature of around 19.48 degrees Celsius. The humidity will be around 70%, and the wind speed will be light, around 1.05 meters per second. There is no precipitation expected. Overall, the weather tomorrow morning in Algiers is expected to be pleasant and suitable for outdoor activities.

Question: Will it rain in Algiers this weekend?
Answer: Yes, there is a high probability of rain in Algiers this weekend. The weather forecast predicts light to moderate rain on Saturday, October 28th, with precipitation levels reaching up to 2.01 mm. On Sunday, October 29th, there is a 50% chance of light rain, with precipitation levels of around 0.50 mm. It's important to note that weather conditions can change, so it's always a good idea to check the latest forecast before making any plans.

Question: What's the temperature trend for the next 24 h

# Saving the retrieved weather data

In [24]:
# Write the forecast DataFrame to weather_data.csv; index=False leaves the
# row index out of the file.
df.to_csv('weather_data.csv', index=False)