In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from tqdm import tqdm


In [3]:
# Load and prepare initial data 
dataset_path = "/kaggle/input/climate-change-earth-surface-temperature-data/GlobalLandTemperaturesByCity.csv"
df = pd.read_csv(dataset_path).dropna()
df = df[['dt', 'AverageTemperature', 'City', 'Country']]
df['dt'] = pd.to_datetime(df['dt'])
countries = ['Germany', 'France', 'United Kingdom', 'United States', 'Canada', 
             'Australia', 'India', 'China', 'Japan', 'Brazil']
df = df[df['Country'].isin(countries)]
df = df[(df['dt'].dt.year >= 1900) & (df['dt'].dt.year <= 2000)]

def calculate_trend(group):
    group = group.sort_values('dt')
    start_year = group['dt'].min().year
    end_year = group['dt'].max().year
    if len(group) >= 2:
        start_temp = group.iloc[0]['AverageTemperature']
        end_temp = group.iloc[-1]['AverageTemperature']
        trend = end_temp - start_temp
        return pd.Series({'start_year': start_year, 'end_year': end_year, 'trend': trend})
    return pd.Series({'start_year': None, 'end_year': None, 'trend': None})

trend_df = df.groupby(['City', 'Country']).apply(calculate_trend, include_groups=False).reset_index()
trend_df = trend_df.dropna()

instructions = []
responses = []
for _, row in trend_df.iterrows():
    instruction = f"What is the temperature trend in {row['City']}, {row['Country']} from {int(row['start_year'])} to {int(row['end_year'])}?"
    response = f"The temperature trend in {row['City']}, {row['Country']} from {int(row['start_year'])} to {int(row['end_year'])} was {row['trend']:.2f}°C"
    instructions.append(instruction)
    responses.append(response)

instruction_response_df = pd.DataFrame({'instruction': instructions, 'response': responses})


In [4]:

# 1. Prepare training data
def prepare_training_data(instruction_response_df):
    expanded_data = []
    for _, row in instruction_response_df.iterrows():
        city_country_match = re.search(r'in ([^,]+), ([^,]+) from (\d+) to (\d+)', row['instruction'])
        if city_country_match:
            city = city_country_match.group(1)
            country = city_country_match.group(2)
            start_year = city_country_match.group(3)
            end_year = city_country_match.group(4)
            temp_match = re.search(r'was ([-+]?\d+\.\d+)°C', row['response'])
            if temp_match:
                temp_trend = temp_match.group(1)
                variations = [
                    {"input": f"What is the temperature trend in {city}, {country} from {start_year} to {end_year}?",
                     "output": f"The temperature trend in {city}, {country} from {start_year} to {end_year} was {temp_trend}°C."},
                    {"input": f"What's the temperature trend in {city}, {country} between {start_year} and {end_year}?",
                     "output": f"Between {start_year} and {end_year}, {city}, {country} experienced a temperature change of {temp_trend}°C."},
                    {"input": f"How did temperatures change in {city}, {country} from {start_year}–{end_year}?",
                     "output": f"In {city}, {country}, temperatures changed by {temp_trend}°C from {start_year} to {end_year}."},
                    {"input": f"From {start_year} to {end_year}, what was the temperature trend in {city}, {country}?",
                     "output": f"From {start_year} to {end_year}, {city}, {country} saw a temperature trend of {temp_trend}°C."}
                ]
                expanded_data.extend(variations)
    
    return pd.DataFrame(expanded_data)

In [5]:
# 2. Prepare dataset for training
def prepare_dataset(df, tokenizer, max_length=128):
    inputs = df['input'].tolist()
    outputs = df['output'].tolist()
    input_ids_list = []
    attention_masks_list = []
    labels_list = []
    
    tokenizer.pad_token = tokenizer.eos_token
    half_length = max_length // 2
    
    for input_text, output_text in zip(inputs, outputs):
        input_text = f"input: {input_text} output:"
        input_encoding = tokenizer(input_text, max_length=half_length, truncation=True, padding='max_length', return_tensors="pt")
        output_encoding = tokenizer(output_text, max_length=half_length, truncation=True, padding='max_length', return_tensors="pt")
        
        input_ids = torch.cat([input_encoding['input_ids'].squeeze(0), output_encoding['input_ids'].squeeze(0)])
        attention_mask = torch.cat([input_encoding['attention_mask'].squeeze(0), output_encoding['attention_mask'].squeeze(0)])
        labels = torch.cat([torch.full((half_length,), -100), output_encoding['input_ids'].squeeze(0)])
        
        input_ids_list.append(input_ids)
        attention_masks_list.append(attention_mask)
        labels_list.append(labels)
    
    return {
        'input_ids': torch.stack(input_ids_list),
        'attention_mask': torch.stack(attention_masks_list),
        'labels': torch.stack(labels_list)
    }

In [6]:

# 3. Custom training function
def train_model(model, tokenizer, dataset, batch_size=4, learning_rate=5e-5, epochs=3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    train_dataloader = DataLoader(
        torch.utils.data.TensorDataset(dataset['input_ids'], dataset['attention_mask'], dataset['labels']),
        batch_size=batch_size,
        shuffle=True
    )
    
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {avg_loss:.4f}")
    
    model.save_pretrained("/kaggle/working/gpt2_finetuned_custom")
    tokenizer.save_pretrained("/kaggle/working/gpt2_finetuned_custom")
    return "/kaggle/working/gpt2_finetuned_custom"

In [7]:
# 4. Finetuning function
def finetune_gpt2_custom(instruction_response_df):
    train_df = prepare_training_data(instruction_response_df)
    
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    special_tokens = {"additional_special_tokens": ["<|endoftext|>", "input:", "output:"]}
    tokenizer.add_special_tokens(special_tokens)
    model.resize_token_embeddings(len(tokenizer))
    
    dataset = prepare_dataset(train_df, tokenizer, max_length=128)
    model_path = train_model(model, tokenizer, dataset, batch_size=4, learning_rate=5e-5, epochs=3)
    
    print(f"Model fine-tuned and saved to {model_path}")
    return model_path

# Run finetuning
model_path = finetune_gpt2_custom(instruction_response_df)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Epoch 1/3: 100%|██████████| 1629/1629 [02:38<00:00, 10.29it/s]


Epoch 1/3, Average Loss: 0.1093


Epoch 2/3: 100%|██████████| 1629/1629 [02:37<00:00, 10.33it/s]


Epoch 2/3, Average Loss: 0.0702


Epoch 3/3: 100%|██████████| 1629/1629 [02:36<00:00, 10.38it/s]


Epoch 3/3, Average Loss: 0.0463
Model fine-tuned and saved to /kaggle/working/gpt2_finetuned_custom


In [8]:
!pip install langchain langchain-community faiss-cpu sentence-transformers transformers
import pandas as pd
import re
import torch
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the fine-tuned model and tokenizer
model_path = "/kaggle/working/gpt2_finetuned_custom"  # Update with your actual path
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
tokenizer.pad_token = tokenizer.eos_token

# Prepare RAG system
def setup_rag_system(instruction_response_df):
    # Create documents for vector store
    documents = [
        Document(
            page_content=row['response'], 
            metadata={"instruction": row['instruction']}
        ) for _, row in instruction_response_df.iterrows()
    ]
    
    # Initialize embeddings
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    
    # Create vector store
    vector_store = FAISS.from_documents(documents, embeddings)
    
    return vector_store

def extract_numerical_data(text):
    """Extract temperature trend value from text"""
    temp_match = re.search(r'([-+]?\d+\.\d+)°C', text)
    if temp_match:
        return temp_match.group(1)
    return None

def generate_response(query, retrieved_doc, model, tokenizer, device):
    """Generate a response using the fine-tuned model"""
    # Extract numerical data to ensure accuracy
    temp_trend = extract_numerical_data(retrieved_doc)
    
    # Prepare input for the model
    input_text = f"input: {query} output:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    
    # Generate response
    try:
        with torch.no_grad():
            output = model.generate(
                input_ids,
                max_length=100,
                temperature=0.7,
                top_k=50,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                num_return_sequences=1
            )
        
        # Decode the output
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        
        # Extract just the output part
        response = generated_text.replace(input_text, "").strip()
        
        # Ensure the response contains the correct temperature trend
        if temp_trend and temp_trend not in response:
            # Find any temperature values in the response
            existing_temp = re.search(r'([-+]?\d+\.\d+)°C', response)
            if existing_temp:
                # Replace with the correct value
                response = response.replace(existing_temp.group(1), temp_trend)
            else:
                # Add the correct value if not present
                response = f"{response} The temperature trend was {temp_trend}°C."
        
        return response
    except Exception as e:
        print(f"Error generating response: {e}")
        return retrieved_doc

def answer_query(query, vector_store, model, tokenizer, device, k=1, debug=False):
    """Answer a query using the RAG system"""
    # Retrieve relevant documents
    retriever = vector_store.as_retriever(search_kwargs={"k": k})
    retrieved_docs = retriever.invoke(query)
    
    if not retrieved_docs:
        return "I don't have information about that temperature trend."
    
    # Get the most relevant document
    retrieved_text = retrieved_docs[0].page_content
    
    if debug:
        print(f"Query: {query}")
        print(f"Retrieved: {retrieved_text}")
    
    # Generate enhanced response
    response = generate_response(query, retrieved_text, model, tokenizer, device)
    
    if debug:
        print(f"Enhanced: {response}")
        
    return response

def batch_answer(queries, vector_store, model, tokenizer, device, debug=False):
    """Answer a batch of queries"""
    results = []
    for query in queries:
        result = answer_query(query, vector_store, model, tokenizer, device, debug=debug)
        results.append({"query": query, "answer": result})
    return results

# Example usage
def example_usage(instruction_response_df):
    # Initialize RAG system
    vector_store = setup_rag_system(instruction_response_df)
    
    # Test queries
    queries = [
        "What is the temperature trend in Berlin, Germany from 1900 to 2000?",
        "What's the temperature trend in Berlin, Germany between 1900 and 2000?",
        "How did temperatures change in Berlin, Germany from 1900–2000?",
        "From 1900 to 2000, what was the temperature trend in Berlin, Germany?",
        "What is the temperature change in Berlin, Germany from 1900 to 2000?",
        "What is the temperature trend in Abiko, Japan from 1900 to 2000?",
    ]
    
    # Answer queries
    results = batch_answer(queries, vector_store, model, tokenizer, device, debug=True)
    
    # Print results
    for result in results:
        print(f"\nQuery: {result['query']}")
        print(f"Answer: {result['answer']}")

# Use the system (would need to call with actual dataframe)
 example_usage(instruction_response_df)

Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting langchain-core<0.4.0,>=0.3.25 (from langchain)
  Downloading langchain_core-0.3.47-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain
  Downloading langchain-0.3.21-py3-none-any.whl.metadata (7.8 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.7 (from langchain)
  Downloading langchain_text_splitters-0.3.7-py3-none-any.whl.metadata (1.9 kB)
Collecting python-dotenv>=0