In [None]:
pip install requests pandas faiss-cpu langchain langchain-groq

In [None]:
#-------------------------------PART 1: WORLD BANK DATA INTO VECTORSTORE---------------------------------------
import requests
import os
import pandas as pd
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [None]:
# Define the directory where vectorstores will be saved
# Root directory under which one vectorstore sub-directory per country code is
# written. Defaults to the original Colab path; set the DEBT_VECTORSTORE_DIR
# environment variable to relocate it when running elsewhere.
BASE_VECTORSTORE_DIR = os.environ.get("DEBT_VECTORSTORE_DIR", "/content/vectorstores")

# Ensure the directory exists (no-op when it is already there)
os.makedirs(BASE_VECTORSTORE_DIR, exist_ok=True)

In [None]:
# -------------------- CONFIG --------------------
# Years requested from the World Bank API, as strings in ascending order.
YEARS = [
    "2021",
    "2022",
    "2023",
]

# Model name handed to HuggingFaceEmbeddings in the main cell.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# NOTE(review): not referenced by the cells visible here; per-country indexes
# are written under BASE_VECTORSTORE_DIR instead.
VECTORSTORE_DIR = "debt_vectorstore_parallel"

# NOTE(review): presumably the model id for ChatGroq; unused in the cells visible here.
LLM_MODEL = "gemma2-9b-it"

In [None]:
# -------------------- STEP 1: Get all Debt Indicators --------------------
def get_debt_indicators():
    """Return every indicator of World Bank source 6 as ``(id, name, sourceNote)`` tuples.

    The endpoint is paginated. A single request with ``per_page=500`` would miss
    any indicators beyond the first page, so every page announced in the
    response metadata (``pages``) is fetched.

    Returns:
        list[tuple]: one ``(id, name, sourceNote)`` tuple per indicator, in the
        order the API returns them. A missing ``sourceNote`` becomes ``""``.

    Raises:
        requests.HTTPError: if the API answers with a non-success status.
        requests.Timeout: if a page does not arrive within 30 seconds.
        IndexError / KeyError: if the payload does not have the expected
            ``[metadata, rows]`` layout.
    """
    base_url = "https://api.worldbank.org/v2/indicator?format=json&source=6&per_page=500"
    indicators = []
    page = 1
    total_pages = 1  # corrected from the metadata of the first response

    while page <= total_pages:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        resp = requests.get(f"{base_url}&page={page}", timeout=30)
        resp.raise_for_status()
        payload = resp.json()
        meta, rows = payload[0], payload[1]
        total_pages = int(meta.get("pages", 1))
        # `rows` may be null when a page carries no entries.
        indicators.extend(
            (row["id"], row["name"], row.get("sourceNote", "")) for row in rows or []
        )
        page += 1

    return indicators

In [None]:
# -------------------- STEP 2: Get all Country Codes --------------------
def get_all_country_codes():
    """Return ``(id, value)`` pairs for every country entry listed under source 6.

    The pairs are read from ``source[0].concept[0].variable`` of the response.

    Returns:
        list[tuple]: ``(country_code, country_name)`` tuples in API order.

    Raises:
        requests.HTTPError: if the API answers with a non-success status.
        requests.Timeout: if the response does not arrive within 30 seconds.
        KeyError / IndexError: if the payload lacks the expected nesting.
    """
    url = "https://api.worldbank.org/v2/sources/6/country?per_page=300&format=JSON"
    # Bound the wait and surface HTTP errors explicitly instead of failing
    # later with an opaque JSON/lookup error.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    countries = resp.json()
    rows = countries["source"][0]["concept"][0]["variable"]
    return [(item["id"], item["value"]) for item in rows]

In [None]:
# -------------------- GLOBAL SESSION CONFIG --------------------
# Pooled HTTP session for the World Bank API. Rate-limit and 5xx responses are
# retried with exponential backoff rather than being dropped on first failure.
# Note: the main cell builds its own `session`, which replaces this object.
retry_policy = Retry(
    total=3,
    backoff_factor=0.5,
    status_forcelist=(429, 500, 502, 503, 504),
)
session = requests.Session()
adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100, max_retries=retry_policy)
session.mount("https://", adapter)
session.mount("http://", adapter)

In [None]:
# -------------------- STEP 3: Fetch and store data for one country (the version called by the main cell) --------------------
def fetch_and_store_for_country(country, indicators, years, session, embedding_model):
    """Download one country's indicator values and save them as a vectorstore.

    Args:
        country: ``(country_code, country_name)`` pair.
        indicators: iterable of ``(indicator_id, indicator_name, source_note)``.
        years: ordered sequence of year strings. The inclusive window
            ``years[0]:years[-1]`` is requested, so years lying between the
            first and last entry are returned even if not listed.
        session: object exposing a requests-style ``get(url, timeout=...)``.
        embedding_model: forwarded to ``create_and_save_vectorstore``.

    Returns:
        None. Progress, "no data" and per-indicator errors are printed. A
        failure on one indicator does not stop the remaining ones.
    """
    country_code, country_name = country

    # Without any year there is nothing to request; avoid indexing an empty list.
    if not years:
        print(f"⚠️ No data found for {country_name}.")
        return

    # The date window is the same for every indicator, so build it once.
    year_range = f"{years[0]}:{years[-1]}"
    all_data = []

    for indicator_id, indicator_name, source_note in indicators:
        url = f"https://api.worldbank.org/v2/country/{country_code}/indicator/{indicator_id}?date={year_range}&format=json"
        try:
            # Bounded wait so one stalled request cannot block the worker forever.
            r = session.get(url, timeout=30)
            json_data = r.json()
            if r.status_code != 200 or len(json_data) < 2:
                continue
            # The data slot can be null (e.g. no observations in the window);
            # treat that as empty instead of reporting a TypeError as a fetch error.
            for dp in json_data[1] or []:
                if dp["value"] is not None:
                    all_data.append({
                        "country": country_name,
                        "indicator_id": indicator_id,
                        "indicator_name": indicator_name,
                        "indicator_note": source_note,
                        "value": dp["value"],
                        "year": dp["date"]
                    })
        except Exception as e:
            print(f"❌ Error fetching {indicator_id} for {country_code}: {e}")

    # Prepare documents and save vectorstore for this country
    if all_data:
        docs = prepare_docs(all_data)
        save_dir = os.path.join(BASE_VECTORSTORE_DIR, country_code)
        create_and_save_vectorstore(docs, save_dir, embedding_model)
        print(f"✅ Processed {country_name}: {len(all_data)} entries saved.")
    else:
        print(f"⚠️ No data found for {country_name}.")

In [None]:
# -------------------- STEP 3 (alternative): Fetch debt data for all indicators & countries in parallel (not called by the main cell below) --------------------
def fetch_all_debt_data(indicators, countries, years=YEARS):
    """Fetch indicator values for every indicator/country/year combination.

    One HTTP request is issued per ``(indicator, country, year)`` triple on a
    pool of 20 worker threads.

    Args:
        indicators: iterable of ``(indicator_id, indicator_name, source_note)``.
        countries: iterable of ``(country_code, country_name)``.
        years: iterable of year strings; defaults to ``YEARS``.

    Returns:
        list[dict]: records with keys ``country``, ``indicator_id``,
        ``indicator_name``, ``indicator_note``, ``value`` and ``year``. Records
        are appended as requests complete, so their order is not fixed.
        Failed requests are printed and contribute no records.
    """
    all_data = []

    def fetch_data_point(indicator_id, indicator_name, source_note, country_code, country_name, year):
        # Best-effort: any failure is logged and yields an empty result.
        try:
            url = f"https://api.worldbank.org/v2/country/{country_code}/indicator/{indicator_id}?date={year}&format=json"
            # Bounded wait so a stalled request cannot pin a worker thread forever.
            r = requests.get(url, timeout=30)
            json_data = r.json()
            if r.status_code == 200 and len(json_data) > 1:
                # The data slot can be null when there are no observations;
                # treat it as empty rather than logging a TypeError as an error.
                return [
                    {
                        "country": country_name,
                        "indicator_id": indicator_id,
                        "indicator_name": indicator_name,
                        "indicator_note": source_note,
                        "value": dp["value"],
                        "year": dp["date"]
                    }
                    for dp in json_data[1] or [] if dp["value"] is not None
                ]
        except Exception as e:
            print(f"Error fetching {indicator_id} for {country_code} in {year}: {e}")
        return []

    with ThreadPoolExecutor(max_workers=20) as executor:
        futures = [
            executor.submit(fetch_data_point, indicator_id, indicator_name, source_note, country_code, country_name, year)
            for indicator_id, indicator_name, source_note in indicators
            for country_code, country_name in countries
            for year in years
        ]
        for future in as_completed(futures):
            all_data.extend(future.result())

    return all_data

In [None]:
# -------------------- STEP 4: Convert to Document format --------------------
def prepare_docs(data):
    """Convert flat indicator records into ``Document`` objects.

    Each record must provide ``country``, ``value``, ``indicator_name``,
    ``year`` and ``indicator_note``. The page content is a one-sentence
    statement of the value followed by the indicator definition; the metadata
    carries the country, indicator name and year.

    Returns:
        list[Document]: one document per input record, in input order.
    """
    def to_document(record):
        statement = f"{record['country']} had {record['value']} in {record['indicator_name']} in {record['year']}. "
        definition = f"Definition: {record['indicator_note']}"
        return Document(
            page_content=statement + definition,
            metadata={
                "country": record["country"],
                "indicator": record["indicator_name"],
                "year": record["year"],
            },
        )

    return [to_document(record) for record in data]

In [None]:
# -------------------- STEP 5: Build and save vectorstore --------------------
def create_and_save_vectorstore(docs, save_dir, embedding_model):
    """Build a FAISS vectorstore from ``docs`` and save it under ``save_dir``.

    ``save_dir`` is created first if it does not exist yet. The index is built
    with ``FAISS.from_documents`` using ``embedding_model`` and then written
    with ``save_local``. Nothing is returned.
    """
    os.makedirs(save_dir, exist_ok=True)
    FAISS.from_documents(docs, embedding_model).save_local(save_dir)

In [None]:
from getpass import getpass

from huggingface_hub import login

# Access tokens must never be committed with the notebook: read the token from
# the HF_TOKEN environment variable, or prompt for it without echoing.
# NOTE(review): the token that used to be hard-coded in this cell is exposed in
# the notebook's history and should be revoked.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(hf_token)

In [None]:
#-----------STEP 6: MAIN EXECUTION ---------------
if __name__ == "__main__":
    print("📥 Fetching World Bank debt indicators...")
    all_indicators = get_debt_indicators()

    print("🌍 Fetching country list...")
    countries = get_all_country_codes()

    print("🚀 Starting parallel data fetch and vectorstore creation...")

    # One embedding model instance is shared by all worker threads.
    embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

    # Connection pool sized to the worker count; rate-limit and 5xx responses
    # are retried with exponential backoff instead of failing immediately.
    retry_policy = Retry(
        total=3,
        backoff_factor=0.5,
        status_forcelist=(429, 500, 502, 503, 504),
    )
    adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100, max_retries=retry_policy)

    failed_countries = []

    # The context manager releases the pooled connections when the run ends.
    with requests.Session() as session:
        session.mount("https://", adapter)
        session.mount("http://", adapter)

        # Fetch and store data in parallel, one task per country
        with ThreadPoolExecutor(max_workers=100) as executor:
            futures = {
                executor.submit(fetch_and_store_for_country, country, all_indicators, YEARS, session, embedding_model): country
                for country in countries
            }
            for future in as_completed(futures):
                country_code, country_name = futures[future]
                try:
                    future.result()  # re-raises anything the worker raised
                except Exception as e:
                    # One failing country must not abort the whole run.
                    failed_countries.append(country_code)
                    print(f"❌ Failed to process {country_name} ({country_code}): {e}")

    if failed_countries:
        print(f"⚠️ Completed with {len(failed_countries)} failed countries: {', '.join(map(str, failed_countries))}")
    else:
        print("✅ Data fetch and vectorstore creation completed!")
