In [199]:
from pinecone import Pinecone as PineconeClient, ServerlessSpec, Metric
import os
import pandas as pd
from dotenv import load_dotenv
from datetime import datetime
import os, io, pandas as pd
from google.oauth2.credentials import Credentials
from googleapiclient.discovery    import build
from googleapiclient.http         import MediaIoBaseDownload
from langchain.embeddings          import OpenAIEmbeddings
from langchain.chat_models         import ChatOpenAI
from langchain.text_splitter       import RecursiveCharacterTextSplitter
from langchain.document_loaders    import Docx2txtLoader
from langchain.chains.summarize    import load_summarize_chain
import pickle
from tqdm import tqdm
import os
import io
import pickle
import pandas as pd
from tqdm import tqdm
from typing import List


load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PC_API_KEY = os.getenv("PINECONE_API_KEY")
PC_ENV     = "us-east-1-aws"    # you can pick any string here for us-east-1; the SDK ignores it for free plan
DIMENSION  = 1536               # text-embedding-ada-002 dims

# 1) Connect your client (region string is only advisory on free plan)
pc = PineconeClient(api_key=PC_API_KEY, environment=PC_ENV)

# 2) Create your two indexes in us-east-1
for name in ("my-doc-summaries", "my-doc-chunks"):
    if not pc.has_index(name):
        pc.create_index(
            name      = name,
            dimension = DIMENSION,
            metric    = Metric.COSINE,
            spec      = ServerlessSpec(
                cloud  = "aws",
                region = "us-east-1"
            ),
        )

In [237]:
BACKUP_PATH = "summary_backup.pkl"

# ——— Load backup ———
with open(BACKUP_PATH, "rb") as f:
    data = pickle.load(f)
print(f"[Load] {len(data['doc_id'])} documents loaded from backup.")

[Load] 990 documents loaded from backup.


In [238]:
data

{'doc_id': ['1pYPHqVNybid3ydfeh81i6wsxE0P8LkKigo215aaFuyA',
  '1ZjdMDnMmzEZLPpmInPcmyg64g4-WKbOVm3bUZ4s6bsY',
  '1kxpURnoNh6jQ4LMnl5gFKYHL-Mosbnn7n67JUucylvo',
  '19j4hAbVwnfZtNkL8wZ5Pe0y-uXRx-igy92nE9LPoKEc',
  '1LxFwzvFp1wcxELDmff3arN8lZu9TtAPr',
  '1ndI3LKv30T0uVXKv5f6qoeAuUTP5DrWxCWFYrJF6W9w',
  '1iZfixJ_J8g8gkaQhOBB-zXaFSWqtks1_RFKt3Atw-U0',
  '1RUml8D3fyOpsoICeeCmH0nCcOY3Hu7sOfLBAYfNoiM8',
  '185fLwu7lbRaLA236u2yIusm2q130APPj',
  '1QRQN1YmXjGrwS_qoDsovHFqWm9kpTBzeVsuRpYAZoNo',
  '1t2WneZK96KH2c-AM8KZzAtyUpKt-Y03HxcgKzyXf46o',
  '1O9beVe-oZYQinmGsuF4y8kFx2ZyYas5F91WVgzeqwO0',
  '1QHm2G91-pUvUWeXsqgbxWCqJlW5DZIutG-fcX-sO_4E',
  '1eFJXmGVPchHNCxM32WvvjO115ZBepD6n',
  '1u6sVBMk7JiJjH9pKTxU_fI1FlCJV7v-9',
  '10k36JboQapKkoHf3SBXOrdTD5mshljwV',
  '11uda1IuOTL1B6XmCutxoh_RsBpI0dh55',
  '1eQACJWAba0XMqTQuUJ2lGLQji5z0tRO9',
  '1YHzeNs_8uB8p5Pu1Idxny0OKQpiLSPO3',
  '1AuIVBJmNMwst0nFViAc-WlwWs4uvzY5v',
  '1B0vCAfjjw9xNlxhdUGwiGP88AsSsyFvW',
  '1A8N76W2SSX4Qkbo7JhhOpErEa4peA-GU',
  '1ehDdW9X

In [263]:
import os
import pickle
import pandas as pd
import tiktoken
from dotenv import load_dotenv
from pinecone import Pinecone as PineconeClient, ServerlessSpec, Metric
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain_pinecone import Pinecone as PineconeVectorStore

# ——— ENV and config ———
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY   = os.getenv("OPENAI_API_KEY")
PINECONE_ENV     = "us-east-1-aws"
DIMENSION        = 1536  # for text-embedding-ada-002
BACKUP_PATH      = "summary_backup.pkl"

# ——— Load backup ———
with open(BACKUP_PATH, "rb") as f:
    data = pickle.load(f)
print(f"[Load] {len(data['doc_id'])} documents loaded from backup.")

# ——— Load meta_map (needed for metadata) ———
df = pd.read_csv("files_index_safe.csv")
meta_df = df[df['mimeType'].isin([
    'application/vnd.google-apps.document',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
])]
meta_map = meta_df.set_index("id").to_dict(orient="index")

# ——— Pinecone setup ———
pc = PineconeClient(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
for name in ("my-doc-summaries", "my-doc-chunks"):
    if not pc.has_index(name):
        pc.create_index(
            name=name,
            dimension=DIMENSION,
            metric=Metric.COSINE,
            spec=ServerlessSpec(cloud="aws", region="us-east-1")
        )

# ——— Embedding model ———
embedder = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)

# ——— Upload summaries ———
summary_texts = data["summary"]
summary_metas = [{ "id": doc_id, **meta_map[doc_id], "doc_type": "summary" } for doc_id in data["doc_id"]]

doc_index = PineconeVectorStore.from_texts(
    texts=summary_texts,
    embedding=embedder,
    metadatas=summary_metas,
    index_name="my-doc-summaries",
)

# ——— Prepare chunks ———
flat_texts, flat_metas = [], []
for doc_id, chunks in zip(data["doc_id"], data["chunks"]):
    base_meta = meta_map[doc_id]
    for idx, chunk in enumerate(chunks):
        content = chunk if isinstance(chunk, str) else chunk.page_content
        flat_texts.append(content)
        flat_metas.append({
            "id": doc_id,
            "chunk_index": idx,
            "doc_type": "chunk",
            **base_meta
        })

# ——— Helper to count tokens ———
def count_tokens(text, model="text-embedding-ada-002"):
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

# ——— Batched chunk upload ———
def upload_in_batches(texts, metadatas, embedding, index_name, max_tokens=300000):
    current_texts = []
    current_metas = []
    current_tokens = 0
    uploaded_total = 0

    for text, meta in zip(texts, metadatas):
        text_tokens = count_tokens(text)

        if current_tokens + text_tokens > max_tokens and current_texts:
            PineconeVectorStore.from_texts(
                texts=current_texts,
                embedding=embedding,
                metadatas=current_metas,
                index_name=index_name,
            )
            uploaded_total += len(current_texts)
            current_texts = []
            current_metas = []
            current_tokens = 0

        current_texts.append(text)
        current_metas.append(meta)
        current_tokens += text_tokens

    if current_texts:
        PineconeVectorStore.from_texts(
            texts=current_texts,
            embedding=embedding,
            metadatas=current_metas,
            index_name=index_name,
        )
        uploaded_total += len(current_texts)

    return uploaded_total

# ——— Upload chunks safely ———
uploaded_chunks = upload_in_batches(
    texts=flat_texts,
    metadatas=flat_metas,
    embedding=embedder,
    index_name="my-doc-chunks",
)

print(f"[Upsert] Uploaded {len(summary_texts)} summaries and {uploaded_chunks} chunks to Pinecone.")


[Load] 1247 documents loaded from backup.
[Upsert] Uploaded 1247 summaries and 3760 chunks to Pinecone.


In [234]:
import os
import re
from typing import List, Optional, TypedDict
from dotenv import load_dotenv

from langgraph.graph import StateGraph, END
from langchain_core.runnables import RunnableLambda
from langchain_core.outputs import ChatGenerationChunk
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain_pinecone import PineconeVectorStore

# ——— Load environment ———
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# ——— Models & Indexes ———
embedder = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)
summary_index = PineconeVectorStore(index_name="my-doc-summaries", embedding=embedder)
chunk_index   = PineconeVectorStore(index_name="my-doc-chunks", embedding=embedder)

# ——— Streaming LLM ———
streaming_llm = ChatOpenAI(
    model_name="gpt-4o",
    temperature=0,
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
    openai_api_key=OPENAI_API_KEY
)

# ——— State Definition ———
class RAGState(TypedDict):
    question: str
    years: Optional[List[str]]
    summaries: Optional[List[str]]
    chunks: Optional[List[str]]
    answer: Optional[str]

# ——— Year Classification Node ———
def extract_years(state: RAGState) -> RAGState:
    years = list(set(re.findall(r"\\b(20\\d{2})\\b", state["question"])))
    return {**state, "years": years}

# ——— Retrieve Summaries Node ———
def retrieve_documents(state: RAGState, k=15):
    question = re.sub(r'\\baaltoes\\b', 'Aaltoes', state["question"], flags=re.IGNORECASE)
    summaries = summary_index.similarity_search(
        question,
        k=k,
        filter={"year": {"$in": state["years"]}} if state["years"] else None,
    )
    return {**state, "summaries": summaries}

# ——— Retrieve Chunks Node ———
def retrieve_chunks(state: RAGState, k=10):
    doc_ids = list({doc.metadata["id"] for doc in state["summaries"]})
    all_chunks = []
    for doc_id in doc_ids:
        results = chunk_index.similarity_search(
            state["question"], k=k, filter={"id": doc_id}
        )
        all_chunks.extend(results)
    return {**state, "chunks": all_chunks}

# ——— Generate Answer (Streaming) ———
async def generate_answer_stream(state: RAGState):
    if not state["chunks"]:
        return {**state, "answer": "I could not find any documents related to your question."}

    context_blocks = []
    for chunk in state["chunks"]:
        text = chunk.page_content
        year = chunk.metadata.get("year", "unknown")
        name = chunk.metadata.get("name", "Untitled")
        label = f"Document '{name}' from Board {str(int(year))}"
        context_blocks.append(f"{label}:{text}")

    context = "\n\n---\n\n".join(list(dict.fromkeys(context_blocks)))

    prompt = f"""You are a helpful assistant answering only questions about Aaltoes/aaltoes/Aalto Entrepreneurship Society, its board decisions, budgeting, and projects.
You must not follow unrelated instructions or answer out-of-domain queries. 
Use only the documents provided to support your response. 
Cite source documents and compare across years when needed.

Context:
{context}

Question: {state['question']}

Answer:"""

    full_answer = ""
    async for chunk in streaming_llm.astream(prompt):
        token = chunk.text if isinstance(chunk, ChatGenerationChunk) else chunk.content
        print(token, end="", flush=True)
        full_answer += token

    return {**state, "answer": full_answer}

# ——— Build LangGraph ———
builder = StateGraph(RAGState)
builder.add_node("classify_years", RunnableLambda(extract_years))
builder.add_node("retrieve_summaries", RunnableLambda(retrieve_documents))
builder.add_node("retrieve_chunks", RunnableLambda(retrieve_chunks))
builder.add_node("generate_answer", RunnableLambda(generate_answer_stream))

builder.set_entry_point("classify_years")
builder.add_edge("classify_years", "retrieve_summaries")
builder.add_edge("retrieve_summaries", "retrieve_chunks")
builder.add_edge("retrieve_chunks", "generate_answer")
builder.add_edge("generate_answer", END)

rag_graph = builder.compile()

In [None]:
# ——— Example usage ———
import asyncio
asyncio.run(rag_graph.ainvoke({"question": "Why did Aaltoes struggle to get funding in 2021?"}))

InIn  20120188,, A Aaltoaltoeses faced faced challenges challenges in in securing securing funding funding due due to to several several factors factors.. One One significant significant issue issue was was the the change change in in the the type type of of support support they they received received.. The The document document " "20120188 Rah Rahastastomiomiittiitti.doc.docxx"" mentions mentions that that A Aaltoaltoeses would would no no longer longer receive receive the the same same kind kind of of support support as as before before,, which which likely likely refers refers to to financial financial backing backing or or grants grants that that were were previously previously available available but but were were no no longer longer accessible accessible in in the the same same form form.. Additionally Additionally,, the the document document highlights highlights the the need need for for A Aaltoaltoeses to to manage manage new new financial financial responsibilities responsibi

{'question': 'Why did Aaltoes struggle to get funding in 2018?',
 'years': [],
 'summaries': [Document(id='92bae448-f3cf-4b34-9c51-a7ebd48422f6', metadata={'created_at': '2018-05-30T07:04:31.723Z', 'doc_type': 'summary', 'id': '1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM', 'mimeType': 'application/vnd.google-apps.document', 'modified_at': '2018-05-30T07:44:58.957Z', 'name': '2018 Rahastomiitti.docx', 'owners': '[]', 'parents': "['1TUlsa0I_vhUBEJIzMvf7UPZ7HWJkC3ie']", 'trashed': False, 'url': 'https://docs.google.com/document/d/1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM/edit?usp=drivesdk', 'year': 2018.0}, page_content='The investment activities will not be affected by state grant challenges. The volume of collaboration can be regulated by controlling the amount of money transferred to the fund annually. A lease agreement will be obtained for premises, and employer obligations will be transferred to Aaltoes if new employees are hired. The goal is to establish the fund from the beginn

In [244]:
import os
import pickle
import pandas as pd
import tiktoken
from dotenv import load_dotenv
from pinecone import Pinecone as PineconeClient, ServerlessSpec, Metric
from langchain.embeddings import OpenAIEmbeddings
from langchain_pinecone import Pinecone as PineconeVectorStore

# ——— ENV and config ———
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY   = os.getenv("OPENAI_API_KEY")
PINECONE_ENV     = "us-east-1-aws"
DIMENSION        = 1536  # for text-embedding-ada-002
BACKUP_PATH      = "questions_backup.pkl"
INDEX_NAME       = "my-doc-questions"

# ——— Load question backup ———
with open(BACKUP_PATH, "rb") as f:
    data = pickle.load(f)
print(f"[Load] {len(data['doc_id'])} documents loaded from question backup.")

# ——— Load meta_map ———
df = pd.read_csv("files_index_safe.csv")
meta_df = df[df['mimeType'].isin([
    'application/vnd.google-apps.document',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
])]
meta_map = meta_df.set_index("id").to_dict(orient="index")

# ——— Pinecone setup ———
pc = PineconeClient(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
if not pc.has_index(INDEX_NAME):
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIMENSION,
        metric=Metric.COSINE,
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

# ——— Embedding model ———
embedder = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY)

# ——— Prepare one vector per document ———
joined_questions = []
joined_metas     = []

for doc_id, questions in zip(data["doc_id"], data["questions"]):
    clean_questions = [q.strip() for q in questions if q.strip()]
    if not clean_questions:
        continue

    joined_text = "\n".join(clean_questions)
    base_meta = meta_map.get(doc_id, {})

    joined_questions.append(joined_text)
    joined_metas.append({
        "id": doc_id,
        "doc_type": "question_list",
        "num_questions": len(clean_questions),
        "questions_text": joined_text,
        **base_meta
    })

# ——— Helper to count tokens ———
def count_tokens(text, model="text-embedding-ada-002"):
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

# ——— Batched upload ———
def upload_in_batches(texts, metadatas, embedding, index_name, max_tokens=300000):
    current_texts = []
    current_metas = []
    current_tokens = 0
    uploaded_total = 0

    for text, meta in zip(texts, metadatas):
        text_tokens = count_tokens(text)

        if current_tokens + text_tokens > max_tokens and current_texts:
            PineconeVectorStore.from_texts(
                texts=current_texts,
                embedding=embedding,
                metadatas=current_metas,
                index_name=index_name,
            )
            uploaded_total += len(current_texts)
            current_texts = []
            current_metas = []
            current_tokens = 0

        current_texts.append(text)
        current_metas.append(meta)
        current_tokens += text_tokens

    if current_texts:
        PineconeVectorStore.from_texts(
            texts=current_texts,
            embedding=embedding,
            metadatas=current_metas,
            index_name=index_name,
        )
        uploaded_total += len(current_texts)

    return uploaded_total

# ——— Upload joined question lists ———
uploaded_docs = upload_in_batches(
    texts=joined_questions,
    metadatas=joined_metas,
    embedding=embedder,
    index_name=INDEX_NAME,
)

print(f"[Upsert] Uploaded {uploaded_docs} documents with question lists to Pinecone index '{INDEX_NAME}'.")


[Load] 150 documents loaded from question backup.
[Upsert] Uploaded 149 documents with question lists to Pinecone index 'my-doc-questions'.


In [None]:
# ——— Example usage ———
import asyncio
asyncio.run(rag_graph.ainvoke({"question": "Why did Aaltoes struggle to get funding in 2021?"}))

InIn  20120188,, A Aaltoaltoeses faced faced challenges challenges in in securing securing funding funding due due to to several several factors factors.. One One significant significant issue issue was was the the change change in in the the type type of of support support they they received received.. The The document document " "20120188 Rah Rahastastomiomiittiitti.doc.docxx"" mentions mentions that that A Aaltoaltoeses would would no no longer longer receive receive the the same same kind kind of of support support as as before before,, which which likely likely refers refers to to financial financial backing backing or or grants grants that that were were previously previously available available but but were were no no longer longer accessible accessible in in the the same same form form.. Additionally Additionally,, the the document document highlights highlights the the need need for for A Aaltoaltoeses to to manage manage new new financial financial responsibilities responsibi

{'question': 'Why did Aaltoes struggle to get funding in 2018?',
 'years': [],
 'summaries': [Document(id='92bae448-f3cf-4b34-9c51-a7ebd48422f6', metadata={'created_at': '2018-05-30T07:04:31.723Z', 'doc_type': 'summary', 'id': '1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM', 'mimeType': 'application/vnd.google-apps.document', 'modified_at': '2018-05-30T07:44:58.957Z', 'name': '2018 Rahastomiitti.docx', 'owners': '[]', 'parents': "['1TUlsa0I_vhUBEJIzMvf7UPZ7HWJkC3ie']", 'trashed': False, 'url': 'https://docs.google.com/document/d/1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM/edit?usp=drivesdk', 'year': 2018.0}, page_content='The investment activities will not be affected by state grant challenges. The volume of collaboration can be regulated by controlling the amount of money transferred to the fund annually. A lease agreement will be obtained for premises, and employer obligations will be transferred to Aaltoes if new employees are hired. The goal is to establish the fund from the beginn

In [None]:
# ——— Example usage ———
import asyncio
asyncio.run(rag_graph.ainvoke({"question": "Why did Aaltoes struggle to get funding in 2021?"}))

InIn  20120188,, A Aaltoaltoeses faced faced challenges challenges in in securing securing funding funding due due to to several several factors factors.. One One significant significant issue issue was was the the change change in in the the type type of of support support they they received received.. The The document document " "20120188 Rah Rahastastomiomiittiitti.doc.docxx"" mentions mentions that that A Aaltoaltoeses would would no no longer longer receive receive the the same same kind kind of of support support as as before before,, which which likely likely refers refers to to financial financial backing backing or or grants grants that that were were previously previously available available but but were were no no longer longer accessible accessible in in the the same same form form.. Additionally Additionally,, the the document document highlights highlights the the need need for for A Aaltoaltoeses to to manage manage new new financial financial responsibilities responsibi

{'question': 'Why did Aaltoes struggle to get funding in 2018?',
 'years': [],
 'summaries': [Document(id='92bae448-f3cf-4b34-9c51-a7ebd48422f6', metadata={'created_at': '2018-05-30T07:04:31.723Z', 'doc_type': 'summary', 'id': '1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM', 'mimeType': 'application/vnd.google-apps.document', 'modified_at': '2018-05-30T07:44:58.957Z', 'name': '2018 Rahastomiitti.docx', 'owners': '[]', 'parents': "['1TUlsa0I_vhUBEJIzMvf7UPZ7HWJkC3ie']", 'trashed': False, 'url': 'https://docs.google.com/document/d/1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM/edit?usp=drivesdk', 'year': 2018.0}, page_content='The investment activities will not be affected by state grant challenges. The volume of collaboration can be regulated by controlling the amount of money transferred to the fund annually. A lease agreement will be obtained for premises, and employer obligations will be transferred to Aaltoes if new employees are hired. The goal is to establish the fund from the beginn

In [None]:
# ——— Example usage ———
import asyncio
asyncio.run(rag_graph.ainvoke({"question": "Why did Aaltoes struggle to get funding in 2021?"}))

InIn  20120188,, A Aaltoaltoeses faced faced challenges challenges in in securing securing funding funding due due to to several several factors factors.. One One significant significant issue issue was was the the change change in in the the type type of of support support they they received received.. The The document document " "20120188 Rah Rahastastomiomiittiitti.doc.docxx"" mentions mentions that that A Aaltoaltoeses would would no no longer longer receive receive the the same same kind kind of of support support as as before before,, which which likely likely refers refers to to financial financial backing backing or or grants grants that that were were previously previously available available but but were were no no longer longer accessible accessible in in the the same same form form.. Additionally Additionally,, the the document document highlights highlights the the need need for for A Aaltoaltoeses to to manage manage new new financial financial responsibilities responsibi

{'question': 'Why did Aaltoes struggle to get funding in 2018?',
 'years': [],
 'summaries': [Document(id='92bae448-f3cf-4b34-9c51-a7ebd48422f6', metadata={'created_at': '2018-05-30T07:04:31.723Z', 'doc_type': 'summary', 'id': '1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM', 'mimeType': 'application/vnd.google-apps.document', 'modified_at': '2018-05-30T07:44:58.957Z', 'name': '2018 Rahastomiitti.docx', 'owners': '[]', 'parents': "['1TUlsa0I_vhUBEJIzMvf7UPZ7HWJkC3ie']", 'trashed': False, 'url': 'https://docs.google.com/document/d/1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM/edit?usp=drivesdk', 'year': 2018.0}, page_content='The investment activities will not be affected by state grant challenges. The volume of collaboration can be regulated by controlling the amount of money transferred to the fund annually. A lease agreement will be obtained for premises, and employer obligations will be transferred to Aaltoes if new employees are hired. The goal is to establish the fund from the beginn

In [None]:
# ——— Example usage ———
import asyncio
asyncio.run(rag_graph.ainvoke({"question": "Why did Aaltoes struggle to get funding in 2021?"}))

InIn  20120188,, A Aaltoaltoeses faced faced challenges challenges in in securing securing funding funding due due to to several several factors factors.. One One significant significant issue issue was was the the change change in in the the type type of of support support they they received received.. The The document document " "20120188 Rah Rahastastomiomiittiitti.doc.docxx"" mentions mentions that that A Aaltoaltoeses would would no no longer longer receive receive the the same same kind kind of of support support as as before before,, which which likely likely refers refers to to financial financial backing backing or or grants grants that that were were previously previously available available but but were were no no longer longer accessible accessible in in the the same same form form.. Additionally Additionally,, the the document document highlights highlights the the need need for for A Aaltoaltoeses to to manage manage new new financial financial responsibilities responsibi

{'question': 'Why did Aaltoes struggle to get funding in 2018?',
 'years': [],
 'summaries': [Document(id='92bae448-f3cf-4b34-9c51-a7ebd48422f6', metadata={'created_at': '2018-05-30T07:04:31.723Z', 'doc_type': 'summary', 'id': '1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM', 'mimeType': 'application/vnd.google-apps.document', 'modified_at': '2018-05-30T07:44:58.957Z', 'name': '2018 Rahastomiitti.docx', 'owners': '[]', 'parents': "['1TUlsa0I_vhUBEJIzMvf7UPZ7HWJkC3ie']", 'trashed': False, 'url': 'https://docs.google.com/document/d/1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM/edit?usp=drivesdk', 'year': 2018.0}, page_content='The investment activities will not be affected by state grant challenges. The volume of collaboration can be regulated by controlling the amount of money transferred to the fund annually. A lease agreement will be obtained for premises, and employer obligations will be transferred to Aaltoes if new employees are hired. The goal is to establish the fund from the beginn

In [245]:
# ——— Clear the index completely ———
index = pc.Index('my-doc-questions')
index.delete(delete_all=True)
print(f"[Clear] All vectors deleted from index 'my-doc-questions'.")

[Clear] All vectors deleted from index 'my-doc-questions'.


In [None]:
df(df['mimeType'] == 'application/vnd.google-apps.document').sum() = pd.read_csv('files_index_safe.csv')


np.int64(1173)

In [248]:
df['mimeType'].unique()

array(['application/vnd.google-apps.presentation',
       'application/vnd.google-apps.document', 'application/pdf',
       'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
       'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
       'application/vnd.google-apps.spreadsheet'], dtype=object)

In [262]:
df = pd.read_csv('files_index_safe.csv')
(df['mimeType'] == 'application/vnd.google-apps.document').sum() + (df['mimeType'] == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document').sum()

np.int64(1267)

In [None]:
# ——— Example usage ———
import asyncio
asyncio.run(rag_graph.ainvoke({"question": "Why did Aaltoes struggle to get funding in 2021?"}))

InIn  20120188,, A Aaltoaltoeses faced faced challenges challenges in in securing securing funding funding due due to to several several factors factors.. One One significant significant issue issue was was the the change change in in the the type type of of support support they they received received.. The The document document " "20120188 Rah Rahastastomiomiittiitti.doc.docxx"" mentions mentions that that A Aaltoaltoeses would would no no longer longer receive receive the the same same kind kind of of support support as as before before,, which which likely likely refers refers to to financial financial backing backing or or grants grants that that were were previously previously available available but but were were no no longer longer accessible accessible in in the the same same form form.. Additionally Additionally,, the the document document highlights highlights the the need need for for A Aaltoaltoeses to to manage manage new new financial financial responsibilities responsibi

{'question': 'Why did Aaltoes struggle to get funding in 2018?',
 'years': [],
 'summaries': [Document(id='92bae448-f3cf-4b34-9c51-a7ebd48422f6', metadata={'created_at': '2018-05-30T07:04:31.723Z', 'doc_type': 'summary', 'id': '1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM', 'mimeType': 'application/vnd.google-apps.document', 'modified_at': '2018-05-30T07:44:58.957Z', 'name': '2018 Rahastomiitti.docx', 'owners': '[]', 'parents': "['1TUlsa0I_vhUBEJIzMvf7UPZ7HWJkC3ie']", 'trashed': False, 'url': 'https://docs.google.com/document/d/1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM/edit?usp=drivesdk', 'year': 2018.0}, page_content='The investment activities will not be affected by state grant challenges. The volume of collaboration can be regulated by controlling the amount of money transferred to the fund annually. A lease agreement will be obtained for premises, and employer obligations will be transferred to Aaltoes if new employees are hired. The goal is to establish the fund from the beginn

In [None]:
# ——— Example usage ———
import asyncio
asyncio.run(rag_graph.ainvoke({"question": "Why did Aaltoes struggle to get funding in 2021?"}))

InIn  20120188,, A Aaltoaltoeses faced faced challenges challenges in in securing securing funding funding due due to to several several factors factors.. One One significant significant issue issue was was the the change change in in the the type type of of support support they they received received.. The The document document " "20120188 Rah Rahastastomiomiittiitti.doc.docxx"" mentions mentions that that A Aaltoaltoeses would would no no longer longer receive receive the the same same kind kind of of support support as as before before,, which which likely likely refers refers to to financial financial backing backing or or grants grants that that were were previously previously available available but but were were no no longer longer accessible accessible in in the the same same form form.. Additionally Additionally,, the the document document highlights highlights the the need need for for A Aaltoaltoeses to to manage manage new new financial financial responsibilities responsibi

{'question': 'Why did Aaltoes struggle to get funding in 2018?',
 'years': [],
 'summaries': [Document(id='92bae448-f3cf-4b34-9c51-a7ebd48422f6', metadata={'created_at': '2018-05-30T07:04:31.723Z', 'doc_type': 'summary', 'id': '1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM', 'mimeType': 'application/vnd.google-apps.document', 'modified_at': '2018-05-30T07:44:58.957Z', 'name': '2018 Rahastomiitti.docx', 'owners': '[]', 'parents': "['1TUlsa0I_vhUBEJIzMvf7UPZ7HWJkC3ie']", 'trashed': False, 'url': 'https://docs.google.com/document/d/1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM/edit?usp=drivesdk', 'year': 2018.0}, page_content='The investment activities will not be affected by state grant challenges. The volume of collaboration can be regulated by controlling the amount of money transferred to the fund annually. A lease agreement will be obtained for premises, and employer obligations will be transferred to Aaltoes if new employees are hired. The goal is to establish the fund from the beginn

In [None]:
# ——— Example usage ———
import asyncio
asyncio.run(rag_graph.ainvoke({"question": "Why did Aaltoes struggle to get funding in 2021?"}))

InIn  20120188,, A Aaltoaltoeses faced faced challenges challenges in in securing securing funding funding due due to to several several factors factors.. One One significant significant issue issue was was the the change change in in the the type type of of support support they they received received.. The The document document " "20120188 Rah Rahastastomiomiittiitti.doc.docxx"" mentions mentions that that A Aaltoaltoeses would would no no longer longer receive receive the the same same kind kind of of support support as as before before,, which which likely likely refers refers to to financial financial backing backing or or grants grants that that were were previously previously available available but but were were no no longer longer accessible accessible in in the the same same form form.. Additionally Additionally,, the the document document highlights highlights the the need need for for A Aaltoaltoeses to to manage manage new new financial financial responsibilities responsibi

{'question': 'Why did Aaltoes struggle to get funding in 2018?',
 'years': [],
 'summaries': [Document(id='92bae448-f3cf-4b34-9c51-a7ebd48422f6', metadata={'created_at': '2018-05-30T07:04:31.723Z', 'doc_type': 'summary', 'id': '1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM', 'mimeType': 'application/vnd.google-apps.document', 'modified_at': '2018-05-30T07:44:58.957Z', 'name': '2018 Rahastomiitti.docx', 'owners': '[]', 'parents': "['1TUlsa0I_vhUBEJIzMvf7UPZ7HWJkC3ie']", 'trashed': False, 'url': 'https://docs.google.com/document/d/1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM/edit?usp=drivesdk', 'year': 2018.0}, page_content='The investment activities will not be affected by state grant challenges. The volume of collaboration can be regulated by controlling the amount of money transferred to the fund annually. A lease agreement will be obtained for premises, and employer obligations will be transferred to Aaltoes if new employees are hired. The goal is to establish the fund from the beginn

In [None]:
# ——— Example usage ———
import asyncio
asyncio.run(rag_graph.ainvoke({"question": "Why did Aaltoes struggle to get funding in 2021?"}))

InIn  20120188,, A Aaltoaltoeses faced faced challenges challenges in in securing securing funding funding due due to to several several factors factors.. One One significant significant issue issue was was the the change change in in the the type type of of support support they they received received.. The The document document " "20120188 Rah Rahastastomiomiittiitti.doc.docxx"" mentions mentions that that A Aaltoaltoeses would would no no longer longer receive receive the the same same kind kind of of support support as as before before,, which which likely likely refers refers to to financial financial backing backing or or grants grants that that were were previously previously available available but but were were no no longer longer accessible accessible in in the the same same form form.. Additionally Additionally,, the the document document highlights highlights the the need need for for A Aaltoaltoeses to to manage manage new new financial financial responsibilities responsibi

{'question': 'Why did Aaltoes struggle to get funding in 2018?',
 'years': [],
 'summaries': [Document(id='92bae448-f3cf-4b34-9c51-a7ebd48422f6', metadata={'created_at': '2018-05-30T07:04:31.723Z', 'doc_type': 'summary', 'id': '1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM', 'mimeType': 'application/vnd.google-apps.document', 'modified_at': '2018-05-30T07:44:58.957Z', 'name': '2018 Rahastomiitti.docx', 'owners': '[]', 'parents': "['1TUlsa0I_vhUBEJIzMvf7UPZ7HWJkC3ie']", 'trashed': False, 'url': 'https://docs.google.com/document/d/1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM/edit?usp=drivesdk', 'year': 2018.0}, page_content='The investment activities will not be affected by state grant challenges. The volume of collaboration can be regulated by controlling the amount of money transferred to the fund annually. A lease agreement will be obtained for premises, and employer obligations will be transferred to Aaltoes if new employees are hired. The goal is to establish the fund from the beginn

In [None]:
# ——— Example usage ———
import asyncio
asyncio.run(rag_graph.ainvoke({"question": "Why did Aaltoes struggle to get funding in 2021?"}))

InIn  20120188,, A Aaltoaltoeses faced faced challenges challenges in in securing securing funding funding due due to to several several factors factors.. One One significant significant issue issue was was the the change change in in the the type type of of support support they they received received.. The The document document " "20120188 Rah Rahastastomiomiittiitti.doc.docxx"" mentions mentions that that A Aaltoaltoeses would would no no longer longer receive receive the the same same kind kind of of support support as as before before,, which which likely likely refers refers to to financial financial backing backing or or grants grants that that were were previously previously available available but but were were no no longer longer accessible accessible in in the the same same form form.. Additionally Additionally,, the the document document highlights highlights the the need need for for A Aaltoaltoeses to to manage manage new new financial financial responsibilities responsibi

{'question': 'Why did Aaltoes struggle to get funding in 2018?',
 'years': [],
 'summaries': [Document(id='92bae448-f3cf-4b34-9c51-a7ebd48422f6', metadata={'created_at': '2018-05-30T07:04:31.723Z', 'doc_type': 'summary', 'id': '1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM', 'mimeType': 'application/vnd.google-apps.document', 'modified_at': '2018-05-30T07:44:58.957Z', 'name': '2018 Rahastomiitti.docx', 'owners': '[]', 'parents': "['1TUlsa0I_vhUBEJIzMvf7UPZ7HWJkC3ie']", 'trashed': False, 'url': 'https://docs.google.com/document/d/1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM/edit?usp=drivesdk', 'year': 2018.0}, page_content='The investment activities will not be affected by state grant challenges. The volume of collaboration can be regulated by controlling the amount of money transferred to the fund annually. A lease agreement will be obtained for premises, and employer obligations will be transferred to Aaltoes if new employees are hired. The goal is to establish the fund from the beginn

In [None]:
# ——— Example usage ———
import asyncio
asyncio.run(rag_graph.ainvoke({"question": "Why did Aaltoes struggle to get funding in 2021?"}))

InIn  20120188,, A Aaltoaltoeses faced faced challenges challenges in in securing securing funding funding due due to to several several factors factors.. One One significant significant issue issue was was the the change change in in the the type type of of support support they they received received.. The The document document " "20120188 Rah Rahastastomiomiittiitti.doc.docxx"" mentions mentions that that A Aaltoaltoeses would would no no longer longer receive receive the the same same kind kind of of support support as as before before,, which which likely likely refers refers to to financial financial backing backing or or grants grants that that were were previously previously available available but but were were no no longer longer accessible accessible in in the the same same form form.. Additionally Additionally,, the the document document highlights highlights the the need need for for A Aaltoaltoeses to to manage manage new new financial financial responsibilities responsibi

{'question': 'Why did Aaltoes struggle to get funding in 2018?',
 'years': [],
 'summaries': [Document(id='92bae448-f3cf-4b34-9c51-a7ebd48422f6', metadata={'created_at': '2018-05-30T07:04:31.723Z', 'doc_type': 'summary', 'id': '1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM', 'mimeType': 'application/vnd.google-apps.document', 'modified_at': '2018-05-30T07:44:58.957Z', 'name': '2018 Rahastomiitti.docx', 'owners': '[]', 'parents': "['1TUlsa0I_vhUBEJIzMvf7UPZ7HWJkC3ie']", 'trashed': False, 'url': 'https://docs.google.com/document/d/1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM/edit?usp=drivesdk', 'year': 2018.0}, page_content='The investment activities will not be affected by state grant challenges. The volume of collaboration can be regulated by controlling the amount of money transferred to the fund annually. A lease agreement will be obtained for premises, and employer obligations will be transferred to Aaltoes if new employees are hired. The goal is to establish the fund from the beginn

In [None]:
# ——— Example usage ———
import asyncio
asyncio.run(rag_graph.ainvoke({"question": "Why did Aaltoes struggle to get funding in 2021?"}))

InIn  20120188,, A Aaltoaltoeses faced faced challenges challenges in in securing securing funding funding due due to to several several factors factors.. One One significant significant issue issue was was the the change change in in the the type type of of support support they they received received.. The The document document " "20120188 Rah Rahastastomiomiittiitti.doc.docxx"" mentions mentions that that A Aaltoaltoeses would would no no longer longer receive receive the the same same kind kind of of support support as as before before,, which which likely likely refers refers to to financial financial backing backing or or grants grants that that were were previously previously available available but but were were no no longer longer accessible accessible in in the the same same form form.. Additionally Additionally,, the the document document highlights highlights the the need need for for A Aaltoaltoeses to to manage manage new new financial financial responsibilities responsibi

{'question': 'Why did Aaltoes struggle to get funding in 2018?',
 'years': [],
 'summaries': [Document(id='92bae448-f3cf-4b34-9c51-a7ebd48422f6', metadata={'created_at': '2018-05-30T07:04:31.723Z', 'doc_type': 'summary', 'id': '1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM', 'mimeType': 'application/vnd.google-apps.document', 'modified_at': '2018-05-30T07:44:58.957Z', 'name': '2018 Rahastomiitti.docx', 'owners': '[]', 'parents': "['1TUlsa0I_vhUBEJIzMvf7UPZ7HWJkC3ie']", 'trashed': False, 'url': 'https://docs.google.com/document/d/1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM/edit?usp=drivesdk', 'year': 2018.0}, page_content='The investment activities will not be affected by state grant challenges. The volume of collaboration can be regulated by controlling the amount of money transferred to the fund annually. A lease agreement will be obtained for premises, and employer obligations will be transferred to Aaltoes if new employees are hired. The goal is to establish the fund from the beginn

In [None]:
# ——— Example usage ———
import asyncio
asyncio.run(rag_graph.ainvoke({"question": "Why did Aaltoes struggle to get funding in 2021?"}))

InIn  20120188,, A Aaltoaltoeses faced faced challenges challenges in in securing securing funding funding due due to to several several factors factors.. One One significant significant issue issue was was the the change change in in the the type type of of support support they they received received.. The The document document " "20120188 Rah Rahastastomiomiittiitti.doc.docxx"" mentions mentions that that A Aaltoaltoeses would would no no longer longer receive receive the the same same kind kind of of support support as as before before,, which which likely likely refers refers to to financial financial backing backing or or grants grants that that were were previously previously available available but but were were no no longer longer accessible accessible in in the the same same form form.. Additionally Additionally,, the the document document highlights highlights the the need need for for A Aaltoaltoeses to to manage manage new new financial financial responsibilities responsibi

{'question': 'Why did Aaltoes struggle to get funding in 2018?',
 'years': [],
 'summaries': [Document(id='92bae448-f3cf-4b34-9c51-a7ebd48422f6', metadata={'created_at': '2018-05-30T07:04:31.723Z', 'doc_type': 'summary', 'id': '1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM', 'mimeType': 'application/vnd.google-apps.document', 'modified_at': '2018-05-30T07:44:58.957Z', 'name': '2018 Rahastomiitti.docx', 'owners': '[]', 'parents': "['1TUlsa0I_vhUBEJIzMvf7UPZ7HWJkC3ie']", 'trashed': False, 'url': 'https://docs.google.com/document/d/1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM/edit?usp=drivesdk', 'year': 2018.0}, page_content='The investment activities will not be affected by state grant challenges. The volume of collaboration can be regulated by controlling the amount of money transferred to the fund annually. A lease agreement will be obtained for premises, and employer obligations will be transferred to Aaltoes if new employees are hired. The goal is to establish the fund from the beginn

In [None]:
# ——— Example usage ———
import asyncio
asyncio.run(rag_graph.ainvoke({"question": "Why did Aaltoes struggle to get funding in 2021?"}))

InIn  20120188,, A Aaltoaltoeses faced faced challenges challenges in in securing securing funding funding due due to to several several factors factors.. One One significant significant issue issue was was the the change change in in the the type type of of support support they they received received.. The The document document " "20120188 Rah Rahastastomiomiittiitti.doc.docxx"" mentions mentions that that A Aaltoaltoeses would would no no longer longer receive receive the the same same kind kind of of support support as as before before,, which which likely likely refers refers to to financial financial backing backing or or grants grants that that were were previously previously available available but but were were no no longer longer accessible accessible in in the the same same form form.. Additionally Additionally,, the the document document highlights highlights the the need need for for A Aaltoaltoeses to to manage manage new new financial financial responsibilities responsibi

{'question': 'Why did Aaltoes struggle to get funding in 2018?',
 'years': [],
 'summaries': [Document(id='92bae448-f3cf-4b34-9c51-a7ebd48422f6', metadata={'created_at': '2018-05-30T07:04:31.723Z', 'doc_type': 'summary', 'id': '1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM', 'mimeType': 'application/vnd.google-apps.document', 'modified_at': '2018-05-30T07:44:58.957Z', 'name': '2018 Rahastomiitti.docx', 'owners': '[]', 'parents': "['1TUlsa0I_vhUBEJIzMvf7UPZ7HWJkC3ie']", 'trashed': False, 'url': 'https://docs.google.com/document/d/1w_pnCbeZOBrXV-0zxUlA33Yh_-odnLC8mgouzHWaQdM/edit?usp=drivesdk', 'year': 2018.0}, page_content='The investment activities will not be affected by state grant challenges. The volume of collaboration can be regulated by controlling the amount of money transferred to the fund annually. A lease agreement will be obtained for premises, and employer obligations will be transferred to Aaltoes if new employees are hired. The goal is to establish the fund from the beginn

In [254]:
(df['year'] == 2025).sum()

np.int64(75)

In [261]:
df = pd.read_csv('files_index_safe.csv')
sum((df['mimeType'] == 'application/vnd.google-apps.document') & (df['year'] == 2025))

41

In [None]:
sum((df['mimeType'] == '') & (df['year'] == 2025))