# Testing out creating a Vector database with Chroma and Langchain

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma


In [None]:
# Synthetic UK polling summaries, keyed by the file stem each one is written to.
# Named `example_summaries` rather than `dict`: binding the name `dict` would
# shadow the builtin type for every later cell in the notebook.
example_summaries = {"example1": """Here is a synthetic summary of polling data in the UK:

Recent opinion polls in the United Kingdom have shown a tight race between the governing Conservative Party and the opposition Labour Party. The latest survey from YouGov has Labour holding a narrow 2-point lead over the Tories among decided voters, with 36% supporting Labour and 34% backing the Conservatives. However, other pollsters like Savanta ComRes have found the two parties running neck-and-neck at 33% each.

The smaller Liberal Democrat party is polling in the 12-15% range, while the Green Party and Reform UK are each capturing around 5-7% of the vote in most surveys. There are regional variations, with Labour performing more strongly in urban areas like London, while the Conservatives maintain advantages in many rural constituencies. 

On key issues, the public appears closely divided on the economy, with pluralities trusting Labour slightly more on cost of living but the Tories ahead on economic management. The Conservatives hold leads on immigration and crime, while Labour is seen as stronger on the NHS and education.

Looking ahead to the next general election, expected in 2024 or early 2025, polling averages currently put Labour and the Conservatives in a statistical tie nationally. However, due to the distribution of support, models suggest Labour may have a slight edge in being able to secure a governing majority. Of course, much could still change over the remaining years of this parliament.""",
"example2": """Here is another synthetic summary of polling data in the UK:

The latest opinion polling shows the Conservative Party, led by Prime Minister Rishi Sunak, holding a slender lead over the opposition Labour Party. According to an average compiled by Britain Elects, the Tories are at 34% among decided voters, with Labour trailing just behind at 32%.

However, there is significant variation between individual pollsters. A recent Redfield & Wilton survey put the Conservatives up 6 points, while a Survation poll had Labour narrowly ahead by 2%. The leftwing party appears to be benefiting from a squeeze on support for the Liberal Democrats, who are down to around 10%.

On the key issue of the economy, voters give the Conservatives only a slight edge in being trusted to handle it best. But Labour has opened up clear leads on priorities like the NHS, education, and the cost-of-living crisis according to many surveys.

Looking at leader favorability ratings, both Sunak and Labour's Keir Starmer remain unpopular overall, with roughly 60% viewing each unfavorably. However, Starmer scores better on attributes like competence and vision for the country.

If an election were held imminently, the polls suggest a hung parliament is one of the likelier outcomes, with no single party holding a majority. The Conservatives may hold a slight advantage in being able to form a governing coalition with smaller unionist parties.

Of course, these are just a snapshot in time and much could change in voting intentions over the remaining years before the next scheduled election in late 2024. The state of the economy will likely be the critical factor driving voters' choices."""}

# Write each summary to "<key>.txt" in the current working directory.
# The encoding is explicit so the files do not depend on the platform default.
for key, value in example_summaries.items():
    with open(f"{key}.txt", "w", encoding="utf-8") as file:
        file.write(value)


In [None]:
import os

folder_path = "../data/text_summaries/"

# List all files and directories in the specified folder.
# os.listdir returns entries in arbitrary order, so sort them: the "last"
# file_path left over from the loop below is then the same on every run.
files_and_directories = sorted(os.listdir(folder_path))

# Keep only regular files, as paths joined onto folder_path.
candidate_paths = (os.path.join(folder_path, name) for name in files_and_directories)
file_paths = [path for path in candidate_paths if os.path.isfile(path)]

# Fail with a clear message instead of a NameError on `file_path` further down.
if not file_paths:
    raise FileNotFoundError(f"No files found in {folder_path!r}")

# Print the file paths
for file_path in file_paths:
    print(file_path)

# Display the last path; `file_path` keeps this value after the loop.
file_path

In [None]:
# Load every listed summary file, split each into chunks, and collect all the
# chunks in `documents` so they can be embedded into the vector store.
examples = file_paths

# The splitter configuration is the same for every file, so build it once.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# Accumulate the chunks; rebinding `documents` on each iteration would keep
# only the chunks of the last file.
documents = []
for doc in examples:
    raw_documents = TextLoader(doc).load()
    documents.extend(text_splitter.split_documents(raw_documents))

print(documents)

In [None]:
from langchain import hub
from langchain.llms import (
    HuggingFaceHub,
)
from langchain.embeddings import HuggingFaceEmbeddings 
import chromadb

# Embedding model applied to every chunk before it is stored.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Build the Chroma store from the split documents. No persist_directory is
# passed; the option is kept here, commented out, for reference:
# persist_directory="./chromadb",
vectorstore = Chroma.from_documents(documents=documents, embedding=embedding_model)

In [None]:
# Similarity search that asks for a single result per query (k=1).
search_options = {"k": 1}
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=search_options,
)

In [None]:
# Prompt text used by the fine-tuned-model team. It has two placeholders,
# {query} and {context}, and ends with an open "### Summary:" section.
# NOTE(review): the wording (typos included) is left exactly as is; presumably
# it has to match the prompt the fine-tuned model was trained with. Confirm
# before editing.
prompt_template = """Given a query, return the a summary of the most similar descriptions on the polling descriptions contained in the contect provided.
 
### query: {query}

 
### Context: {context}

 
### Summary:
"""

In [None]:
from langchain.prompts import PromptTemplate
# Turn the raw template string into a PromptTemplate.
prompt = PromptTemplate.from_template(prompt_template)

# Bare expression so the notebook displays the resulting template object.
prompt

In [None]:
from operator import itemgetter
from langchain_core.runnables import RunnableParallel, RunnableLambda

# Pull the "query" value out of the input and hand it to the retriever; the
# result is what fills the prompt's {context} placeholder.
context_lookup = itemgetter("query") | retriever
# Disabled post-processing step: | RunnableLambda(format_docs)

# The two inputs the prompt expects, both derived from the incoming "query".
chain_inputs = {
    "context": context_lookup,
    "query": itemgetter("query"),
}

# The chain currently stops at the rendered prompt. Disabled follow-on steps:
#   | ft_llm
#   | StrOutputParser()
chain = chain_inputs | prompt

# Bare expression so the notebook displays the composed chain.
chain

In [None]:
# Example question sent through the chain; the result is printed as returned.
q = "What do young men feel about websites"
chain_output = chain.invoke({"query": q})
print(chain_output)

In [None]:
# Run the chain again and convert its output to a plain string.
# Note: this rebinds `prompt` to that string.
prompt_value = chain.invoke({"query": q})
prompt = prompt_value.to_string()

In [None]:
# Print whatever is currently bound to `prompt`.
print(prompt)

In [None]:
import os
# project specific
import boto3
from dotenv import load_dotenv, find_dotenv
from langchain.llms.bedrock import Bedrock
import pandas as pd
#local
from langchain.prompts import PromptTemplate
from langchain_community.chat_models import BedrockChat
# from langchain_anthropic import ChatAnthropic
from pathlib import Path
class LLMHandler:
    """Set up an AWS Bedrock chat model and run prompts against it.

    Attributes:
        llm: ``BedrockChat`` instance for the
            ``anthropic.claude-3-sonnet-20240229-v1:0`` model.
        problem_cases: Dict created empty at construction; nothing in this
            class writes to it.
    """

    def __init__(self, env_path):
        """Load AWS settings from a dotenv file and build the Bedrock client.

        Args:
            env_path: Path of the ``.env`` file handed to ``load_dotenv``.
                The environment variables read afterwards are
                ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``,
                ``AWS_SESSION_TOKEN`` and ``AWS_DEFAULT_REGION``.
        """
        load_dotenv(env_path)
        boto3.setup_default_session(
            aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
            aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
            aws_session_token=os.getenv('AWS_SESSION_TOKEN'),
        )
        client = boto3.client(
            service_name='bedrock-runtime',
            region_name=os.getenv('AWS_DEFAULT_REGION'),
        )
        self.llm = BedrockChat(
            model_id="anthropic.claude-3-sonnet-20240229-v1:0",
            client=client,
        )
        self.problem_cases = {}

    def get_data(self, prompt, data):
        """Fill ``prompt`` with ``data`` and send it to the chat model.

        Args:
            prompt (langchain PromptTemplate): prompt to be passed to the LLM;
                it is given ``data`` as its only input.
            data: Value substituted for the prompt's ``data`` variable.

        Returns:
            Whatever ``(prompt | self.llm).invoke(...)`` returns. This is the
            model's response object, not a ``pandas.DataFrame``.
        """
        chain = prompt | self.llm
        # Only the template's inputs are passed; the template object itself is
        # not an input to the chain.
        return chain.invoke({"data": data})
#%%
#---- prompt
# Earlier single-summary variant of the template, kept for reference:
#     \n\nHuman: You are a super helpful robot that does exactly as told.
#     Outline the purpose of this poll and briefly describe the data.
#     Include where the poll came from, the question number, and a description of the CSV file.
#     Do not exceed more than 200 words.
#     write the poll name, the table number, the question and a summary of the data
#     {data}
#     Do not repeat instructions back to me, just complete the task.
#      \n\nAssistant:

# Active template: asks for one summary per poll question, separated by "---".
# {data} is the only placeholder.
poll_summary_template = """
    \n\nHuman: You are a super helpful robot that does exactly as told.
    For each question in this poll, outline the purpose of the question and briefly describe the outcomes of the poll.
    Include where the question came from, the question number, and a description of the CSV file.
    Do not exceed more than 200 words.
    seperate repsonses for each question with a ---
    write the poll name, the table number, the question and a summary of the data
    {data}
    Do not repeat instructions back to me, or return anything else just complete the task.
     \n\nAssistant:"""

prompt = PromptTemplate(
    template=poll_summary_template,
    input_variables=["data"],
)
## --------------- set up config and LLM
# The project root is taken to be the parent of the current working directory.
base_dir = Path.cwd().parent
data_dir = base_dir / 'data'
example_data_path = data_dir / 'savanta_data'

# Keep only names that END in ".csv"; a substring test would also accept
# names such as "x.csv.bak". Sorted because os.listdir order is arbitrary and
# the slice below picks the first entry.
poll_files = sorted(x for x in os.listdir(example_data_path) if x.endswith('.csv'))
#%%
handler = LLMHandler(env_path=base_dir / '.env')

# Only the first poll file is processed for now. The CSV is re-serialised to
# text and substituted into the prompt's {data} placeholder.
for file in poll_files[0:1]:
    data = pd.read_csv(example_data_path / file)
    response = handler.get_data(prompt=prompt, data=data.to_csv())
#%%
# with open(f"{file}.txt", "w") as file:
#     file.write(response.dict()['content'])
# data_path = data_dir/'savanta_data/Omni_W184_HomelessAndPolicePR_tables_Private.xlsx'
# sheets = parse_santava_excel(data_path)
# poll_data = ''
# for s in sheets:
#     poll_data = ''.join([poll_data, s.to_csv()])
# response = handler.get_data(prompt=prompt,
#                         data = poll_data)





