In [None]:
#!pip install tiktoken==0.6.0 pypdf==4.0.1 langchain==0.1.1 langchain-community==0.0.13 chromadb==0.4.22 sentence-transformers==2.3.1


In [1]:
import chromadb
persistance_directory = 'vectorstore'
chroma_client = chromadb.PersistentClient(persistance_directory)


In [2]:
import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings.sentence_transformer import (SentenceTransformerEmbeddings)

# Initialize the OpenAI/Huggingface embeddings model -- check https://huggingface.co/spaces/mteb/leaderboard and choose one that is in-and around

#  Ideal when you want flexibility with different models, compatibility with Hugging Face ecosystems, or are working on applications where model speed and memory efficiency are crucial.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") 

# When accuracy and precision are more important, and you have more computational resources available, gte-large provides richer, more nuanced embeddings at the cost of being slower and more resource-intensive.
embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')

# read existing vector store with Chroma

vectorstore = Chroma(
    embedding_function=embedding_model,
    persist_directory=persistance_directory,
    collection_name="my_collection"
)

# Create a retriever from the vectorstore
retriever = vectorstore.as_retriever()



  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange
  vectorstore = Chroma(


In [3]:
query_text = "What are Biobased Polymers?"
results = vectorstore.similarity_search(query_text, k=15)

print("Search results:")
for result in results:
    print(f"Document: {result.page_content}")  # Access the document text directly using `page_content`
    print(f"Metadata: {result.metadata}")      # Access the metadata directly using `metadata`)
    
    # Check if similarity/distance is available and print it
    if hasattr(result, 'distance'):
        print(f"Distance: {result.distance}")
    elif hasattr(result, 'score'):
        print(f"Score: {result.score}")
    
    print("-" * 40)


Search results:
Document: ones and artiﬁcially synthesized ones. They are sometimes deﬁned as biocompostable polymers, especially in waste, agricultural, ﬁshery and construction industries. The term biodegradable polymer is also used formedical, pharmaceutical, and bioengineering applications. Biodegradable polymers consisting of naturally derived building blocks are also called bioabsorbable polymers, when they are speciﬁcally applied for medical, pharmaceutical, or other bioengineering applications. The importance of biobased polymers is well known, and much research and development activities concerns the use of biobased polymers in science, engineering, and industry. Generally, biobased polymers are classiﬁed into three classes: • 1st class; naturally derived biomass polymers: direct use of biomass as polymeric material including chemically modiﬁed ones such as cellulose, cellulose acetate, starches, chitin, modiﬁed starch, etc.; • 2nd class; bio-engineered polymers: bio-synthesize

In [4]:
from langchain.llms import Ollama
# Initialize the local Ollama model 
model_version ="llama3.2"
#model_version ="llama3.1:8b"
llm = Ollama(model=model_version, temperature=0)

  llm = Ollama(model=model_version, temperature=0)


In [5]:
from langchain_core.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Step 1: Create a custom prompt using PromptTemplate
prompt_template = """
### INSTRUCTION:
You are Andy, a Sustainability Market Analyst at Gofore. Gofore is a Market Research and Sustainability Business Development company. 
Over our experience, we are to perform the following tasks:
1. Identify new value resources from bio-based side and water streams (black liquor, bark, straw, etc.)
2. Understand the key aspects, functionalities, and applications that the products have.
3. Analyze the current market demand for each of these.
4. Identify the key players in the market for them.
5. Identify the geography that each of these are focused on.

Define an expected short-term, mid-term, and long-term market growth considering factors such as government regulations, consumer demand based on trends, and corporate sustainability goals.

Now, think through the mentioned instructions and perform market analysis for the material: '''{material_name}'''.
Generate the detailed market analysis dimensions for the material using the JSON in the format provided below :
{{
  "material_name": "{material_name}",
  "dimensions": {{
    "global market demand": "<description of the global market demand and market size as available>",
    "global market drivers": [<list of key market drivers including regulation>],
    "geographic focus of the markets": [
      <list of the geographic areas driving market growth>
    ],
    "short-term (0-2 years)": {{"description":"<description of the extected market growth in 0 to 2 years>","growth rate": " <numeric range>% per annum"}},
    "mid-term (3-5 years)": {{"description":"<description of the extected market growth in 3 to 5 years>","growth rate": " <numeric range>% per annum"}},
    "long-term (6-10 years)": {{"description":"<description of the extected market growth in 6 to 10 years>","growth rate": " <numeric range>% per annum"}},
    "Potential key customers": [
      "<list of potential key customer industry sectors and fields in the core of the market growth>"
    ],
    "Maturity": "<Short description of the nature of the key customers (existing/emerging value chains)>",
    "Examples of potential customer companies": [
      "<list of 4 or 5 examples of companies>"
    ],
    "Raw materials": [
      "<list 5 to 10 raw materials>"
    ],
    "products": [
      "<list 5 to 10 products>"
    ],
    "technologies of interest": [
      "<list of technologies of interest for the identified customers>"
    ],
    "sources": [
      "<list of the information source url or document name or author name etc>"
    ]
  }}
}}

Do not provide a preamble.
### EMAIL (NO PREAMBLE):
"""


In [13]:
prompt_template="""
You are Andy, a Sustainability Market Analyst at Gofore, a Market Research and Sustainability Business Development company.

Your role is to perform the following:

Identify new value resources from bio-based side and water streams (e.g., black liquor, bark, straw).
Understand key aspects, functionalities, and applications of these materials.
Analyze the current market demand for each.
Identify key market players.
Identify geographic areas where these materials are most prominent.
Provide an analysis of short-term, mid-term, and long-term market growth considering factors such as government regulations, consumer trends, and corporate sustainability goals.
Please answer user questions only using the context provided in the input.
Do not mention anything about the context in your final answer. Your response should only contain the answer to the question.

If the answer is not found in the context, respond "I don't know".
Using this information, conduct a detailed market analysis for the given material: '''{material_name}'''.

Please generate the market analysis using the JSON format below:
{{
  "material_name": "{material_name}",
  "dimensions": {{
    "global market demand": "<description of the global market demand and market size as available>",
    "global market drivers": [<list of key market drivers including regulation>],
    "geographic focus of the markets": [
      <list of the geographic areas driving market growth>
    ],
    "short-term (0-2 years)": {{"description":"<description of the extected market growth in 0 to 2 years>","growth rate": " <numeric range>% per annum"}},
    "mid-term (3-5 years)": {{"description":"<description of the extected market growth in 3 to 5 years>","growth rate": " <numeric range>% per annum"}},
    "long-term (6-10 years)": {{"description":"<description of the extected market growth in 6 to 10 years>","growth rate": " <numeric range>% per annum"}},
    "Potential key customers": [
      "<list of potential key customer industry sectors and fields in the core of the market growth>"
    ],
    "Maturity": "<Short description of the nature of the key customers (existing/emerging value chains)>",
    "Examples of potential customer companies": [
      "<list of 4 or 5 examples of companies>"
    ],
    "Raw materials": [
      "<list 5 to 10 raw materials>"
    ],
    "products": [
      "<list 5 to 10 products>"
    ],
    "technologies of interest": [
      "<list of technologies of interest for the identified customers>"
    ],
    "sources": [
      "<list of the information source url or document name or author name etc>"
    ]
  }}
}}
Do not provide a preamble.
### EMAIL (NO PREAMBLE):
"""

In [14]:

# Step 2: Define the PromptTemplate with the correct variable
prompt = PromptTemplate(input_variables=["material_name"], template=prompt_template)

# Step 3: Format the prompt with a specific material name (query)
material_name = "Biobased Polymers"  # The material you're analyzing
formatted_prompt = prompt.format(material_name=material_name)

# Step 4: Create the QA chain using your LLM and retriever
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,  # Your pre-configured LLM (e.g., local ollama-3.2)
    chain_type="stuff",  # Using the "stuff" chain type for retrieval-based QA
    retriever=retriever,  # The retriever you defined earlier
    return_source_documents=True  # Returns the documents used to generate the answer
)

# Step 5: Use the QA chain with the formatted query
result = qa_chain({"query": formatted_prompt})  # Call the QA chain directly



In [15]:
# Step 6: Print the result and the source documents
print("Answer:", result['result'])  # The market analysis in JSON format
if result.get("source_documents"):
    print("\nSource Documents:")
    for doc in result['source_documents']:
        print('\n',doc)


Answer: {
  "material_name": "Biobased Polymers",
  "dimensions": {
    "global market demand": "The global biobased polymers market is expected to reach USD 13.4 billion by 2025, growing at a CAGR of 8.2% from 2020 to 2025.",
    "global market drivers": [
      "Increasing demand for sustainable and eco-friendly packaging solutions",
      "Government regulations and policies promoting the use of biobased materials",
      "Growing awareness about the environmental impact of traditional plastics"
    ],
    "geographic focus of the markets": [
      "Europe",
      "North America",
      "Asia-Pacific"
    ],
    "short-term (0-2 years)": {
      "description": "The market is expected to grow at a CAGR of 9.5% from 2020 to 2022, driven by increasing demand for biodegradable packaging materials.",
      "growth rate": "9.5%-10.5%"
    },
    "mid-term (3-5 years)": {
      "description": "The market is expected to grow at a CAGR of 7.8% from 2022 to 2025, driven by the expansion of bi

Exception in callback BaseSelectorEventLoop._read_from_self()
handle: <Handle BaseSelectorEventLoop._read_from_self()>
Traceback (most recent call last):
  File "c:\ProgramData\Anaconda3\envs\tf-cpu-py311\Lib\asyncio\events.py", line 84, in _run
    self._context.run(self._callback, *self._args)
  File "c:\ProgramData\Anaconda3\envs\tf-cpu-py311\Lib\asyncio\selector_events.py", line 119, in _read_from_self
    data = self._ssock.recv(4096)
           ^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
Exception in callback BaseSelectorEventLoop._read_from_self()
handle: <Handle BaseSelectorEventLoop._read_from_self()>
Traceback (most recent call last):
  File "c:\ProgramData\Anaconda3\envs\tf-cpu-py311\Lib\asyncio\events.py", line 84, in _run
    self._context.run(self._callback, *self._args)
  File "c:\ProgramData\Anaconda3\envs\tf-cpu-py311\Lib\asyncio\selector_events.py", line 119, in _read_from_self
    data = 

In [12]:
result

{'query': '\nYou are Andy, a Sustainability Market Analyst at Gofore, a Market Research and Sustainability Business Development company.\n\nYour role is to perform the following:\n\nIdentify new value resources from bio-based side and water streams (e.g., black liquor, bark, straw).\nUnderstand key aspects, functionalities, and applications of these materials.\nAnalyze the current market demand for each.\nIdentify key market players.\nIdentify geographic areas where these materials are most prominent.\nProvide an analysis of short-term, mid-term, and long-term market growth considering factors such as government regulations, consumer trends, and corporate sustainability goals.\n\nUsing this information, conduct a detailed market analysis for the given material: \'\'\'Biobased Polymers\'\'\'.\n\nPlease generate the market analysis using the JSON format below:\n{\n  "material_name": "Biobased Polymers",\n  "dimensions": {\n    "global market demand": "<description of the global market de