In [None]:
pip install langchain openai chromadb tiktoken

In [37]:
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

In [6]:
import os
os.environ["OPENAI_API_KEY"] = ""

In [9]:
loader = DirectoryLoader('./abhi/', glob="./*.txt", loader_cls=TextLoader)
documents = loader.load()

In [12]:
#splitting the text into
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

In [13]:
len(texts)

8

In [14]:
texts[2]

Document(page_content="A stakeholder is someone who is affected by the activities of an organization. Stakeholders can be internal, such as employees, or external, such as customers or suppliers. \n\nStakeholder Outcome \n\nA stakeholder outcome is a change that occurs in the life of a stakeholder as a result of the activities of an organization. Stakeholder outcomes can be positive or negative, and they can be short-term or long-term. \n\nImpact Report \n\nAn impact report is a document that describes the impact of an organization's activities. Impact reports can be used to communicate the organization's impact to stakeholders, funders, and the public. \n\nImpact Risk \n\nAn impact risk is a factor that could prevent an organization from achieving its goals. Impact risks can be internal, such as a lack of resources, or external, such as a change in the political climate. \n\nIndicator", metadata={'source': 'abhi/ontology.txt'})

In [17]:
texts[7]

Document(page_content="- Argentina's senate rejected a bill to legalise abortion in the first 14 weeks of pregnancy in August 2018.\n- Operator: Ministry of Early Childhoood, Salta Province Developer: Microsoft; Conin Foundation \xa0Country: Argentina; Brazil Sector: Govt - health Purpose: Predict teenager pregnancy Technology: Prediction algorithm Issue: Accuracy/reliability; Privacy; Appropriateness/need; Effectiveness/value Transparency: Governance; Black box; Marketing", metadata={'source': 'abhi/microsoft-teen-pregnancy-predictions.txt'})

In [25]:
persist_directory = 'db'
embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(documents=texts, 
                                 embedding=embedding,
                                 persist_directory=persist_directory)

In [26]:
# persiste the db to disk
vectordb.persist()
vectordb = None

In [27]:
# Now we can load the persisted database from disk, and use it as normal. 
vectordb = Chroma(persist_directory=persist_directory, 
                  embedding_function=embedding)

In [28]:
retriever = vectordb.as_retriever()

In [29]:
docs = retriever.get_relevant_documents("what is cids?")

In [30]:
len(docs)

4

In [31]:
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [32]:
retriever.search_type

'similarity'

In [33]:
retriever.search_kwargs

{'k': 2}

In [38]:
# create the chain to answer questions 
qa_chain = RetrievalQA.from_chain_type(llm=OpenAI(), 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=True)

In [39]:
query = "What is an organization?"
llm_response = qa_chain(query)


In [40]:
llm_response

{'query': 'What is an organization?',
 'result': ' An organization is a group of people who work together to achieve a common goal.',
 'source_documents': [Document(page_content='Organization \n\nAn organization is a group of people who work together to achieve a common goal. Organizations can be large or small, public or private, and profit-making or non-profit. \n\nImpact Model \n\nAn impact model is a tool that organizations use to understand and measure their impact. It helps organizations to identify their goals, track their progress, and communicate their impact to others. \n\nProgram \n\nA program is a set of activities that are designed to achieve a specific goal. Programs can be implemented by organizations, governments, or individuals. \n\nService \n\nA service is a product or activity that is provided to others. Services can be provided by organizations, governments, or individuals. \n\nActivity \n\nAn activity is a specific task that is performed as part of a program or ser

In [42]:
llm_response = qa_chain('who are the stakeholders in the microsoft teen pregnancy prediction story?')
llm_response[result]

{'query': 'who are the stakeholders in the microsoft teen pregnancy prediction story?',
 'result': ' The stakeholders in the Microsoft teen pregnancy prediction story are the Ministry of Early Childhood in Salta Province, Microsoft, Conin Foundation, Argentina, Brazil, the government, and the people affected by the prediction algorithm.',
 'source_documents': [Document(page_content="- Released: 2016\n- Can you improve this page?Share your insights with us\n- The Technology Platform for Social Intervention (Plataforma Tecnológica de Intervención Social) is a controversial algorithmic system used to predict teenage pregnancy in Argentina, Brazil, and Colombia.\n- Developed in 2016 by Microsoft for the Ministry of Early Childhood in Salta province, Argentina, the system draws on age, ethnicity, country of origin, disability, and socio-economic data of some 200,000 women and girls, including 12,000 between the ages of 10 and 19.\n- Microsoft announced the system is 'one of the pioneering c

In [44]:
llm_response = qa_chain('find impact scale in the microsoft teen pregnancy prediction story?')
llm_response

{'query': 'find impact scale in the microsoft teen pregnancy prediction story?',
 'result': ' It is difficult to determine the impact scale of the Microsoft teen pregnancy prediction story without more information.',
 'source_documents': [Document(page_content="- Released: 2016\n- Can you improve this page?Share your insights with us\n- The Technology Platform for Social Intervention (Plataforma Tecnológica de Intervención Social) is a controversial algorithmic system used to predict teenage pregnancy in Argentina, Brazil, and Colombia.\n- Developed in 2016 by Microsoft for the Ministry of Early Childhood in Salta province, Argentina, the system draws on age, ethnicity, country of origin, disability, and socio-economic data of some 200,000 women and girls, including 12,000 between the ages of 10 and 19.\n- Microsoft announced the system is 'one of the pioneering cases in the use of AI data' in Argentina. Then governor of Salta Juan Manuel Urtubey claimed it would enable authorities to 

In [45]:
llm_response['result']

' It is difficult to determine the impact scale of the Microsoft teen pregnancy prediction story without more information.'

In [47]:
qa_chain('find the following in the Microsoft teen pregnancy prediction story: Organization, Impact Model, Program, Service, Activity, Input, Output, Outcome, Stakeholder, Stakeholder Outcome, Impact Report, Impact Risk, Indicator, Indicator Report, Impact Scale, Impact Depth, Impact Duration')

{'query': 'find the following in the Microsoft teen pregnancy prediction story: Organization, Impact Model, Program, Service, Activity, Input, Output, Outcome, Stakeholder, Stakeholder Outcome, Impact Report, Impact Risk, Indicator, Indicator Report, Impact Scale, Impact Depth, Impact Duration',
 'result': ' The Microsoft teen pregnancy prediction story includes an organization (Microsoft), a program (Microsoft’s Teen Pregnancy Prediction Program), a service (Microsoft’s Predictive Analytics Service), activities (collecting and analyzing data), inputs (data and resources), outputs (predictions and insights), outcomes (reduction in teen pregnancy rates), stakeholders (teens, parents, and healthcare providers), stakeholder outcomes (reduced teen pregnancy rates and improved access to healthcare services), an impact report (Microsoft’s annual report on the program’s results), an impact risk (the risk of teen pregnancy), indicators (data points that measure the success of the program), an 

In [48]:
qa_chain('provide the following from the Microsoft teen pregnancy prediction story: Organization, Impact Model, Program, Service, Activity, Input, Output, Outcome, Stakeholder, Stakeholder Outcome, Impact Report, Impact Risk, Indicator, Indicator Report, Impact Scale, Impact Depth, Impact Duration')

{'query': 'provide the following from the Microsoft teen pregnancy prediction story in csv format: Organization, Impact Model, Program, Service, Activity, Input, Output, Outcome, Stakeholder, Stakeholder Outcome, Impact Report, Impact Risk, Indicator, Indicator Report, Impact Scale, Impact Depth, Impact Duration',
 'result': " I don't know.",
 'source_documents': [Document(page_content="- Released: 2016\n- Can you improve this page?Share your insights with us\n- The Technology Platform for Social Intervention (Plataforma Tecnológica de Intervención Social) is a controversial algorithmic system used to predict teenage pregnancy in Argentina, Brazil, and Colombia.\n- Developed in 2016 by Microsoft for the Ministry of Early Childhood in Salta province, Argentina, the system draws on age, ethnicity, country of origin, disability, and socio-economic data of some 200,000 women and girls, including 12,000 between the ages of 10 and 19.\n- Microsoft announced the system is 'one of the pioneeri