## Install Important Libraries Here

In [1]:
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken

Collecting langchain
  Downloading langchain-0.0.350-py3-none-any.whl (809 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m809.1/809.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.3-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.2 (from langchain)
  Downloading langchain_community-0.0.3-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.2,>=0.1 (from langchain)
  Downloading langchain_core-0.1.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langsmith<0.1.0,>=0.0.63 (from langchain)
  Downloading langsmith-0.

In [2]:
# Mount Google Drive at /content/drive so files stored there can be opened by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


PDF used in this notebook: https://rb.gy/4aero9

### Import Libraries Here

In [3]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS

# Provide Your OpenAI API Key Here

In [4]:
import os
from getpass import getpass

# Never store a real key in the notebook itself: reuse a key that is already
# present in the environment, otherwise prompt for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Load Your PDF Document Here

In [5]:
# Path of the PDF to index; change this to point at a different document.
PDF_PATH = '/content/drive/MyDrive/ecoGB (1).pdf'
reader = PdfReader(PDF_PATH)

In [6]:
# Display the PdfReader object's repr as the cell output.
reader

<PyPDF2._reader.PdfReader at 0x7bd3209b9e40>

## Extract Textual Data from **documents**

In [7]:
# Gather the extracted text of each page, skipping pages for which
# extract_text() yields nothing, then concatenate everything into raw_text.
page_texts = []
for page in reader.pages:
    text = page.extract_text()
    if text:
        page_texts.append(text)
raw_text = ''.join(page_texts)

In [None]:
# raw_text

In [8]:
# Preview the first 300 characters of the extracted text.
raw_text[:300]

'See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/335545844\nThe impact of tou rism on local commu nities and their environment in Gilgit\nBaltistan, Pakistan: a local commu nity perspective\nArticle \xa0\xa0 in\xa0\xa0Envir onment al & Socio -eco'

## Split your text data into CHUNKS

In [11]:
# Break the extracted text into smaller, overlapping chunks so that the pieces
# passed along during information retrieval stay within the token size limits.
CHUNK_SIZE = 700      # target chunk length, measured with length_function (len)
CHUNK_OVERLAP = 200   # overlap requested between consecutive chunks

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

In [12]:
# Number of chunks produced by the splitter.
len(texts)

124

In [13]:
# Inspect the first chunk.
texts[0]

'See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/335545844\nThe impact of tou rism on local commu nities and their environment in Gilgit\nBaltistan, Pakistan: a local commu nity perspective\nArticle \xa0\xa0 in\xa0\xa0Envir onment al & Socio -economic St udies  · Sept ember 2019\nDOI: 10.2478/ envir on-2019-0015\nCITATIONS\n19READS\n16,715\n8 author s, including:\nGomal Amin\nThe Hong K ong P olyt echnic Univ ersity\n9 PUBLICA TIONS \xa0\xa0\xa044 CITATIONS \xa0\xa0\xa0\nSEE PROFILE\nImran Khan\nCOMSA TS Univ ersity, Abbott abad Campus/P akist an\n28 PUBLICA TIONS \xa0\xa0\xa01,360  CITATIONS \xa0\xa0\xa0\nSEE PROFILE\nFaridullah F aridullah\nCOMSA TS Univ ersity Islamab ad'

In [14]:
# Inspect the second chunk; compare with the first one to see the overlap between consecutive chunks.
texts[1]

'SEE PROFILE\nImran Khan\nCOMSA TS Univ ersity, Abbott abad Campus/P akist an\n28 PUBLICA TIONS \xa0\xa0\xa01,360  CITATIONS \xa0\xa0\xa0\nSEE PROFILE\nFaridullah F aridullah\nCOMSA TS Univ ersity Islamab ad\n52 PUBLICA TIONS \xa0\xa0\xa0733 CITATIONS \xa0\xa0\xa0\nSEE PROFILE\nDidar Ahmad\nCOMSA TS Univ ersity Islamab ad\n9 PUBLICA TIONS \xa0\xa0\xa033 CITATIONS \xa0\xa0\xa0\nSEE PROFILE\nAll c ontent f ollo wing this p age was uplo aded b y Gomal Amin  on 04 Sept ember 2019.\nThe user has r equest ed enhanc ement of the do wnlo aded file.24 \n \n Environmental & Socio -economic Studies  \n \n                           \n© 201 9 Copyright by University of Silesia in Katowice  DOI:  10.2478 /environ -2019-0015 \nEnviron. Socio. -econ. Stud.,  2019, 7, 3: 24-37'

## Set Up OpenAI Embeddings

In [15]:
# Create the OpenAI embeddings object with its default settings (no arguments passed).
# NOTE(review): presumably reads OPENAI_API_KEY from the environment and only calls
# the API once texts are actually embedded — confirm against the LangChain docs.
embeddings = OpenAIEmbeddings()

## Store All Embedding Vectors in a Vector Store (FAISS)

In [16]:
# Build a FAISS vector store from the text chunks, using `embeddings` to vectorize them.
docsearch = FAISS.from_texts(texts, embeddings)

In [17]:
# Display the vector store object's repr as the cell output.
docsearch

<langchain_community.vectorstores.faiss.FAISS at 0x7bd2fe9c6470>

In [18]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

## Load Large Language Model here

In [33]:
# Instantiate the OpenAI LLM wrapper with its default settings (no arguments passed).
llm = OpenAI()
# Build a question-answering chain of type "stuff" around that LLM.
# NOTE(review): "stuff" presumably places all supplied documents into a single prompt — confirm against the LangChain docs.
chain = load_qa_chain(llm=llm, chain_type="stuff")

## Ask Questions About the Document

In [20]:
# Question to answer from the indexed document.
query = "who are the authors of the article?"
# Retrieve the stored chunks most similar to the question (default number of results).
docs = docsearch.similarity_search(query)
# Run the QA chain over the retrieved chunks; its return value is the cell output.
chain.run(input_documents=docs, question=query)

' Faridullah Faridullah, Didar Ahmad, Gomal Amin'

In [21]:
# Question to answer from the indexed document.
query = "What is Socio-cultural impacts of tourism ?"
# Retrieve the stored chunks most similar to the question (default number of results).
docs = docsearch.similarity_search(query)
# Run the QA chain over the retrieved chunks; its return value is the cell output.
chain.run(input_documents=docs, question=query)

' Socio-cultural impacts of tourism refer to the effects that tourism has on local customs, socio-cultural characteristics, social life, and religious beliefs of local residents living in a community. These impacts can be both positive and negative.'

In [22]:
# Question to answer from the indexed document.
# Only the chunks returned by the similarity search are passed to the chain, not the whole article.
query = "write a summary of this article?"
# Retrieve the stored chunks most similar to the question (default number of results).
docs = docsearch.similarity_search(query)
# Run the QA chain over the retrieved chunks; its return value is the cell output.
chain.run(input_documents=docs, question=query)

' This article discusses the environmental, economic, and socio-cultural impacts of tourism development on local areas. Data was collected from local residents, visitors, and other stakeholders through surveys and interviews. The results showed that tourism had a positive effect on the economy, however it had a negative effect on the environment and had no significant effect on the socio-cultural state of the area. Water quality testing revealed that the drinking water from the local sources was not suitable for consumption.'

In [29]:
# Question to answer from the indexed document.
query = "What is Number of foreign and domestic tourists arrivals in Gilgit-Baltistan 2007-2015 Source: G.B Tourism Department?"
# Retrieve the stored chunks most similar to the question (default number of results).
docs = docsearch.similarity_search(query)
# Run the QA chain over the retrieved chunks; its return value is the cell output.
chain.run(input_documents=docs, question=query)

' The total number of foreign tourists in 2007 was 10,338 and the total number of domestic tourists was 123,770. In 2008, the total number of foreign tourists was 8,504 and the total number of domestic tourists was 54,040. In 2009, the total number of foreign tourists was 7,739 and the total number of domestic tourists was 54,602. In 2010, the total number of foreign tourists was 7,728 and the total number of domestic tourists was 45,300. In 2011, the total number of foreign tourists was 5,242 and the total number of domestic tourists was 61,233.'

In [30]:
# Question to answer from the indexed document.
query = "what is Demographic features of respondents?"
# Retrieve the stored chunks most similar to the question (default number of results).
docs = docsearch.similarity_search(query)
# Run the QA chain over the retrieved chunks; its return value is the cell output.
chain.run(input_documents=docs, question=query)

' The demographic features of respondents are gender ratio (68.63% men, 31.37% women) and age group (17.27% between 10-20 years, 41.36% between 20-40 years, 23.63% between 40-60 years, and 17.72% above 60 years).'

In [25]:
# Question to answer from the indexed document.
query = "What is conclusion of this article?"
# Retrieve the stored chunks most similar to the question (default number of results).
docs = docsearch.similarity_search(query)
# Run the QA chain over the retrieved chunks; its return value is the cell output.
chain.run(input_documents=docs, question=query)