# handeled the pdf

In [4]:
from langchain.document_loaders import PyPDFLoader
pdf = PyPDFLoader("https://gemconsortium.org/file/open?fileId=51377")
pages = pdf.load()

# handeled the three links

In [5]:
import pandas as pd
df1 = pd.read_csv("API_IT.NET.USER.ZS_DS2_en_csv_v2_2337.csv", skiprows=4)
df2 = pd.read_csv("API_SI.POV.DDAY_DS2_en_csv_v2_3790.csv", skiprows=4)
df3 = pd.read_csv("API_SL.UEM.TOTL.ZS_DS2_en_csv_v2_149.csv", skiprows=4)

# processing the data to make it sutiable documents for RAG

In [6]:
import pandas as pd

def replace_null_with_no_number(df, start_year, end_year):
    """
    Replace NaN values in columns from start_year to end_year with 'no number available'.

    Parameters:
    df (pd.DataFrame): The dataframe to process.
    start_year (int): The starting year.
    end_year (int): The ending year.

    Returns:
    pd.DataFrame: The updated dataframe.
    """
    for year in range(start_year, end_year + 1):
        col_name = str(year)
        if col_name in df.columns:
            df[col_name].fillna('not available', inplace=True)
    return df



# Replace NaN values for all columns from 1960 to 2023 in df1
df1 = replace_null_with_no_number(df1, 1960, 2023)
df2 = replace_null_with_no_number(df2, 1960, 2023)
df3 = replace_null_with_no_number(df3, 1960, 2023)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col_name].fillna('not available', inplace=True)
  df[col_name].fillna('not available', inplace=True)


In [7]:
import pandas as pd

def create_country_indicator_list(df, start_year, end_year, country_column, code_column, indicator_column):
    """
    Create a list of strings (Country\nCountry Code\nIndicator\nYear\nValue) for each row.

    Parameters:
    df (pd.DataFrame): The dataframe containing data.
    start_year (int): The starting year.
    end_year (int): The ending year.
    country_column (str): The column name for country.
    code_column (str): The column name for country code.
    indicator_column (str): The column name for the indicator.

    Returns:
    list: A list of strings formatted as 'Country Code\nCountry\nIndicator\nYear\nValue'.
    """
    result_list = []
    for _, row in df.iterrows():
        # Check if necessary columns exist in the current row
        if pd.notnull(row[country_column]) and pd.notnull(row[code_column]) and pd.notnull(row[indicator_column]):
            for year in range(start_year, end_year + 1):
                col_name = str(year)
                if col_name in df.columns and pd.notnull(row[col_name]):
                    result_list.append(f"{row[indicator_column]} in {row[country_column]} at {year} are {row[col_name]}")
    return result_list

# Create the string
df1_strings = create_country_indicator_list(df1, 1960, 2023, "Country Name", "Country Code", "Indicator Name")
df2_strings = create_country_indicator_list(df2, 1960, 2023, "Country Name", "Country Code", "Indicator Name")
df3_strings = create_country_indicator_list(df3, 1960, 2023, "Country Name", "Country Code", "Indicator Name")



In [8]:
df1_strings

['Individuals using the Internet (% of population) in Aruba at 1960 are not available',
 'Individuals using the Internet (% of population) in Aruba at 1961 are not available',
 'Individuals using the Internet (% of population) in Aruba at 1962 are not available',
 'Individuals using the Internet (% of population) in Aruba at 1963 are not available',
 'Individuals using the Internet (% of population) in Aruba at 1964 are not available',
 'Individuals using the Internet (% of population) in Aruba at 1965 are not available',
 'Individuals using the Internet (% of population) in Aruba at 1966 are not available',
 'Individuals using the Internet (% of population) in Aruba at 1967 are not available',
 'Individuals using the Internet (% of population) in Aruba at 1968 are not available',
 'Individuals using the Internet (% of population) in Aruba at 1969 are not available',
 'Individuals using the Internet (% of population) in Aruba at 1970 are not available',
 'Individuals using the Internet

In [9]:
from langchain_core.documents import Document

def convert_to_documents(strings_list):
    """
    Convert a list of strings into a list of Document objects.

    Parameters:
    strings_list (List[str]): A list of strings.

    Returns:
    List[Document]: A list of Document objects.
    """
    return [Document(page_content=s) for s in strings_list]

In [10]:
df1_docs = convert_to_documents(df1_strings)
df2_docs = convert_to_documents(df2_strings)
df3_docs = convert_to_documents(df3_strings)

# adding the datasets together with the pdf

In [11]:
docs_all = df1_docs + df2_docs + df3_docs + pages

# loading the default Embeddings model

In [13]:
from langchain_huggingface import HuggingFaceEmbeddings

hf = HuggingFaceEmbeddings()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# creating the vector database with documents and Embeddings model

In [14]:
from langchain.vectorstores import Chroma

In [15]:
persist_directory = 'docs/chroma/'

In [None]:
# !rm -rf ./docs/chroma  # remove old database files if any

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
vectordb = Chroma.from_documents(
    documents=docs_all,
    embedding=hf,
    persist_directory=persist_directory
)

# testing the retrival

In [18]:
question = "what is Individuals using the Internet (% of population) in Azerbaijan at 1998?"
docs = vectordb.similarity_search(question,k=1)


In [19]:
print(docs[0].page_content)

Individuals using the Internet (% of population) in Azerbaijan at 1998 are 0.037485109


# trying the language model and chatbot memory

In [20]:
from langchain_huggingface import HuggingFacePipeline
llm = HuggingFacePipeline.from_model_id(
    model_id="tiiuae/Falcon3-1B-Instruct",
    task="text-generation",
    pipeline_kwargs={"max_new_tokens": 1000},
    device = 0
)

tokenizer_config.json:   0%|          | 0.00/364k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.78M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.34G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

Device set to use cuda:0


In [21]:
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

persist_directory = 'docs/chroma/'
embedding = HuggingFaceEmbeddings()
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

  vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)


In [26]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

  memory = ConversationBufferMemory(


In [40]:
from langchain.chains import ConversationalRetrievalChain
retriever=vectordb.as_retriever(search_kwargs={"k":1})
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory,
)

In [41]:
question = "what is Individuals using the Internet (% of population) in Azerbaijan at 1998?"

In [42]:
result = qa({"question": question})

In [None]:
result['answer']

In [52]:
output = result["answer"]
answer = output.split("Helpful Answer:")[-1].strip()

In [53]:
answer

"Yes, I'm sure of that."