In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

%cd /content/drive/MyDrive/projects_AI/SmartWiki





Mounted at /content/drive
/content/drive/MyDrive/projects_AI/SmartWiki


In [2]:
# Below code is to create a folder data and its been already done
# %mkdir /content/drive/MyDrive/projects_AI/SmartWiki/data
# %ls

In [3]:
# Start the code

import pandas as pd
df = pd.read_excel("data/wiki_excel.xlsx")

url = df['url'].tolist()
print(url)

['https://en.wikipedia.org/wiki/Cognizant', 'https://en.wikipedia.org/wiki/Amazon_(company)', 'https://en.wikipedia.org/wiki/Google', 'https://en.wikipedia.org/wiki/Facebook', 'https://en.wikipedia.org/wiki/Microsoft', 'https://en.wikipedia.org/wiki/Tesla,_Inc.']


In [4]:
# scraping the wiki articles and storing them as txt file in data folder for each wiki
from bs4 import BeautifulSoup
import requests
import re
import os

def scrape_wiki(url):
    r = requests.get(url, headers = {"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(r.text, "lxml")

    content_div = soup.find("div", {"id": "mw-content-text"})
    if content_div:
      paragraphs = content_div.find_all("p")
    else:
      return

    clean_text = []
    for p in paragraphs:
      text = p.get_text()
      text = re.sub(r"\[\d+\]"," ",text)
      clean_text.append(text.strip())
    article_text = "\n".join(clean_text)

    title = soup.find("title")
    file_name = title.get_text().replace(" - Wikipedia","")+".txt"
    output_file = os.path.join("data", file_name)
    with open(output_file, "w", encoding="utf-8") as file:
      file.write(article_text)


for u in url:
  scrape_wiki(u)


In [5]:
# !pip install nltk

In [5]:

import nltk
from nltk.tokenize import sent_tokenize
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [7]:
# %mkdir /content/drive/MyDrive/projects_AI/SmartWiki/artifacts

In [8]:
# %mkdir /content/drive/MyDrive/projects_AI/SmartWiki/artifacts/chunks

In [6]:
import json
def chunk_text(text, chunk_size):
  sentences = sent_tokenize(text)
  chunks = []
  current_chunk = ""
  for s in sentences:
    if len(current_chunk)+ len(s)<=chunk_size:
      current_chunk += s+" "
    else:
      if current_chunk.strip():
        chunks.append(current_chunk.strip())
        current_chunk = s+" "

  if current_chunk.strip():
    chunks.append(current_chunk.strip())
  return chunks

def save_chunks(all_chunks, output_file):
  with open(output_file, "w", encoding="utf-8") as file:
    json.dump(all_chunks, file, indent=2, ensure_ascii = False)

all_chunks = []
for file_name in os.listdir("data"):
  if file_name.endswith(".txt"):
    doc_id = file_name.replace(".txt", "")
    input_file = os.path.join("data", file_name)
    with open(input_file, "r", encoding="utf-8") as file:
      text = file.read()
    chunks = chunk_text(text, 500)
    for i, chunk in enumerate(chunks):
      all_chunks.append({
        "doc_id": doc_id,
        "chunk_id": i,
        "text": chunk
      })
output_file = os.path.join("artifacts/chunks", "final_chunks.json")
save_chunks(all_chunks, output_file)



In [7]:
#loader
def load_chunk():
  with open("artifacts/chunks/final_chunks.json", "r", encoding="utf-8") as file:
    data = json.load(file)
  return data

chunks = load_chunk()


In [8]:
# Embeddings

from sentence_transformers import SentenceTransformer
emb_model = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = []
for chunk in chunks:
  vector = emb_model.encode(chunk["text"])
  embeddings.append({
      "doc_id": chunk["doc_id"],
      "chunk_id": chunk["chunk_id"],
      "embedding": vector.tolist(),
      "text": chunk["text"]
  })

  output_file = os.path.join("artifacts/embeddings", "final_embeddings.json")
  with open(output_file, "w", encoding="utf-8") as file:
    json.dump(embeddings, file, indent=2, ensure_ascii=False)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
def load_embeddings():
  with open("artifacts/embeddings/final_embeddings.json", "r", encoding="utf-8") as file:
    data = json.load(file)
  return data

embeds = load_embeddings()


In [10]:
import numpy as np

vectors = [np.array(e["embedding"], dtype ="float32") for e in embeds]
metadata = [{"doc_id":e["doc_id"], "chunk_id": e["chunk_id"], "text":e["text"]} for e in embeds]

vectors = np.vstack(vectors)
dim = vectors.shape[1]


In [11]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [12]:
# store in faiss

import faiss
index = faiss.IndexFlatL2(dim)
index.add(vectors)
faiss.write_index(index,"artifacts/faiss/my_index.faiss")

In [13]:
#save metadata

with open("artifacts/faiss/metadata.json", "w", encoding="utf-8") as file:
  json.dump(metadata, file, indent=2, ensure_ascii=False)

In [17]:
      # #testing if chunks are retrived properly

      # index = faiss.read_index("artifacts/faiss/my_index.faiss")
      # with open("artifacts/faiss/metadata.json", "r", encoding="utf-8") as file:
      #   metadata = json.load(file)

      # query = "who is the ceo of cognizant"
      # query_vec = model.encode([query]).astype("float32")

      # #search top-k

      # D,I = index.search(query_vec, 3)

      # for idx in I[0]:
      #   print(metadata[idx]["text"])
      #   print(D[0])

In [18]:
# !pip install transformers accelerate torch

In [19]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# microsoft/phi-2
# mistralai/Mistral-7B-Instruct-v0.2
model_name = "tiiuae/falcon-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map ="auto", offload_folder = "offload", torch_dtype = torch.float16)

llm = pipeline("text-generation", model = model, tokenizer = tokenizer)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

Device set to use cuda:0


In [24]:
# RAG pipeline using LLM
index = faiss.read_index("artifacts/faiss/my_index.faiss")
with open("artifacts/faiss/metadata.json", "r", encoding="utf-8") as file:
  metadata = json.load(file)

query = "who is the ceo of cognizant appointed in 2023"

query_embeddings = emb_model.encode([query]).astype("float32")

scores,indices = index.search(query_embeddings,3)
context_chunks = []

for idx in indices[0]:
  context_chunks.append(metadata[idx]["text"])


prompt = f"""
you are a helpful assistant. Use the provided context to answer the question. If answer is not in the context, say "I don't know".
Context:{context_chunks[0]}
Question:{query}
Answer:
"""

#using opensource LLM from hugging face

response = llm(prompt,max_new_tokens = 200, temperature = 0.7, do_sample = True)
print(response[0]["generated_text"])




Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.



you are a helpful assistant. Use the provided context to answer the question. If answer is not in the context, say "I don't know".
Context:On 1 April 2019, Francisco D'Souza was replaced by Brian Humphries as the CEO. In January 2022, Cognizant sold its acquisition Oy Samlink to Kyndryl and Mustache to DJE Holdings. In 2023, Ravi Kumar S was named as CEO of Cognizant. On 22 April 2024, Cognizant announced its partnership with Microsoft Corporation to extend its reach of Generative AI and Copilots, also to enhance experiences of employee and speed up their cross-industry innovation.
Question:who is the ceo of cognizant appointed in 2023
Answer:
Ravi Kumar S


In [22]:
%rm -rf ~/.cache/huggingface
