# 📦 Install Required Packages

In [10]:
# Install required packages
!pip install accelerate==0.21.0 transformers==4.33.1 tokenizers==0.13.3
!pip install bitsandbytes==0.40.0 einops==0.6.1
!pip install xformers==0.0.22.post7
!pip install langchain==0.1.4
!pip install faiss-gpu==1.7.1.post3
# Install SentenceTransformers from PyPI
!pip install sentence_transformers

Collecting transformers==4.33.1
  Downloading transformers-4.33.1-py3-none-any.whl.metadata (119 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.31.0
    Uninstalling transformers-4.31.0:
      Successfully uninstalled transformers-4.31.0
Successfully installed transformers-4.33.1
Collecting sentence_transformers
  Using cached sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence_transformers)
  Using cached transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers<0.20,>=0.19 (from transfor

# 📂 Load the Dataset

In [1]:
import json
import re
import numpy as np
import pandas as pd
# Load dataset
with open('/kaggle/input/news-articles/news.article.json', 'r') as f:
    articles = json.load(f)

In [3]:
first_two = articles[:2]
print(first_two)

[{'articleBody': 'Sanjay Raut, a member of the Shiv Sena (UBT) party, responded to the Maharashtra chief minister\'s statement that Eknath Shinde "himself is Hamas" and that the Shiv Sena group led by Uddhav Thackeray is capable of collaborating with "Hamas and Lashkar-e-Taiba for their own selfishness" on Wednesday by claiming that Eknath Shinde is Hamas.\n\n\n\nRaut made fun of Shinde by claiming, "He himself is Hamas. Hamas and Lashkar-e-Taiba, two terrorist groups, are completely irrelevant in Maharashtra. But the BJP is to blame for sowing the worms in their (the Shinde faction\'s) thoughts, said Raut.\n\nWhen Shinde made a statement at the Tuesday Dussehra rally in Mumbai\'s Azad Maidan, Raut reacted to it. As part of the opposition alliance INDIA, Uddhav Thackeray\'s Shiv Sena (UBT) has formed an alliance with Congress and the Samajwadi Party. Shinde remarked of this alliance: "For their own selfishness, they will tie the knot with Hamas and Lashkar-e-Taiba."\n\nRaut highlighted

In [4]:
# Clean the text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove multiple spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    return text

# Apply cleaning to all articles
cleaned_articles = [{'content': clean_text(article['articleBody'])} for article in articles]

print(f"Total articles loaded: {len(cleaned_articles)}")

Total articles loaded: 37421


In [5]:
first_two = cleaned_articles[:2]
print(first_two)

[{'content': 'Sanjay Raut a member of the Shiv Sena UBT party responded to the Maharashtra chief ministers statement that Eknath Shinde himself is Hamas and that the Shiv Sena group led by Uddhav Thackeray is capable of collaborating with Hamas and LashkareTaiba for their own selfishness on Wednesday by claiming that Eknath Shinde is Hamas Raut made fun of Shinde by claiming He himself is Hamas Hamas and LashkareTaiba two terrorist groups are completely irrelevant in Maharashtra But the BJP is to blame for sowing the worms in their the Shinde factions thoughts said Raut When Shinde made a statement at the Tuesday Dussehra rally in Mumbais Azad Maidan Raut reacted to it As part of the opposition alliance INDIA Uddhav Thackerays Shiv Sena UBT has formed an alliance with Congress and the Samajwadi Party Shinde remarked of this alliance For their own selfishness they will tie the knot with Hamas and LashkareTaiba Raut highlighted that Shindes address differed from the customary Dussehra ra

# 🔍 Filter Relevant Articles

In [6]:
def filter_relevant_articles(articles, keyword='Israel Hamas war'):
    relevant_articles = [article for article in articles if keyword.lower() in article['content'].lower()]
    return relevant_articles

relevant_articles = filter_relevant_articles(cleaned_articles)

print(f"Total relevant articles: {len(relevant_articles)}")

Total relevant articles: 79


# 🤖 Setup and Load the LLM

In [7]:
from torch import cuda, bfloat16
import transformers
from transformers import BitsAndBytesConfig, AutoConfig, AutoModelForCausalLM, AutoTokenizer

model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# hf_auth = '<add your access token here>'
hf_auth = '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'
model_config = AutoConfig.from_pretrained(model_id, use_auth_token=hf_auth)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)

print(f"Model loaded on {device}")




Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 121
CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...


  warn(msg)
2024-06-10 13:29:05.114442: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-10 13:29:05.114566: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-10 13:29:05.244972: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


Model loaded on cuda:0


# 🛑 Define Custom Stopping Criteria


In [8]:
from transformers import StoppingCriteria, StoppingCriteriaList

stop_list = ['\nHuman:', '\n```\n']
stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
import torch

stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]

class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

# 📝 Initialize Text Generation Pipeline

In [9]:
from transformers import pipeline

generate_text = pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # Langchain expects the full text
    task='text-generation',
    stopping_criteria=stopping_criteria,
    temperature=0.1,  # Randomness of outputs
    max_new_tokens=512,  # Max number of tokens to generate
    repetition_penalty=1.1  # To prevent output repetition
)

# Test the pipeline
res = generate_text("What happened at the Al-Shifa Hospital?")
print(res[0]["generated_text"])

What happened at the Al-Shifa Hospital?
 nobody knows. The Sudanese government has been tight-lipped about the incident, and there have been conflicting reports about what exactly happened. Some sources say that the attack was carried out by a group of armed men who entered the hospital and opened fire on patients and medical staff. Others claim that the attack was staged as part of a larger conspiracy to discredit the Sudanese government. Whatever the truth may be, it is clear that the attack on the Al-Shifa Hospital was a horrific act of violence that left many innocent people dead or injured. It is important that we continue to investigate this incident and hold those responsible accountable for their actions.


# 🔗 Integrate with LangChain

In [10]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

# Test LangChain integration
response = llm(prompt="What happened at the Al-Shifa Hospital?")
print(response)

  warn_deprecated(



 Unterscheidung zwischen "Al-Shifa" und "Al Shifa"

The Al-Shifa Hospital is a major medical facility located in the capital city of Riyadh, Saudi Arabia. On April 14, 2015, a fire broke out at the hospital, resulting in significant damage to the building and loss of life. The exact cause of the fire is still under investigation, but it is believed to have been caused by an electrical malfunction.

The incident was widely reported in the local media, with many news outlets providing updates on the situation. According to reports, the fire started in the hospital's intensive care unit (ICU) and quickly spread to other areas of the building. Firefighters were able to contain the blaze and prevent it from spreading to other parts of the city, but not before several people had died and many more were injured.

In the aftermath of the fire, there were concerns about the safety of patients and staff at the hospital. However, officials assured the public that all necessary measures were bein

# 🌟 Retrieval of Data from Websites


In [13]:
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain

web_links = ["https://en.wikipedia.org/wiki/Israel%E2%80%93Hamas_war", "https://edition.cnn.com/middleeast/live-news/israel-hamas-war-gaza-news-06-09-24/index.html", "https://en.wikipedia.org/wiki/Al-Shifa_Hospital_siege#:~:text=After%20a%20two%20week%20siege,hospital%2C%20including%20in%20mass%20graves."]

loader = WebBaseLoader(web_links)
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cpu"}  # Use CPU instead of CUDA

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# Storing embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)

chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True)

chat_history = []

query = "What happened at the Al-Shifa Hospital?"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  warn_deprecated(


 Based on the provided text, it appears that the Israeli military launched a raid on the Al-Shifa Hospital, resulting in the deaths of patients and medical staff, as well as the destruction of the hospital. Witnesses reported seeing Israeli forces firing at the hospital and detaining medical staff and patients. The World Health Organization reported that 21 patients had died during the raid, and Israeli forces claimed to have found weapons hidden in patients' pillows and beds. The hospital was left with blown out windows and blackened concrete walls after the Israeli forces withdrew.
