In [10]:
import os
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import re
from transformers import BertTokenizer, BertModel
import torch
import faiss
import numpy as np

**Parsing Stanford LLMs Lecture Notes**

In [11]:
def preprocess_text(text):
    # Remove LaTeX commands enclosed within \(\) and $$
    text = re.sub(r'\\\((.*?)\\\)', '', text)
    text = re.sub(r'\$\$(.*?)\$\$', '', text)
    # Remove LaTeX commands enclosed within \[ and \]
    text = re.sub(r'\\\[(.*?)\\\]', '', text)
    # Remove other LaTeX commands with backslashes
    text = re.sub(r'\\([^\\]+){(.*?)}', '', text)
    # Remove LaTeX commands without backslashes
    text = re.sub(r'\\(\w+)', '', text)
    return text

# Function to parse lecture notes from a single chapter page
def parse_chapter_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find the HTML element(s) containing the chapter text
        # You need to inspect the HTML structure of the page to identify the correct element(s)
        chapter_text_elements = soup.find_all('div', class_='main-content-wrap')  # Example: <div class="chapter-text">
        # Extract text from the chapter text elements
        chapter_text = '\n'.join([element.get_text() for element in chapter_text_elements])
        # Preprocess the chapter text to remove LaTeX commands
        chapter_text = preprocess_text(chapter_text)
        # Remove extra whitespace and newline characters
        chapter_text = ' '.join(chapter_text.split())
        return chapter_text
    else:
        print(f"Failed to fetch data from {url}")
        return None



# Function to parse lecture notes from the main page containing links to chapters
def parse_main_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all links to individual chapter pages
        chapter_links = soup.find_all('a', href=True)
        # Extract URLs of individual chapter pages excluding links with #
        chapter_urls = [link['href'] for link in chapter_links if '#' not in link['href']]
        return chapter_urls
    else:
        print(f"Failed to fetch data from {url}")
        return []


# Main function to parse lecture notes from all chapters
def parse_all_lecture_notes(main_url):
    # Parse main page to get URLs of individual chapter pages
    chapter_urls = parse_main_page(main_url)
    if not chapter_urls:
        print("No chapter URLs found.")
        return

    # Parse lecture notes from each chapter page
    lecture_notes = {}
    for url in chapter_urls:
        print(url)
        chapter_text = parse_chapter_page(url)
        if chapter_text:
            lecture_notes[url] = chapter_text

    return lecture_notes

# URL of the main page containing links to chapters
main_page_url = "https://stanford-cs324.github.io/winter2022/lectures/"

# Parse lecture notes from all chapters
parsed_lecture_notes = parse_all_lecture_notes(main_page_url)

# Display the parsed lecture notes (for demonstration purposes)
for url, text in parsed_lecture_notes.items():
    print(f"Chapter URL: {url}")
    print(f"Chapter Text: {text[:200]}...")  # Displaying a portion of the text
    print("--------------------------")

chapter_text_data = parsed_lecture_notes

https://stanford-cs324.github.io/winter2022/
https://stanford-cs324.github.io/winter2022/
https://stanford-cs324.github.io/winter2022/calendar/
https://stanford-cs324.github.io/winter2022/lectures/
https://stanford-cs324.github.io/winter2022/lectures/introduction/
https://stanford-cs324.github.io/winter2022/lectures/capabilities/
https://stanford-cs324.github.io/winter2022/lectures/harms-1/
https://stanford-cs324.github.io/winter2022/lectures/harms-2/
https://stanford-cs324.github.io/winter2022/lectures/data/
https://stanford-cs324.github.io/winter2022/lectures/security/
https://stanford-cs324.github.io/winter2022/lectures/legality/
https://stanford-cs324.github.io/winter2022/lectures/modeling/
https://stanford-cs324.github.io/winter2022/lectures/training/
https://stanford-cs324.github.io/winter2022/lectures/parallelism/
https://stanford-cs324.github.io/winter2022/lectures/scaling-laws/
https://stanford-cs324.github.io/winter2022/lectures/selective-architectures/
https://stanford-cs324

**Parsing the table of model architectures**

In [12]:

# URL of the GitHub readme file
git_url = "https://github.com/Hannibal046/Awesome-LLM#milestone-papers"

# Send a GET request to the URL
response = requests.get(git_url)

# Parse the HTML content
soup = BeautifulSoup(response.text, "html.parser")

article = soup.find("article")
# Find the table containing milestone papers within the <article> tag
table = article.find("table")

# Extract table rows
rows = table.find_all("tr")

# Extract data from each row
milestone_papers = []
for row in rows:
    columns = row.find_all("td")
    if len(columns) >= 3:  # Check if the row contains at least three columns
        paper = {

            "Date": columns[0].text.strip(),
            "Title": columns[3].text.strip(),
            "Authors": columns[2].text.strip(),
            "Conference": columns[4].text.strip(),
        }
        milestone_papers.append(paper)

# Print milestone papers
for paper in milestone_papers:
    print("Date:", paper["Date"])
    print("Title:", paper["Title"])
    print("Authors:", paper["Authors"])
    print("Conference:", paper["Conference"])
    print()

Date: 2017-06
Title: Attention Is All You Need
Authors: Google
Conference: NeurIPS

Date: 2018-06
Title: Improving Language Understanding by Generative Pre-Training
Authors: OpenAI
Conference: 

Date: 2018-10
Title: BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
Authors: Google
Conference: NAACL

Date: 2019-02
Title: Language Models are Unsupervised Multitask Learners
Authors: OpenAI
Conference: 

Date: 2019-09
Title: Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism
Authors: NVIDIA
Conference: 

Date: 2019-10
Title: Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer
Authors: Google
Conference: JMLR

Date: 2019-10
Title: ZeRO: Memory Optimizations Toward Training Trillion Parameter Models
Authors: Microsoft
Conference: SC

Date: 2020-01
Title: Scaling Laws for Neural Language Models
Authors: OpenAI
Conference: 

Date: 2020-05
Title: Language models are few-shot learners
Authors: OpenAI


Setting Up the Environment Variable

In [15]:
# !pip install langchain --upgrade
# !pip install langchain-experimental
# !pip install langchain-openai
# !pip install beautifulsoup4
# !pip install faiss-cpu
os.environ['OPENAI_API_KEY']="your-api-key"
# "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"
#"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"
#"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" #"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"

In [16]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.documents import Document
from langchain.chains import create_retrieval_chain
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage


my_list = list(chapter_text_data.values())
llm = ChatOpenAI()
embeddings = OpenAIEmbeddings()
text_splitter = RecursiveCharacterTextSplitter()

# Create a Document object
docs = []
for link, text in chapter_text_data.items():
    doc = Document(page_content=text, metadata={"link": link})
    docs.append(doc)

for paper in milestone_papers:
    paper_text = f"Date: {paper['Date']}\nTitle: {paper['Title']}\nAuthors: {paper['Authors']}\nConference: {paper['Conference']}"
    doc = Document(page_content=paper_text , metadata={"link": git_url})
    docs.append(doc)

documents = text_splitter.split_documents(docs)
vector = FAISS.from_documents(documents, embeddings)

In [17]:
retriever = vector.as_retriever()
# First we need a prompt that we can pass into an LLM to generate this search query
prompt = ChatPromptTemplate.from_messages([
    MessagesPlaceholder(variable_name="chat_history"),
    ("user", "{input}"),
    ("user", "Given the above conversation, generate a search query to look up in order to get information relevant to the conversation")
])

retriever_chain = create_history_aware_retriever(llm, retriever, prompt)
prompt = ChatPromptTemplate.from_messages([
    ("system", "Answer the user's questions based on the below context:\n\n{context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("user", "{input}"),
    # MessagesPlaceholder(variable_name="context"),
])
document_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever_chain, document_chain)
chat_history = [HumanMessage(content="What are LLM models used for?"), AIMessage(content="Text Generation and Language Translation")]

**Example 1:**

In [18]:
response = retrieval_chain.invoke({
    "chat_history": chat_history,
    "input": "What are some milestone model architectures and papers in the last few years?",
})

In [37]:
## GETTING CITATIONS
for res in response["context"]:
    print(res.metadata.get('link'))

https://github.com/Hannibal046/Awesome-LLM#milestone-papers
https://github.com/Hannibal046/Awesome-LLM#milestone-papers
https://stanford-cs324.github.io/winter2022/
https://github.com/Hannibal046/Awesome-LLM#milestone-papers


In [117]:
## GETTING RESPONSE
print(response["answer"])

Some milestone model architectures and papers in the last few years include:

1. BERT (Bidirectional Encoder Representations from Transformers) - Devlin et al., 2018
2. GPT-2 (Generative Pre-trained Transformer 2) - Radford et al., 2019
3. T5 (Text-to-Text Transfer Transformer) - Raffel et al., 2019
4. GPT-3 (Generative Pre-trained Transformer 3) - Brown et al., 2020
5. RoBERTa (A Robustly Optimized BERT Approach) - Liu et al., 2019

These models have significantly advanced the field of natural language processing and have been instrumental in various NLP tasks and applications.


**Bonus Part 1: Citations:**

In [38]:
## COMBINED FUNCTION WHICH TAKES IN QUERY AND GENERATES THE RESPONSE
def get_response(query):
  response = retrieval_chain.invoke({
    "chat_history": chat_history,
    "input": query,
  })
  chat_history.append(HumanMessage(content=query))
  chat_history.append(AIMessage(content=response["answer"]))
  print("Response...\n")
  print(response["answer"])
  print("citations..\n")
  for res in response["context"]:
    print(res.metadata.get('link'))

**Example 2:**

In [118]:
response = retrieval_chain.invoke({
    "chat_history": chat_history,
    "input": "What are the layers in a transformer block?",
})

In [119]:
print(response["answer"])

In a transformer block, there are typically two main layers:

1. Self-Attention Layer: This layer allows each token in the sequence to interact with every other token in the sequence, capturing dependencies and relationships between different parts of the input sequence.

2. Feed-Forward Layer: This layer processes each token independently through a series of fully connected neural networks, providing non-linear transformations to the input embeddings.


**Example 3:**

In [120]:
response = retrieval_chain.invoke({
    "chat_history": chat_history,
    "input": "Tell me about datasets used to train LLMs and how they’re cleaned.",
})

In [121]:
print(response["answer"])

Large language models (LLMs) are trained on datasets that contain vast amounts of text data from various sources like the web, news, Wikipedia, and fiction. These datasets are cleaned to remove noise, irrelevant information, and potentially harmful content. For example, the WebText dataset used to train GPT-2 was created by scraping outbound links with upvotes, filtering out Wikipedia content, and ensuring high-quality diverse text. OpenAI's OpenWebText dataset was created by extracting URLs from Reddit submissions, filtering out non-English text, and removing duplicates. The Colossal Clean Crawled Corpus (C4) used to train the T5 model involved removing "bad words," code, and non-English text. Cleaning the datasets helps improve the quality and reliability of the training data for LLMs.


In [122]:
chat_history.append(HumanMessage(content="Tell me about datasets used to train LLMs and how they’re cleaned."))

**Bonus Part 2:**

**Example 4:(Demonstrating follow ups)**

In [125]:
response = retrieval_chain.invoke({
    "chat_history": chat_history,
    "input": "What are some challenges encountered during cleaning of datasets, and how are these challenges addressed?",
})

In [126]:
print(response["answer"])

During the cleaning of datasets used to train Large Language Models (LLMs), several challenges can be encountered. Some of these challenges include:

1. **Data Quality**: Ensuring that the data is of high quality and relevant to the task at hand. Low-quality or irrelevant data can negatively impact the performance of the model.

2. **Data Bias**: Datasets may contain biases that can lead to discriminatory or unfair outcomes when the model is deployed. Addressing bias in the data is crucial to ensure fair and equitable results.

3. **Data Privacy**: Protecting sensitive information present in the data is essential to maintain privacy and confidentiality. Anonymizing or removing personally identifiable information is often necessary.

4. **Data Duplication**: Duplicates in the dataset can skew the model's training and evaluation. Removing duplicates or handling them appropriately is important for accurate training.

5. **Data Size**: Large datasets can be challenging to clean efficiently

In [135]:
response = retrieval_chain.invoke({
    "chat_history": chat_history,
    "input": "Thanks for answering.",
})

In [136]:
response["answer"]

"You're welcome! If you have any more questions or need further information, feel free to ask. I'm here to help!"

**Interactive Conversation**

In [143]:
flag=1
conversation_history = []
for history in chat_history:
  if(type(history)==HumanMessage):
    conversation_history.append("User : "+history)
  elif(type(history)==AIMessage):
    conversation_history.append("AI : "+history)
  else:
    conversation_history.append("User : "+history)

while(flag):
  query = input()
  if query=='-1':
    break
  response = retrieval_chain.invoke({
    "chat_history": chat_history,
    "input": query,
  })
  chat_history.append(query)
  print("Question asked: \n", query)
  conversation_history.append("User : "+query)
  conversation_history.append("AI : "+response["answer"])
  print(response["answer"])

What are LLM's used for? 
Question asked: 
What are LLM's used for?
Large Language Models (LLMs) are used for various natural language processing tasks such as text generation, language translation, sentiment analysis, question answering, summarization, and more. They are capable of understanding and generating human-like text based on the patterns and data they have been trained on.
Can LLMs summarise data?
Question asked: 
Can LLMs summarise data? 
Yes, Large Language Models (LLMs) are capable of summarizing data. They can be used for text summarization tasks where they condense longer pieces of text into shorter summaries while retaining the key information and meaning. This ability makes them useful for tasks like creating abstracts of articles, condensing reports, or generating brief overviews of documents.
Which LLM should I use for my work?
Question asked: 
Which LLM should I use for my work? 
It depends on the specific natural language processing task you need to perform. Diffe

**Bonus Part 3:**

**Conversation Summarizer**

In [9]:
# Load the summarization pipeline
summarizer = pipeline("summarization")

# Concatenate conversation history into a single text
conversation_text = "\n".join(conversation_history)

# Generate summary of conversation
summary = summarizer(conversation_text, max_length=100, min_length=30, num_beams=4, early_stopping=True)

# Print the summary
print("Summary of conversation:")
print(summary[0]['summary_text'])

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Summary of conversation:
 Large Language Models (LLMs) are used for various natural language processing tasks such as text generation, language translation, sentiment analysis, question answering, summarization, and more . Can be used for text summarization tasks where they condense longer pieces of text into shorter summaries while retaining the key information and meaning . Different LLMs have been trained and specialized for various tasks .
