In [13]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
import glob

base_url = "https://stanford-cs324.github.io/winter2022/lectures/"

def get_title_from_url(url):
    parsed_url = urlparse(url)
    path_components = parsed_url.path.split('/')
    if len(path_components) >= 3:
        return path_components[-2]
    return None

def scrape_lecture_notes(base_url):
    try:
        lecture_notes = []

        response = requests.get(base_url)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        main_content_div = soup.find('div', id='main-content')

        if main_content_div:
            lecture_links = main_content_div.find_all('a', href=True)

            output_folder = r"E:\Assignment2\data\lecture_notes"
            os.makedirs(output_folder, exist_ok=True)

            existing_files = glob.glob(os.path.join(output_folder, '*.txt'))
            for file in existing_files:
                os.remove(file)

            for idx, link in enumerate(lecture_links, start=1):
                lecture_url = urljoin(base_url, link['href'])
                lecture_title = get_title_from_url(lecture_url)

                lecture_response = requests.get(lecture_url)
                lecture_response.raise_for_status()

                lecture_soup = BeautifulSoup(lecture_response.content, 'html.parser')

                lecture_main_content = lecture_soup.find('div', id='main-content')

                if lecture_main_content:
                    lecture_content = lecture_main_content.text.strip()

                    filename = os.path.join(output_folder, f"{lecture_title}.txt")
                    with open(filename, 'w', encoding='utf-8') as f:
                        f.write(f"Lecture: {lecture_title}\n\n")
                        f.write(lecture_content)
                    
                    lecture_notes.append((lecture_title, lecture_content))
                else:
                    print(f"Warning: No main content found for lecture: {lecture_url}")
        else:
            print(f"Error: No main content div found on the page: {base_url}")

        return lecture_notes

    except Exception as e:
        print(f"Error fetching lecture notes: {e}")
        return []

lecture_notes = scrape_lecture_notes(base_url)



<generator object DataFrame.iterrows at 0x00000197384CFA50>


In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

github_url = "https://github.com/Hannibal046/Awesome-LLM#milestone-papers"

def scrape_milestone_papers(github_url, save_path):
    try:
        response = requests.get(github_url)
        response.raise_for_status()  # Raise an exception for bad response status

        soup = BeautifulSoup(response.content, 'html.parser')

        tables = soup.find_all('table')

        milestone_table = None
        for table in tables:
            if not table.has_attr('class') and not table.has_attr('id'):
                milestone_table = table
                break

        if milestone_table:
            df = pd.read_html(str(milestone_table))[0]

            df.columns = ['Date', 'Keywords', 'Institute', 'Paper', 'Publication']

            df = df.iloc[1:]

            if os.path.exists(save_path):
                os.remove(save_path)
                print(f"Deleted existing file: {save_path}")

            output_folder = os.path.dirname(save_path)
            os.makedirs(output_folder, exist_ok=True)
            df.to_csv(save_path, index=False)
            print(f"Milestone papers table saved as {save_path}")

            return df
        else:
            print("Error: No suitable table found on the page.")
            return None

    except Exception as e:
        print(f"Error fetching milestone papers: {e}")
        return None

save_path = r"E:\Assignment2\data\milestone_papers.csv"
milestone_papers_df = scrape_milestone_papers(github_url, save_path)
if milestone_papers_df is not None:
    print("Milestone Papers Table:")
    print(milestone_papers_df.head())


Deleted existing file: E:\Assignment2\data\milestone_papers.csv
Milestone papers table saved as E:\Assignment2\data\milestone_papers.csv
Milestone Papers Table:
      Date     Keywords Institute  \
1  2018-06      GPT 1.0    OpenAI   
2  2018-10         BERT    Google   
3  2019-02      GPT 2.0    OpenAI   
4  2019-09  Megatron-LM    NVIDIA   
5  2019-10           T5    Google   

                                               Paper Publication  
1  Improving Language Understanding by Generative...         NaN  
2  BERT: Pre-training of Deep Bidirectional Trans...       NAACL  
3  Language Models are Unsupervised Multitask Lea...         NaN  
4  Megatron-LM: Training Multi-Billion Parameter ...         NaN  
5  Exploring the Limits of Transfer Learning with...        JMLR  


Error extracting PDF text from Improving Language Understanding by Generative Pre-Training: Invalid URL 'Improving Language Understanding by Generative Pre-Training': No scheme supplied. Perhaps you meant https://Improving Language Understanding by Generative Pre-Training?
Saved text from 'Improving Language Understanding by Generative Pre-Training' PDF as 'Improving Language Understanding by Generative Pre-Training.txt'
Error extracting PDF text from BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding: No connection adapters were found for 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding'
Saved text from 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding' PDF as 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.txt'
Error extracting PDF text from Language Models are Unsupervised Multitask Learners: Invalid URL 'Language Models are Unsupervised Multitask Learners

In [14]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib.parse

github_url = "https://github.com/Hannibal046/Awesome-LLM#milestone-papers"

def scrape_milestone_papers(github_url):
    try:
        response = requests.get(github_url)
        response.raise_for_status() 

        soup = BeautifulSoup(response.content, 'html.parser')

        tables = soup.find_all('table')

        milestone_table = None
        for table in tables:
            if not table.has_attr('class') and not table.has_attr('id'):
                milestone_table = table
                break

        if milestone_table:
            df = pd.read_html(str(milestone_table))[0]

            paper_links = extract_links_from_paper_column(df, 'Paper')

            return paper_links
        else:
            print("Error: No suitable table found on the page.")
            return None

    except Exception as e:
        print(f"Error fetching milestone papers: {e}")
        return None

def extract_links_from_paper_column(df, column_name):
    links = []
    for index, row in df.iterrows():
        paper_data = row[column_name]
        soup = BeautifulSoup(paper_data, 'html.parser')
        for link in soup.find_all('a', href=True):
            links.append(link['href'])
    return links

paper_links = scrape_milestone_papers(github_url)
if paper_links:
    print("Extracted Paper Links:")
    for link in paper_links:
        print(link)


In [6]:
pip install PyPDF2

Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from PyPDF2 import PdfReader

github_url = "https://github.com/Hannibal046/Awesome-LLM#milestone-papers"

def scrape_milestone_papers(github_url, save_folder):
    try:
        response = requests.get(github_url)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        tables = soup.find_all('table')

        milestone_table = None
        for table in tables:
            if not table.has_attr('class') and not table.has_attr('id'):
                milestone_table = table
                break

        if milestone_table is None:
            raise Exception("Error: No suitable table found in milestone papers section.")

        df = pd.read_html(str(milestone_table))[0]
        df.columns = ['Date', 'Keywords', 'Institute', 'Paper', 'Publication']
        df = df.iloc[1:]

        os.makedirs(save_folder, exist_ok=True)
        for filename in os.listdir(save_folder):
            file_path = os.path.join(save_folder, filename)
            try:
                if filename.endswith(".txt"):
                    os.remove(file_path)
                    print(f"Deleted existing file: {file_path}")
            except Exception as e:
                print(f"Error deleting file {file_path}: {e}")

        all_links = milestone_table.find_all('a', href=True)

        valid_href_values = []
        for link in all_links:
            if 'target' in link.attrs and link['target'] == '_blank':
                continue
            valid_href_values.append(link['href'])

        print(f"All links found in the milestone papers table:")
        print(valid_href_values)

        for idx, link in enumerate(valid_href_values):
            pdf_text = extract_pdf_text(link)

            keyword = df.iloc[idx]['Keywords']
            if not keyword:
                keyword = f"pdf{idx + 1}"

            txt_filename = f"{keyword}.txt"
            txt_filepath = os.path.join(save_folder, txt_filename)
            with open(txt_filepath, 'w', encoding='utf-8') as f:
                f.write(pdf_text)
            print(f"Saved text from PDF {idx + 1} as '{txt_filename}'")

        return df

    except requests.exceptions.HTTPError as e:
        print(f"HTTP error occurred: {e}")
    except requests.exceptions.RequestException as e:
        print(f"Request exception occurred: {e}")
    except Exception as e:
        print(f"Error fetching milestone papers: {e}")

    return None

def extract_pdf_text(pdf_url):
    try:
        response = requests.get(pdf_url, stream=True)
        response.raise_for_status()

        temp_pdf_path = 'temp.pdf'
        with open(temp_pdf_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)

        pdf_text = ''
        with open(temp_pdf_path, 'rb') as f:
            pdf_reader = PdfReader(f)
            for page in pdf_reader.pages:
                pdf_text += page.extract_text()

        os.remove(temp_pdf_path)

        return pdf_text.strip()

    except requests.exceptions.HTTPError as e:
        print(f"HTTP error occurred while downloading PDF from {pdf_url}: {e}")
        return ''
    except requests.exceptions.RequestException as e:
        print(f"Request exception occurred while downloading PDF from {pdf_url}: {e}")
        return ''
    except Exception as e:
        print(f"Error extracting PDF text from {pdf_url}: {e}")
        return ''

# Example usage
save_folder = r"E:\Assignment2\data\milestone_papers_text"
milestone_papers_df = scrape_milestone_papers(github_url, save_folder)
if milestone_papers_df is not None:
    print("Milestone Papers Table:")
    print(milestone_papers_df.head())


Deleted existing file: E:\Assignment2\data\milestone_papers_text\BERT.txt
Deleted existing file: E:\Assignment2\data\milestone_papers_text\GPT 1.0.txt
All links found in the milestone papers table:
['https://arxiv.org/pdf/1706.03762.pdf', 'https://www.cs.ubc.ca/~amuham01/LING530/papers/radford2018improving.pdf', 'https://aclanthology.org/N19-1423.pdf', 'https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf', 'https://arxiv.org/pdf/1909.08053.pdf', 'https://jmlr.org/papers/v21/20-074.html', 'https://arxiv.org/pdf/1910.02054.pdf', 'https://arxiv.org/pdf/2001.08361.pdf', 'https://papers.nips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf', 'https://arxiv.org/pdf/2101.03961.pdf', 'https://arxiv.org/pdf/2107.03374.pdf', 'https://arxiv.org/pdf/2108.07258.pdf', 'https://openreview.net/forum?id=gEZrGCozdqR', 'https://arxiv.org/abs/2110.08207', 'https://arxiv.org/pdf/2112.06905.pdf', 'https://www.semanticscholar.org/pap

In [2]:
pip install -U langchain-community


Collecting langchain-community
  Using cached langchain_community-0.2.5-py3-none-any.whl.metadata (2.5 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Using cached marshmallow-3.21.3-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Using cached typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Using cached mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Using cached langchain_community-0.2.5-py3-none-any.whl (2.2 MB)
Using cached dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Using cached marshmallow-3.21.3-py3-none-any.whl (49 kB)
Using cached typing_inspect-0.9.0-py3-none-any.whl (8.

In [5]:
import os
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

In [13]:
import os
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import SequentialChain, LLMChain
from langchain.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate

# Ensure the environment variable is set
api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
if not api_key:
    raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable not set")

# Initialize LLM with API key
llm = HuggingFaceHub(
    repo_id='google/flan-t5-small', 
    model_kwargs={'temperature': 0.7, 'max_length': 512}
)

# Define prompts for each step in the sequential chain
preprocess_prompt = PromptTemplate.from_template("Preprocess the document chunk to extract key information: {chunk}")
analysis_prompt = PromptTemplate.from_template("Analyze the key information from the preprocessed chunk: {key_info}")

# Create the SequentialChain
sequential_chain = SequentialChain(
    chains=[
        LLMChain(llm=llm, prompt=preprocess_prompt, output_key="key_info"),
        LLMChain(llm=llm, prompt=analysis_prompt, output_key="analysis")
    ],
    
    input_variables=["chunk"],
    output_variables=["analysis"]
)


In [14]:

def load_documents(directory_path):
    loader = DirectoryLoader(directory_path)
    documents = loader.load()
    return [doc.page_content for doc in documents]  # Access the content attribute directly

def preprocess_documents(documents, chunk_size=512, overlap=50):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    chunks = []
    for doc in documents:
        chunks.extend(splitter.split_text(doc))
    return chunks


def analyze_query(doc_chunks, query):
    results = []
    for chunk in doc_chunks:
        analysis = sequential_chain({"chunk": chunk})
        results.append(analysis['analysis'])
    # Aggregate the results
    integrated_response = aggregate_results(results, query)
    return integrated_response

def aggregate_results(results, query):
    # Integrate the results to form a coherent response to the query
    integrated_response = " ".join(results)
    return integrated_response


In [16]:

def main():
    # Load and preprocess documents
    documents = load_documents('E:/Assignment2/data/milestone_papers_text')
    doc_chunks = preprocess_documents(documents)

    # Query to be answered
    query = "What are some milestone model architectures and papers in the last few years?"

    # Analyze query using the document chunks
    response = analyze_query(doc_chunks, query)
    print(f"Response: {response}")

if __name__ == "__main__":
    main()


HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://api-inference.huggingface.co/models/google/flan-t5-small (Request ID: 1AgQcmt0E3Pifuo02sZlM)

Rate limit reached. You reached free usage limit (reset hourly). Please subscribe to a plan at https://huggingface.co/pricing to use the API at this rate

In [None]:
from langchain.chains import SequentialChain
from langchain.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate

# Initialize LLM
llm = HuggingFaceHub(repo_id='google/flan-t5-small', model_kwargs={'temperature': 0.7, 'max_length': 512})

# Define prompts for each step in the sequential chain
preprocess_prompt = PromptTemplate("Preprocess the document chunk to extract key information: {chunk}")
analysis_prompt = PromptTemplate("Analyze the key information from the preprocessed chunk: {key_info}")

# Create the SequentialChain
sequential_chain = SequentialChain(
    chains=[
        LLMChain(llm=llm, prompt=preprocess_prompt, output_key="key_info"),
        LLMChain(llm=llm, prompt=analysis_prompt, output_key="analysis")
    ],
    input_key="chunk",
    output_key="final_analysis"
)


In [None]:
def analyze_query(doc_chunks, query):
    results = []
    for chunk in doc_chunks:
        analysis = sequential_chain({"chunk": chunk})
        results.append(analysis['final_analysis'])
    # Aggregate the results
    integrated_response = aggregate_results(results, query)
    return integrated_response

def aggregate_results(results, query):
    # Integrate the results to form a coherent response to the query
    integrated_response = " ".join(results)
    return integrated_response

# Example usage
def main():
    # Load and preprocess documents
    documents = load_documents('E:\Assignment2\data\milestone_papers_text')
    doc_chunks = preprocess_documents(documents)

    # Query to be answered
    query = "What are some milestone model architectures and papers in the last few years?"

    # Analyze query using the document chunks
    response = analyze_query(doc_chunks, query)
    print(f"Response: {response}")

if __name__ == "__main__":
    main()


In [47]:
import os
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_documents(directory_path):
    loader = DirectoryLoader(directory_path)
    documents = loader.load()
    return [doc.page_content for doc in documents]

def preprocess_documents(documents, chunk_size=512, overlap=50):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    chunks = []
    for doc in documents:
        chunks.extend(splitter.split_text(doc))
    return chunks


In [48]:
import faiss
import numpy as np

# Initialize FAISS index
dimension = 768  # Dimension of the vector embeddings (adjust as per your model)
index = faiss.IndexFlatL2(dimension)  # Example of using L2 distance metric

def build_index(doc_chunks, embeddings):
    for i, embedding in enumerate(embeddings):
        index.add(np.array([embedding]))  # Add each document's embedding to the index

def search_similar(query_embedding, k=5):
    D, I = index.search(np.array([query_embedding]), k)  # Search for k nearest neighbors
    return I[0]  # Return indices of the nearest neighbors in the index

# Example function to get embeddings (assuming they are precomputed)
def get_embeddings(doc_chunks):
    embeddings = []
    for chunk in doc_chunks:
        embedding = compute_embedding(chunk)  # Replace with your embedding function
        embeddings.append(embedding)
    return embeddings

# Replace with your actual embedding computation function
def compute_embedding(text):
    # Placeholder function for computing embeddings
    return np.random.rand(dimension)  # Example: Random vector

# Example function to handle a query
def handle_query(query, doc_chunks, embeddings):
    query_embedding = compute_embedding(query)  # Compute embedding for the query
    similar_indices = search_similar(query_embedding)
    similar_chunks = [doc_chunks[i] for i in similar_indices]
    return similar_chunks

# Main function
def main():
    # Load and preprocess documents
    documents = load_documents('E:/Assignment2/data/milestone_papers_text')
    doc_chunks = preprocess_documents(documents)

    # Compute embeddings for document chunks
    embeddings = get_embeddings(doc_chunks)

    # Build index with embeddings
    build_index(doc_chunks, embeddings)

    # Example query
    query = "What are some milestone model architectures and papers in the last few years?"

    # Handle query
    similar_chunks = handle_query(query, doc_chunks, embeddings)
    print("Similar chunks:")
    for chunk in similar_chunks:
        print(chunk)
    output_folder = r"E:\Assignment2\notebooks"

    # Ensure the folder exists; create it if it doesn't
    os.makedirs(output_folder, exist_ok=True)

    # Example list of chunks

    # Define the filename within the output folder
    filename = os.path.join(output_folder, "a.txt")

    # Writing similar_chunks to a.txt
    with open(filename, 'w', encoding='utf-8') as f:
        for chunk in similar_chunks:
            f.write(chunk + '\n')  
if __name__ == "__main__":
    main()


Similar chunks:
CSCW (2019).

Nayeon Lee, Yejin Bang, Andrea Madotto, Madian Khabsa, and Pascale Fung. 2021a. Towards Few-shot Fact-Checking via

Perplexity. In NAACL .

Yong Jae Lee, Joydeep Ghosh, and K. Grauman. 2012. Discovering important people and objects for egocentric video

summarization. 2012 IEEE Conference on Computer Vision and Pattern Recognition (2012), 1346–1353.

James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, and Santiago Ontanon. 2021. FNet: Mixing Tokens with Fourier Transforms.
exhibit harmful biases targeting specific groups (e.g., gender and racial bias) [Bender et al .2021;

Basta et al .2019; Kurita et al .2019; Sheng et al .2019] and can produce toxic outputs [Gehman et al .

2020] (§5.2: misuse ). While strategies such as further fine-tuning the foundation model on carefully

curated datasets (for potentially multiple generations) [Solaiman and Dennison 2021] or applying

controllable generation techniques [Keskar et al .2019] have shown some success in mitig

In [25]:
output_folder="E:\Assignment2\notebooks\"

SyntaxError: EOL while scanning string literal (3968283757.py, line 1)

In [26]:
import os

# Define the output folder path
output_folder = r"E:\Assignment2\notebooks"

# Ensure the folder exists; create it if it doesn't
os.makedirs(output_folder, exist_ok=True)

# Example list of chunks

# Define the filename within the output folder
filename = os.path.join(output_folder, "a.txt")

# Writing similar_chunks to a.txt
with open(filename, 'w', encoding='utf-8') as f:
    for chunk in similar_chunks:
        f.write(chunk + '\n')  # Write each chunk followed by a newline

print(f"Similar chunks written to {filename}")


NameError: name 'similar_chunks' is not defined

In [None]:
for chunk in similar_chunks:
        print(chunk)

In [2]:
import os 
output_folder = r"E:\Assignment2\notebooks"
filename = os.path.join(output_folder, "a.txt")

# Read the contents of the text document
with open(filename, 'r', encoding='utf-8') as f:
    text_content = f.read()


In [3]:
text_content

'of service, find relevant patents, and conduct other pre-litigation processes in order to ensure\n\nthat their clients are at an advantage [Betts and Jaep 2017; Elwany et al .2019; Lippi et al .2019;\n\nLee and Hsiang 2019; Hendrycks et al .2021c; Hegel et al .2021]. Notably, recent work has both\n\ndescribed the challenges and benefits of using foundation models for contract review [Leivaditi\n\net al.2020; Hegel et al .2021; Hendrycks et al .2021c]. In addition to reviewing and drafting legal\nHerv\x13 e J\x13 egou. Large memory layers with product keys. In Advances in Neural Information\n\nProcessing Systems , pages 8548{8559, 2019.\n\nKatherine Lee, Daphne Ippolito, Andrew Nystrom, Chiyuan Zhang, Douglas Eck, Chris\n\nCallison-Burch, and Nicholas Carlini. Deduplicating training data makes language models\n\nbetter. arXiv preprint arXiv:2107.06499 , 2021.\n\nDmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping\nA:Thecoin wasﬂipped byMaybelle. Sothecoin wa

In [43]:
from transformers import pipeline
import os
from langchain.chains import SequentialChain, LLMChain
from langchain.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
# Define the path to the text documen
# Create HuggingFaceHub configuration
llm1 = HuggingFaceHub(
    repo_id='meta-llama/Meta-Llama-3-8B-Instruct', 
    model_kwargs={'temperature': 0.3, 'max_length': 256}
)

# Define prompts for each step in the sequential chain
preprocess_prompt = PromptTemplate.from_template("{query}. Write answer of the above question from the below text. {text}. ")
analysis_prompt = PromptTemplate.from_template("Analysis prompt: {text}. Query: {query}")

# Create the SequentialChain
sequential_chain = SequentialChain(
    chains=[
        LLMChain(llm=llm1, prompt=preprocess_prompt, output_key="key_info"),
        LLMChain(llm=llm1, prompt=analysis_prompt, output_key="analysis")
    ],
    input_variables=["text","query"],
    output_variables=["key_info"]
)

# Prompt user for a query
query = "What are some milestone model architectures and papers in the last few years?"


# Process text_content through the SequentialChain
response = sequential_chain({"text": text_content, "query": query})

# Print the generated answer
print(f"Response: {response}")


Response: {'text': 'of service, find relevant patents, and conduct other pre-litigation processes in order to ensure\n\nthat their clients are at an advantage [Betts and Jaep 2017; Elwany et al .2019; Lippi et al .2019;\n\nLee and Hsiang 2019; Hendrycks et al .2021c; Hegel et al .2021]. Notably, recent work has both\n\ndescribed the challenges and benefits of using foundation models for contract review [Leivaditi\n\net al.2020; Hegel et al .2021; Hendrycks et al .2021c]. In addition to reviewing and drafting legal\nHerv\x13 e J\x13 egou. Large memory layers with product keys. In Advances in Neural Information\n\nProcessing Systems , pages 8548{8559, 2019.\n\nKatherine Lee, Daphne Ippolito, Andrew Nystrom, Chiyuan Zhang, Douglas Eck, Chris\n\nCallison-Burch, and Nicholas Carlini. Deduplicating training data makes language models\n\nbetter. arXiv preprint arXiv:2107.06499 , 2021.\n\nDmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping\nA:Thecoin wasﬂipped byMay

In [44]:
response_text=response["key_info"]

In [45]:
text_content_position = response_text.find(text_content)
if text_content_position != -1:
    response_text = response_text[text_content_position + len(text_content):]


In [46]:
print(response_text)

. 2019; Kroll et al.2020]. Moreover, the lack of transparency and accountability in AI

systems can lead to a lack of trust in the decision-making process [Kroll et al.2020; Wang et al.

2020]. Therefore, it is crucial to develop AI systems that are transparent, explainable, and

accountable [Kroll et al.2020; Wang et al.2020]. Recent work has proposed various methods for

improving the transparency


In [51]:
len("many other researchers and engineers. We acknowledge the immense intellectual and practical contribution the full potential of foundation models, as is envisioned and discussed in detail throughout this report, will rely on research of new architectural and modeling advances to fulfill these desiderata.On the Opportunities and Risks of Foundation Models 81 4.2 Training Authors: Alex Tamkin Training objectives are mathematical functions describing how to transform a model architecture and large amount of broad data into a foundation model. For example, G")

559