In [2]:
import requests
from newspaper import Article

# Browser-like User-Agent sent with the request for the article page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}

article_url = "https://www.news24.com/news24/politics/political-parties/anc-u-turn-on-mbalulas-threats-to-remove-pravin-gordhan-over-railway-problems-20230723"

# The context manager releases the session's pooled connections once the cell is done.
with requests.Session() as session:
    try:
        response = session.get(article_url, headers=headers, timeout=10)

        if response.status_code == 200:
            # requests falls back to ISO-8859-1 for text responses that declare no
            # charset; in that case let it detect the encoding from the body instead.
            if 'charset' not in response.headers.get('Content-Type', '').lower():
                response.encoding = response.apparent_encoding

            article = Article(article_url)
            # Reuse the HTML fetched above. A bare download() would issue a second
            # request that ignores our headers and timeout.
            article.download(input_html=response.text)
            article.parse()

            print(f"Title: {article.title}")
            print(f"Text: {article.text}")

        else:
            print(f"Failed to fetch article at {article_url}")
    except Exception as e:
        # Best-effort cell: report the problem instead of raising. Note that `article`
        # stays undefined in that case, so the cells below cannot run.
        print(f"Error occurred while fetching article at {article_url}: {e}")

Title: ANC U-turn on Mbalula's threats to remove Pravin Gordhan over railway problems
Text: ANC secretary-general Fikile Mbalula made a threat that if Public Enterprises Minister Pravin Gordhan does not perform, he will be removed from his job.

Mbalula blamed the high number of accidents on the roads on the presence of trucks carrying goods that could be transported by rail.

A few hours after this threat was issued, the ANC said Gordhan's job was not on the line, as alleged by Mbalula.

Hours after ANC secretary-general Fikile Mbalula warned of removing Public Enterprises Minister Pravin Gordhan if he does not perform, the ANC made a U-turn saying the minister was safe in his job.

Mbalula, in a speech at the ANC Women's League conference in Johannesburg on Saturday, told delegates that the party expected performance from its deployees to the government.

He singled out Gordhan and said he had to work quickly in fixing the problems faced by the country's rail system - saying if he di

In [14]:
from langchain.schema import (
    HumanMessage
)

# Take the article data produced by the scraping cell.
article_title = article.title
article_text = article.text

# Few-shot prompt: two worked examples show the expected bullet style, followed by
# placeholders for the article to summarize.
template = """
As an advanced AI, you've been tasked to summarize online articles into bulleted points. Here are a few examples of how you've done this in the past:

Example 1:
Original Article: 'The Effects of Climate Change'
Summary:
- Climate change is causing a rise in global temperatures.
- This leads to melting ice caps and rising sea levels.
- Resulting in more frequent and severe weather conditions.

Example 2:
Original Article: 'The Evolution of Artificial Intelligence'
Summary:
- Artificial Intelligence (AI) has developed significantly over the past decade.
- AI is now used in multiple fields such as healthcare, finance, and transportation.
- The future of AI is promising but requires careful regulation.

Now, here's the article you need to summarize:

==================
Title: {article_title}

{article_text}
==================

Please provide a summarized version of the article in a bulleted list format.
"""

# Fill the placeholders from the variables defined above, so the prompt and any later
# cell that reuses article_title / article_text are guaranteed to see the same values.
prompt = template.format(article_title=article_title, article_text=article_text)

# The chat model takes a list of messages; the whole prompt goes in one human message.
messages = [HumanMessage(content=prompt)]

In [15]:
from langchain.chat_models import ChatOpenAI

# chat model used for the bullet-point summary (temperature fixed at 0)
chat = ChatOpenAI(
    temperature=0,
    model_name="gpt-3.5-turbo",
)

In [16]:
# generate summary: send the prepared message list to the chat model,
# then print the text content of the message it returns
summary = chat(messages)
print(summary.content)

- ANC secretary-general Fikile Mbalula threatened to remove Public Enterprises Minister Pravin Gordhan if he does not perform.
- Mbalula blamed the high number of road accidents on trucks carrying goods that could be transported by rail.
- The ANC later made a U-turn, stating that Gordhan's job was safe.
- Mbalula emphasized the need for Gordhan to quickly fix the problems in the country's rail system.
- The ANC clarified that calling for policy interventions does not mean calling for Gordhan's removal.
- Gordhan recently appointed a new board at Transnet to address the struggling state-owned enterprise.


## Output Parsers


In [17]:
from langchain.output_parsers import PydanticOutputParser
from pydantic import validator
from pydantic import BaseModel, Field
from typing import List


# Schema the model output is parsed into: the article title plus its bullet points.
# (Described with comments rather than a class docstring, because pydantic copies a
# class docstring into the generated schema.)
class ArticleSummary(BaseModel):
    title: str = Field(description="Title of the article")
    summary: List[str] = Field(description="Bulleted list summary of the article")

    # Accept the summary only when it has at least three entries.
    @validator('summary')
    def has_three_or_more_lines(cls, bullet_points):
        if len(bullet_points) >= 3:
            return bullet_points
        raise ValueError("Generated summary has less than three bullet points!")

# Parser that turns raw model text into an ArticleSummary instance
parser = PydanticOutputParser(pydantic_object=ArticleSummary)

In [18]:
from langchain.prompts import PromptTemplate


# Prompt text with three placeholders: the article title, the article body, and the
# output-format instructions.
template = """
You are a very good assistant that summarizes online articles.

Here's the article you want to summarize.

==================
Title: {article_title}

{article_text}
==================

{format_instructions}
"""

# "format_instructions" is bound up front through partial_variables (its value comes
# from the output parser), so only the two article fields remain to be supplied.
prompt = PromptTemplate(
    input_variables=["article_title", "article_text"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
    template=template,
)

# Fill in the remaining fields with the scraped article data
formatted_prompt = prompt.format_prompt(
    article_text=article_text,
    article_title=article_title,
)

In [19]:
from langchain.llms import OpenAI


# LLM wrapper used for the structured summary
model = OpenAI(temperature=0.0, model_name="text-davinci-003")

# Render the prompt to a plain string and send it to the model
prompt_text = formatted_prompt.to_string()
output = model(prompt_text)

# Parse the raw model text with the output parser, then show the result
parsed_output = parser.parse(output)
print(parsed_output)

title="ANC U-turn on Mbalula's threats to remove Pravin Gordhan over railway problems" summary=['ANC secretary-general Fikile Mbalula made a threat that if Public Enterprises Minister Pravin Gordhan does not perform, he will be removed from his job.', "A few hours after this threat was issued, the ANC said Gordhan's job was not on the line, as alleged by Mbalula.", 'Mbalula blamed the high number of accidents on the roads on the presence of trucks carrying goods that could be transported by rail.', 'Former transport minister Mbalula blamed the high number of road deaths on the increased presence of trucks carrying goods that could otherwise be transported by rail.', "But on Saturday night, the ANC issued a statement qualifying Mbalula's comments and saying Gordhan was safe in his job.", 'Gordhan recently announced a new board at Transnet had been appointed to turn around the struggling state-owned enterprise.']


## Embeddings

In [21]:
import openai
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain.embeddings import OpenAIEmbeddings

# Small corpus to search: two sentences about a cat, two about a dog
documents = [
    "The cat is on the mat.",
    "There is a cat on the mat.",
    "The dog is in the yard.",
    "There is a dog in the yard.",
]

# Embedding client
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Embed every document, then embed the query the same way
document_embeddings = embeddings.embed_documents(documents)

query = "A cat is sitting on a mat."
query_embedding = embeddings.embed_query(query)

# The query is passed as a one-row batch, so row 0 of the result holds its
# similarity score against each document
similarity_scores = cosine_similarity([query_embedding], document_embeddings)[0]

# The highest score identifies the closest document
most_similar_index = similarity_scores.argmax()
most_similar_document = documents[most_similar_index]

print(f"Most similar document to the query '{query}':", most_similar_document, sep="\n")



Most similar document to the query 'A cat is sitting on a mat.':
The cat is on the mat.


## Vector storage

In [22]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

In [24]:
# Source facts to index: one short sentence per person, all phrased the same way so
# that every record actually states a birth date.
texts = [
    "Napoleon Bonaparte was born in 15 August 1769",
    "Louis XIV was born in 5 September 1638",
    "Lady Gaga was born in 28 March 1986",
    "Michael Jeffrey Jordan was born in 17 February 1963",
    "Toni Kroos was born in 15 February 1988"
]

# Wrap each string in a document object. Every text is far shorter than chunk_size,
# so no text is expected to be split.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.create_documents(texts)

In [25]:
# embedding model used to vectorize the documents
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# location of the Deep Lake dataset: hub://<organization id>/<dataset name>
# TODO: use your organization id here. (by default, org id is your username)
my_activeloop_org_id = "sathekgealoy"
my_activeloop_dataset_name = "langchain_course_embeddings"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"

# vector store at that path, using the embedding model above
db = DeepLake(embedding_function=embeddings, dataset_path=dataset_path)

# store the documents; left as the last expression so the cell displays the returned value
db.add_documents(docs)

Your Deep Lake dataset has been successfully created!


 

Dataset(path='hub://sathekgealoy/langchain_course_embeddings', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
 embedding  embedding  (5, 1536)  float32   None   
    id        text      (5, 1)      str     None   
 metadata     json      (5, 1)      str     None   
   text       text      (5, 1)      str     None   


['5fb877c5-2a35-11ee-b60b-7ce9d32d57bd',
 '5fb877c6-2a35-11ee-98a7-7ce9d32d57bd',
 '5fb877c7-2a35-11ee-b082-7ce9d32d57bd',
 '5fb877c8-2a35-11ee-b8f9-7ce9d32d57bd',
 '5fb877c9-2a35-11ee-9aea-7ce9d32d57bd']

In [26]:
# create a retriever from the Deep Lake vector store `db`;
# the question-answering chain below is built on top of it
retriever = db.as_retriever()

In [29]:
# instantiate the chat LLM wrapper
model = ChatOpenAI(model='gpt-3.5-turbo')

# build the question-answering chain from the chat model and the retriever
qa_chain = RetrievalQA.from_llm(llm=model, retriever=retriever)

# ask the chain a question; left as the last expression so the answer is displayed
qa_chain.run("When was toni kros born?")

'Toni Kroos was born on 15 February 1988.'