In [5]:
from newspaper import Article
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np

**1. Read the File**

Read the content of
https://www.washingtonpost.com/world/2025/06/13/air-india-plane-crash-survivor-vishwash-kumar-ramesh/
into a Python variable.
Then keep only the first 700 characters of the article text.

In [8]:
# Address of the article to load (scheme + host, dated section path, article slug).
url = (
    "https://www.washingtonpost.com"
    "/world/2025/06/13"
    "/air-india-plane-crash-survivor-vishwash-kumar-ramesh/"
)

# Download the page and parse it; the extracted body is then read from `article.text`.
article = Article(url)
article.download()
article.parse()

# Keep the complete text available, but work with only its first 700 characters.
full_text = article.text
text_700 = full_text[:700]

# Show the excerpt, then its length set off by blank lines.
print(text_700, f"\nTotal characters loaded: {len(text_700)}\n", sep="\n")


Only one person on board Air India Flight 171 survived — British national Viswashkumar Ramesh, who could be seen limping past a crowd of shocked rescuers toward an ambulance shortly after the crash killed the other 241 passengers and crew members, as well as dozens of people on the ground in Ahmedabad.

Ramesh, 40, has been described as the “miracle in seat 11A” in British media, and several top Indian officials — including Prime Minister Narendra Modi — have visited him in the hospital.

“I don’t know how I survived,” Ramesh said in an interview from his hospital bed with broadcaster Doordarshan on Friday, with one arm heavily bandaged and a bloodied cut under his eye.

“I was on the side o

Total characters loaded: 700



**2. Split the Text into Sentences**  
Split the text into sentences using NLTK’s sentence tokenizer.

In [10]:
# Request both tokenizer resource names; quiet=True keeps download logs out of the output.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# Split the 700-character excerpt into sentences.
sentences = nltk.sent_tokenize(text_700)

# One numbered line per sentence, followed by the total count.
for position, sentence in enumerate(sentences):
    print(f"[{position}] {sentence}")
print(f"\nTotal sentences: {len(sentences)}\n")

[0] Only one person on board Air India Flight 171 survived — British national Viswashkumar Ramesh, who could be seen limping past a crowd of shocked rescuers toward an ambulance shortly after the crash killed the other 241 passengers and crew members, as well as dozens of people on the ground in Ahmedabad.
[1] Ramesh, 40, has been described as the “miracle in seat 11A” in British media, and several top Indian officials — including Prime Minister Narendra Modi — have visited him in the hospital.
[2] “I don’t know how I survived,” Ramesh said in an interview from his hospital bed with broadcaster Doordarshan on Friday, with one arm heavily bandaged and a bloodied cut under his eye.
[3] “I was on the side o

Total sentences: 4



**3. Load a Pre-trained Embedding Model**  
Load a pre-trained sentence embedding model (such as `all-MiniLM-L6-v2` from
the sentence-transformers library, or any other model of your choice).
Then use TF-IDF to vectorize the first ten sentences.

In [12]:
# Sentence-embedding model; the encoding cell calls `model.encode` on it.
model = SentenceTransformer("all-MiniLM-L6-v2")
print("Model loaded: all-MiniLM-L6-v2\n")

# TF-IDF over at most the first ten sentences (the slice is shorter when fewer exist).
first_10 = sentences[:10]
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(first_10)

print(
    f"TF-IDF matrix shape : {tfidf_matrix.shape}",
    "  (rows = sentences, cols = unique tokens)\n",
    sep="\n",
)

# Rank the vocabulary by mean TF-IDF weight across the sentences
# and show the five highest-weighted terms, heaviest first.
terms = tfidf.get_feature_names_out()
avg_weights = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
top_indices = np.argsort(avg_weights)[::-1][:5]
print("Top-5 TF-IDF tokens:", list(terms[top_indices]), "\n")

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Model loaded: all-MiniLM-L6-v2

TF-IDF matrix shape : (4, 87)
  (rows = sentences, cols = unique tokens)

Top-5 TF-IDF tokens: ['the', 'on', 'side', 'was', 'in'] 



**4. Embed Each Sentence**  
Generate an embedding for each sentence.
Print the shape of the embedding for the first sentence.

In [13]:
# Encode all sentences in one call; the result is indexed per sentence
# (the recorded run shows a (4, 384) matrix for the four sentences).
embeddings = model.encode(sentences)

# Report the full matrix shape, the shape of the first row,
# and the first 80 characters of the sentence that row belongs to.
print(
    f"Embedding matrix shape     : {embeddings.shape}",
    f"Shape of first sentence emb: {embeddings[0].shape}",
    f'First sentence preview     : "{sentences[0][:80]}..."\n',
    sep="\n",
)

Embedding matrix shape     : (4, 384)
Shape of first sentence emb: (384,)
First sentence preview     : "Only one person on board Air India Flight 171 survived — British national Viswas..."



**5. Compute Similarity Between Sentences**  
Compute the cosine similarity between the embeddings of the first and second
sentences.
Print the similarity score.

In [14]:
# cosine_similarity is given 2-D inputs, so promote each sentence vector
# to a single-row matrix before comparing.
emb0 = np.atleast_2d(embeddings[0])
emb1 = np.atleast_2d(embeddings[1])

# Two single-row inputs give a 1x1 result; take its only entry.
similarity_score = cosine_similarity(emb0, emb1)[0, 0]

# Echo the two sentences being compared, then the score and how to read it.
for idx in (0, 1):
    print(f"Sentence {idx} : {sentences[idx]}")
print(f"\nCosine Similarity Score : {similarity_score:.4f}")
print("(1.0 = identical meaning, 0.0 = completely unrelated, -1.0 = opposite meaning)\n")

Sentence 0 : Only one person on board Air India Flight 171 survived — British national Viswashkumar Ramesh, who could be seen limping past a crowd of shocked rescuers toward an ambulance shortly after the crash killed the other 241 passengers and crew members, as well as dozens of people on the ground in Ahmedabad.
Sentence 1 : Ramesh, 40, has been described as the “miracle in seat 11A” in British media, and several top Indian officials — including Prime Minister Narendra Modi — have visited him in the hospital.

Cosine Similarity Score : 0.4220
(1.0 = identical meaning, 0.0 = completely unrelated, -1.0 = opposite meaning)

