In [9]:
import torch
import numpy as np
import pandas as pd

from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity

import nltk
# Download the 'punkt' and 'punkt_tab' NLTK packages (tokenizer data); the
# notebook later splits messages into sentences with nltk's sent_tokenize.
nltk.download('punkt')
nltk.download('punkt_tab')

# Seed torch's global random number generator. As the last expression in the
# cell, the returned Generator object is also echoed as the cell output.
torch.manual_seed(42)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


<torch._C.Generator at 0x269d75b2650>

In [14]:
from nltk.tokenize import sent_tokenize


In [10]:
# Single checkpoint name shared by the tokenizer and the encoder below.
MODEL_NAME = "distilbert-base-uncased"

# Both objects are module-level globals that encode_sentence reads directly.
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
model = DistilBertModel.from_pretrained(MODEL_NAME)

# Put the model in evaluation mode; being the last expression of the cell,
# this call also makes the notebook print the module structure.
model.eval()  # inference only


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [11]:
def encode_sentence(sentence):
    """Embed text with the module-level DistilBERT model.

    The text is tokenized with the module-level ``tokenizer`` (truncated to
    at most 128 tokens), run through ``model`` without gradient tracking,
    and the hidden state at position 0 (the [CLS] token) is returned.

    Args:
        sentence: Text handed straight to ``tokenizer``.

    Returns:
        A 2-D numpy array: one row per input text, one column per hidden
        dimension of the model.
    """
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding=True
    )

    # Keep the inputs on the same device as the model so the function keeps
    # working if the model is moved off the CPU.
    inputs = inputs.to(model.device)

    with torch.no_grad():
        outputs = model(**inputs)

    # CLS token embedding; .cpu() is a no-op on CPU tensors and is required
    # before .numpy() for tensors that live on an accelerator.
    cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    return cls_embedding


In [12]:
def is_valid_message(text):
    """Return True when ``text`` passes the basic message filter.

    A value is rejected when it is not a ``str``, when it has fewer than
    10 characters after stripping surrounding whitespace, or when
    ``str.isupper()`` is true for it.
    """
    if not isinstance(text, str):
        return False

    long_enough = len(text.strip()) >= 10
    return long_enough and not text.isupper()


In [29]:
import re

def is_meaningful_text(text):
    """Heuristic filter that rejects noise before summarization.

    Non-string values are rejected outright. A string is accepted only if,
    after stripping surrounding whitespace, it satisfies every check:

    * at least 15 characters long,
    * contains at least one ASCII vowel (upper or lower case),
    * uses at least 5 distinct characters,
    * splits into at least 3 whitespace-separated tokens.
    """
    if not isinstance(text, str):
        return False

    cleaned = text.strip()
    checks = (
        len(cleaned) >= 15,                              # not too short
        re.search(r"[aeiouAEIOU]", cleaned) is not None,  # has a vowel
        len(set(cleaned)) >= 5,                          # not one char repeated
        len(cleaned.split()) >= 3,                       # more than a word or two
    )
    return all(checks)


In [30]:
def safe_sentence_split(text):
    """Split ``text`` into sentences, with a rule-based fallback.

    ``sent_tokenize`` is tried first. If it raises an ordinary exception,
    the text is split on "." instead and only the stripped pieces longer
    than 5 characters are kept.

    Only ``Exception`` subclasses trigger the fallback, so
    ``KeyboardInterrupt`` and ``SystemExit`` still propagate and a long
    batch run can be interrupted.
    """
    try:
        return sent_tokenize(text)
    except Exception:
        # fallback: simple rule-based split
        pieces = (piece.strip() for piece in text.split("."))
        return [piece for piece in pieces if len(piece) > 5]
    
    
def generate_summary(message, max_sentences=2):
    """Build an extractive summary from the most central sentences.

    Each sentence is embedded with ``encode_sentence``; the sentences whose
    embeddings have the highest cosine similarity to the mean embedding
    (the centroid) are kept.

    Args:
        message: Text to summarize.
        max_sentences: Maximum number of sentences in the summary.

    Returns:
        ``None`` when ``is_meaningful_text`` rejects the message; the
        stripped message itself when it has no more than ``max_sentences``
        sentences; otherwise the selected sentences joined by single
        spaces, in the order they appear in the message.
    """
    if not is_meaningful_text(message):
        return None   # or "Invalid / noise message"

    sentences = safe_sentence_split(message)

    if len(sentences) <= max_sentences:
        return message.strip()

    embeddings = np.vstack([encode_sentence(s) for s in sentences])
    centroid = embeddings.mean(axis=0, keepdims=True)

    scores = cosine_similarity(embeddings, centroid).flatten()
    top_ids = np.argsort(scores)[::-1][:max_sentences]

    # Ranking decides WHICH sentences are kept; sorting the indices restores
    # document order so the summary does not read out of sequence.
    return " ".join(sentences[i] for i in sorted(top_ids))


# def generate_summary(message, max_sentences=2):
#     if not is_valid_message(message):
#         return "Message not suitable for summarization."
    
#     sentences = safe_sentence_split(message)
    
#     if len(sentences) <= max_sentences:
#         return message.strip()
    
#     sentence_embeddings = np.vstack(
#         [encode_sentence(s) for s in sentences]
#     )
    
#     document_embedding = np.mean(sentence_embeddings, axis=0, keepdims=True)
    
#     similarities = cosine_similarity(sentence_embeddings, document_embedding)
    
#     ranked_indices = np.argsort(similarities.flatten())[::-1]
    
#     selected_sentences = [
#         sentences[i] for i in ranked_indices[:max_sentences]
#     ]
    
#     return ". ".join(selected_sentences)


In [32]:
# Smoke-test inputs for generate_summary: a three-sentence message, a longer
# multi-sentence report, and a gibberish string.
test_messages = [
    "People are trapped on rooftops. Water level is rising fast. Need immediate rescue.",
 """
Heavy rainfall has been continuing for the past 36 hours in the low-lying areas near the river.
Several houses are completely submerged and many families have moved to rooftops to save themselves.
Water levels are still rising rapidly and strong currents are making rescue operations difficult.
Electricity has been cut off in most areas and mobile networks are unstable.
There is an urgent need for boats, food packets, clean drinking water, and medical assistance, especially for elderly people and children.
Some people are injured and unable to move, while others are trapped inside partially collapsed houses.
Local authorities have been informed, but immediate external help is required to prevent loss of life.
""",
"gyuvuyjujvujv. jbjyuy"
]

# Print each input next to its summary (default max_sentences), separated by
# an 80-character rule.
for msg in test_messages:
    print("MESSAGE:", msg)
    print("SUMMARY:", generate_summary(msg))
    print("-" * 80)


MESSAGE: People are trapped on rooftops. Water level is rising fast. Need immediate rescue.
SUMMARY: People are trapped on rooftops. Need immediate rescue.
--------------------------------------------------------------------------------
MESSAGE: 
Heavy rainfall has been continuing for the past 36 hours in the low-lying areas near the river.
Several houses are completely submerged and many families have moved to rooftops to save themselves.
Water levels are still rising rapidly and strong currents are making rescue operations difficult.
Electricity has been cut off in most areas and mobile networks are unstable.
There is an urgent need for boats, food packets, clean drinking water, and medical assistance, especially for elderly people and children.
Some people are injured and unable to move, while others are trapped inside partially collapsed houses.
Local authorities have been informed, but immediate external help is required to prevent loss of life.

SUMMARY: Water levels are still ri

In [19]:
import pandas as pd

# Base path: directory prefix for the raw CSV files; it is relative, so it is
# resolved against the notebook's working directory.
BASE_PATH = "../data/raw/"

# Load datasets: training, test and validation CSVs go into df, dft and dfv
# respectively. File paths are built by plain string concatenation, which is
# why BASE_PATH ends with a slash.
df  = pd.read_csv(BASE_PATH + "disaster_response_messages_training.csv")
dft = pd.read_csv(BASE_PATH + "disaster_response_messages_test.csv")
dfv = pd.read_csv(BASE_PATH + "disaster_response_messages_validation.csv")


  df  = pd.read_csv(BASE_PATH + "disaster_response_messages_training.csv")


In [20]:
# Work on a copy so the column assignment below targets an independent frame.
dft = dft.copy()
# Fill missing messages BEFORE casting to str: casting first can turn NaN
# into the literal text "nan", which leaves nothing for fillna to replace.
dft["message"] = dft["message"].fillna("").astype(str)


In [21]:
# Summarize every test message with the default max_sentences; rows whose
# text is rejected by is_meaningful_text get None in the new column.
# NOTE(review): sentences are encoded one at a time through the model, so
# this cell may be slow on the full frame — consider timing or caching it.
dft["summary"] = dft["message"].apply(generate_summary)


In [22]:
# Spot-check ten message/summary pairs drawn with a fixed random_state.
sample = dft[["message", "summary"]].sample(10, random_state=42)

for message, summary in zip(sample["message"], sample["summary"]):
    print("MESSAGE:")
    print(message)
    print("\nSUMMARY:")
    print(summary)
    print("=" * 100)


MESSAGE:
Hello, what can be done to find a good answer from 4636

SUMMARY:
Hello, what can be done to find a good answer from 4636
MESSAGE:
I am Pestel, there is no food, there is nothing. We are at Dipi's house, please, we are waiting for you 

SUMMARY:
I am Pestel, there is no food, there is nothing. We are at Dipi's house, please, we are waiting for you
MESSAGE:
how some one can send his colum vital for jobpam.com if he gets it on his jump? 

SUMMARY:
how some one can send his colum vital for jobpam.com if he gets it on his jump?
MESSAGE:
- Additional accommodation for visiting staff would be secured through hiring rooms at hotels in Mansehra; more staff than capacity is presently accommodated at the office premises in Mansehra.

SUMMARY:
- Additional accommodation for visiting staff would be secured through hiring rooms at hotels in Mansehra; more staff than capacity is presently accommodated at the office premises in Mansehra.
MESSAGE:
Schools and kindergartens were closed Tuesday

In [23]:
# Test only messages with multiple sentences

# Proxy for "multi-sentence": the text contains at least two literal periods.
has_multiple_periods = dft["message"].str.count(r"\.") >= 2
long_msgs = dft[has_multiple_periods].sample(5, random_state=42)

for original, summary in zip(long_msgs["message"], long_msgs["summary"]):
    print("ORIGINAL:")
    print(original)
    print("\nSUMMARY:")
    print(summary)
    print("=" * 100)


ORIGINAL:
Mined roads and footpaths impede repatriation of refugees and returnees, and mined farmland precludes agricultural production. On 3 August, rioters attacked Alang Asaude in western Seram. Besides Marburg and Ebola, other zoonotic diseases that have struck around the world are swine flu, H1N1 (bird flu), bubonic plague and rabies. It will also finance a 60 km natural gas pipeline that will improve the reliability of gas supply to the plant, and an 11 km electricity transmission line so that power from the plant can be distributed to consumers. I am equally concerned about violations by anti-government forces, including murder, extrajudicial execution and torture as well as the recently increased use of improvised explosive devices. Emphasizes the importance of regional cooperation and coordination in disaster reduction, including enhanced institutional arrangements, technical cooperation based on most effective technical equipment and capacity building to effectively address t

In [24]:
def generate_summary_debug(message, max_sentences=2):
    """Summarize ``message`` while printing the per-sentence scores.

    Uses centroid-based ranking: each sentence is embedded with
    ``encode_sentence`` and scored by cosine similarity to the mean
    embedding. No ``is_meaningful_text`` filtering is applied here, so
    every input is split and scored.

    Args:
        message: Text to summarize.
        max_sentences: Maximum number of sentences in the summary.

    Returns:
        The stripped message when it has no more than ``max_sentences``
        sentences; otherwise the top-scoring sentences joined by single
        spaces, in the order they appear in the message.
    """
    sentences = safe_sentence_split(message)

    print(f"Total sentences detected: {len(sentences)}")

    if len(sentences) <= max_sentences:
        print("→ Returning original (too short to summarize)")
        return message.strip()

    embeddings = np.vstack([encode_sentence(s) for s in sentences])
    centroid = embeddings.mean(axis=0, keepdims=True)
    sims = cosine_similarity(embeddings, centroid).flatten()

    for i, (s, sc) in enumerate(zip(sentences, sims)):
        print(f"[{i}] score={sc:.4f} | {s}")

    top_ids = np.argsort(sims)[::-1][:max_sentences]
    # Ranking decides WHICH sentences are kept; sorting the indices restores
    # document order so the summary does not read out of sequence.
    return " ".join(sentences[i] for i in sorted(top_ids))
