In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Run on the GPU when PyTorch reports CUDA support, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tweets to score (replace the path with your actual data source).
df = pd.read_csv('.../final_clustered_tweets.csv')

# Pre-trained BERT tokenizer and encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the encoder, place it on the selected device and put it in
# evaluation mode (both `.to()` and `.eval()` return the module itself).
model = BertModel.from_pretrained('bert-base-uncased').to(device).eval()

# Keyword groups per urgency tier. Each inner list is one group of keywords.
grouped_keywords = dict(
    high_grouped=[
        ['stranded', 'trapped', 'injured'],
        ['emergency', 'urgent', 'SOS'],
        ['danger', 'alert', 'casualty'],
        ['dead', 'evacuate', 'death'],
        ['flooding', 'missing', 'evacuating'],
        ['disaster', 'destroyed', 'mandatory evacuation'],
        ['quake hit', 'category storm'],
        ['newborns', 'children', 'women', 'injured'],
    ],
    medium_grouped=[
        ['rescue', 'help', 'helpline'],
        ['newborns intensive care', 'care flown'],
        ['lost', 'response', 'families'],
        ['hit', 'quake', 'killed'],
        ['victim', 'toll', 'hospital'],
        ['flee texas', 'damage', 'shelter'],
        ['redcross', 'food', 'building'],
        ['hit nepal', 'pet', 'thousands flee'],
        ['biggest flood', 'help nepal'],
    ],
    low_grouped=[
        ['rain', 'reports', 'service'],
        ['donate', 'donation', 'safety'],
        ['donated', 'support', 'intensive care'],
        ['contact', 'safety', 'unicef'],
        ['million', 'contact'],
    ],
)

# Stand-alone keywords per urgency tier.
# NOTE(review): 'safety' and 'contact' are each listed twice in
# 'low_individual'. compute_urgency_score_bert scores every list entry
# separately, so a repeated keyword is counted twice -- confirm that the
# repeats are intentional.
individual_keywords = dict(
    high_individual=[
        'stranded', 'trapped', 'injure',
        'emergency', 'urgent', 'SOS',
        'danger', 'alert', 'casualty', 'injured',
        'dead', 'evacuate', 'death',
        'flooding', 'missing', 'evacuating',
        'disaster', 'destroyed', 'mandatory evacuation',
        'quake hit', 'category storm',
        'newborns', 'children', 'women',
    ],
    medium_individual=[
        'rescue', 'help', 'helpline',
        'newborns intensive care', 'care flown', 'texas officials',
        'lost', 'response', 'families',
        'hit', 'quake', 'killed',
        'victim', 'toll', 'hospital',
        'flee texas', 'damage', 'shelter',
        'redcross', 'food', 'building',
        'hit nepal', 'pet', 'thousands flee',
        'biggest flood', 'help nepal',
    ],
    low_individual=[
        'rain', 'reports', 'service',
        'donate', 'donation', 'safety',
        'donated', 'support', 'intensive care',
        'contact', 'safety', 'unicef',
        'million', 'contact',
    ],
)

# Multiplier applied to each cosine similarity, keyed by the category
# names used in the two keyword tables.
weights = dict(
    high_grouped=6,
    high_individual=5,
    medium_grouped=4,
    medium_individual=3,
    low_grouped=2,
    low_individual=1,
)


def bert_tweet_vector(tweet, tokenizer, model, device):
    """Embed a piece of text as a single row vector.

    The text is tokenized into PyTorch tensors (truncated to at most 512
    tokens), every tensor is moved to ``device``, and the model is run with
    gradient tracking disabled. The hidden state at sequence position 0
    (the [CLS] token when a BERT tokenizer is used) is taken as the embedding.

    Args:
        tweet: Text to embed.
        tokenizer: Callable that turns the text into a mapping of tensors.
        model: Callable whose output exposes ``last_hidden_state``.
        device: Torch device the input tensors are moved to.

    Returns:
        A 2-D NumPy array with a single row.
    """
    encoded = tokenizer(tweet, return_tensors="pt", max_length=512, truncation=True)

    # The inputs must live on the same device as the model.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    with torch.no_grad():
        hidden_states = model(**on_device).last_hidden_state

    first_token = hidden_states[:, 0, :]
    as_numpy = first_token.detach().cpu().numpy()
    return as_numpy.reshape(1, -1)  # always hand back a 2-D array

def compute_group_vector(group, tokenizer, model, device):
    """Return the element-wise mean of the embeddings of a keyword group.

    Each keyword in ``group`` is embedded on its own with
    ``bert_tweet_vector``; the resulting arrays are then averaged along
    the first axis.
    """
    keyword_embeddings = []
    for term in group:
        keyword_embeddings.append(bert_tweet_vector(term, tokenizer, model, device))
    return np.mean(keyword_embeddings, axis=0)

def _build_reference_vectors(tokenizer, model, grouped_keywords, individual_keywords, weights, device):
    """Embed every keyword group and every individual keyword once.

    Returns:
        A list of ``(vector, weight)`` pairs: first one pair per keyword
        group (in the iteration order of ``grouped_keywords``), then one
        pair per individual keyword (in the iteration order of
        ``individual_keywords``). ``weight`` is ``weights[category]``.
    """
    reference_vectors = []

    for category, groups in grouped_keywords.items():
        for group in groups:
            group_vec = compute_group_vector(group, tokenizer, model, device)
            reference_vectors.append((group_vec, weights[category]))

    for category, keywords in individual_keywords.items():
        for keyword in keywords:
            keyword_vec = bert_tweet_vector(keyword, tokenizer, model, device)
            reference_vectors.append((keyword_vec, weights[category]))

    return reference_vectors


def compute_urgency_score_bert(tweet, tokenizer, model, grouped_keywords, individual_keywords, weights, device, reference_vectors=None):
    """Score a tweet by its weighted similarity to the urgency keywords.

    The score is the sum, over every keyword group and every individual
    keyword, of the cosine similarity between the tweet embedding and the
    keyword (or group) embedding multiplied by the weight of its category.

    Args:
        tweet: Text to score.
        tokenizer: Tokenizer passed through to ``bert_tweet_vector``.
        model: Model passed through to ``bert_tweet_vector``.
        grouped_keywords: Mapping of category name to a list of keyword groups.
        individual_keywords: Mapping of category name to a list of keywords.
        weights: Mapping of category name to its multiplier.
        device: Torch device used for the forward passes.
        reference_vectors: Optional pre-computed ``(vector, weight)`` pairs
            as produced by ``_build_reference_vectors``. The keyword
            embeddings do not depend on the tweet, so callers scoring many
            tweets should build them once and pass them in; when omitted
            they are rebuilt on this call.

    Returns:
        The accumulated weighted similarity (``0`` when there are no keywords).
    """
    if reference_vectors is None:
        reference_vectors = _build_reference_vectors(
            tokenizer, model, grouped_keywords, individual_keywords, weights, device
        )

    tweet_vec = bert_tweet_vector(tweet, tokenizer, model, device)

    # Accumulate one term at a time, groups first, so the result is the same
    # as embedding every keyword again for each tweet.
    total_score = 0
    for reference_vec, weight in reference_vectors:
        total_score += cosine_similarity(tweet_vec, reference_vec)[0][0] * weight

    return total_score


# Embed the keyword groups and individual keywords a single time; they are
# identical for every tweet, so re-embedding them per row is wasted work.
keyword_reference_vectors = _build_reference_vectors(
    tokenizer, model, grouped_keywords, individual_keywords, weights, device
)

# Compute weighted urgency scores
df['urgency_score'] = df['tidy_tweet'].apply(
    lambda x: compute_urgency_score_bert(
        x, tokenizer, model, grouped_keywords, individual_keywords, weights, device,
        reference_vectors=keyword_reference_vectors,
    )
)

# Rank tweets based on urgency scores
ranked_tweets = df.sort_values(by='urgency_score', ascending=False)

# Display top-ranked tweets
print(ranked_tweets[['tweet_id', 'tidy_tweet', 'urgency_score']].head(10))



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
