In [None]:
!pip install keybert

Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvi

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from keybert import KeyBERT
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import re
from collections import Counter

In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime, which
# provides the google.colab package).
from google.colab import drive
drive.mount('/content/drive')
# List the top level of MyDrive as a quick check that the mount worked.
!ls /content/drive/MyDrive

In [None]:
def clean_text(text):
    """Normalise free text to lowercase a-z words separated by single spaces.

    Parameters
    ----------
    text : object
        Value to clean. Anything that is not a ``str`` (for example NaN or
        None) is treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``''`` for non-string input.

    Notes
    -----
    Only the letters a-z survive: digits, punctuation and non-ASCII letters
    are all dropped.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Apostrophes (straight and curly) are deleted so that possessives and
    # contractions stay a single token ("city's" -> "citys").
    text = re.sub("['\u2019]", '', text)
    # Every other non-letter becomes a space instead of vanishing; deleting
    # it would fuse words that are separated only by punctuation
    # ("democracy.References" -> "democracyreferences").
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Collapse runs of whitespace and trim both ends.
    return ' '.join(text.split())


In [None]:
def extract_keywords_for_row(row, kw_model, num_keywords=5):
    """Extract keywords for a single record.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must provide 'title', 'description' and 'body' entries; a missing
        key raises ``KeyError``.
    kw_model : object
        Model exposing an ``extract_keywords`` method with KeyBERT's keyword
        arguments.
    num_keywords : int, default 5
        Number of keywords requested from the model (passed as ``top_n``).

    Returns
    -------
    tuple of (list, list)
        Keywords and their scores, in the order the model returned them.
        Both lists are empty when the row has no usable text or the model
        returns nothing.
    """
    # Only real strings contribute text. Formatting a missing value (NaN or
    # None) into one combined string would inject the literal words
    # "nan"/"none" into the document and stop the empty-text guard below from
    # ever firing. Other non-string values (numbers) would have been stripped
    # by clean_text anyway.
    fields = [row[name] for name in ('title', 'description', 'body')]
    combined_text = ' '.join(value for value in fields if isinstance(value, str))
    cleaned_text = clean_text(combined_text)

    if not cleaned_text:
        return [], []

    # Extract keywords: unigrams and bigrams, English stop words removed,
    # max-sum selection over the 20 best candidates.
    keywords = kw_model.extract_keywords(
        cleaned_text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        use_maxsum=True,
        nr_candidates=20,
        top_n=num_keywords
    )

    # Split the (keyword, score) pairs into two parallel lists.
    if keywords:
        kw, scores = zip(*keywords)
        return list(kw), list(scores)
    return [], []

In [None]:
def process_dataset(file_path, num_keywords=5,
                    output_file='participedia_with_keywords.csv',
                    model_name='all-MiniLM-L6-v2'):
    """Extract keywords for every row of a CSV file and add keyword columns.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to read.
    num_keywords : int, default 5
        Keywords requested per row; also the number of ``keyword_<k>`` /
        ``keyword_<k>_score`` column pairs that are created.
    output_file : str, path-like or None, default 'participedia_with_keywords.csv'
        Where the augmented table is written as CSV (without the index).
        Pass ``None`` to skip writing.
    model_name : str, default 'all-MiniLM-L6-v2'
        Embedding model handed to ``KeyBERT``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with these columns added: 'keywords',
        'keyword_scores', and for k in 1..num_keywords 'keyword_<k>' and
        'keyword_<k>_score'.
    """
    print("Loading dataset...")
    df = pd.read_csv(file_path)

    print("Initializing KeyBERT model...")
    kw_model = KeyBERT(model=model_name)

    # One entry per row, kept in row order.
    all_keywords = []
    all_scores = []

    print("Extracting keywords for each row...")
    for _, row in tqdm(df.iterrows(), total=len(df)):
        keywords, scores = extract_keywords_for_row(row, kw_model, num_keywords)
        all_keywords.append(keywords)
        all_scores.append(scores)

    df['keywords'] = all_keywords
    df['keyword_scores'] = all_scores

    # Flatten into one column per rank for easier filtering. Rows that
    # produced fewer keywords are padded with '' and 0.0.
    for i in range(num_keywords):
        df[f'keyword_{i+1}'] = [kws[i] if i < len(kws) else '' for kws in all_keywords]
        df[f'keyword_{i+1}_score'] = [vals[i] if i < len(vals) else 0.0 for vals in all_scores]

    if output_file is not None:
        df.to_csv(output_file, index=False)
        print(f"\nResults saved to {output_file}")

    return df

In [None]:
def display_sample_results(df, num_samples=5):
    """Print the title, keywords and scores of the first few documents.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'title', 'keywords' and 'keyword_scores'.
    num_samples : int, default 5
        Maximum number of leading rows to print.

    Returns
    -------
    None
        Output goes to stdout. Titles are cut to 100 characters and are
        always followed by '...'.
    """
    print("\n=== Sample Results ===")
    for i in range(min(num_samples, len(df))):
        title = df['title'].iloc[i]
        # A missing title is a NaN float in pandas; slicing it would raise
        # TypeError, so show it as empty text instead.
        title_text = '' if pd.isna(title) else str(title)

        print(f"\nDocument {i+1}:")
        print(f"Title: {title_text[:100]}...")
        print("Keywords:", df['keywords'].iloc[i])
        print("Scores:", [f"{score:.3f}" for score in df['keyword_scores'].iloc[i]])
        print("-" * 80)


In [None]:
def main():
    """Run keyword extraction on the Participedia CSV and report the outcome."""
    # Load the data, extract keywords and write the augmented CSV.
    results = process_dataset('/content/drive/MyDrive/Case_Participedia.csv')

    # Show the first few documents with their keywords.
    display_sample_results(results)

    # List every column whose name mentions "keyword".
    print("\nNew columns added to the dataset:")
    matching = list(filter(lambda name: 'keyword' in name.lower(), results.columns))
    for name in matching:
        print(f"- {name}")


if __name__ == "__main__":
    main()

Loading dataset...
Initializing KeyBERT model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Extracting keywords for each row...


100%|██████████| 2272/2272 [20:15<00:00,  1.87it/s]



Results saved to participedia_with_keywords.csv

=== Sample Results ===

Document 1:
Title: British Columbia Citizens' Assembly on Electoral Reform...
Keywords: ['electoral alternatives', 'columbia voters', 'elected provincial', 'analyzing electoral', 'columbia referendum']
Scores: ['0.536', '0.538', '0.556', '0.561', '0.621']
--------------------------------------------------------------------------------

Document 2:
Title: Minneapolis Neighborhood Revitalization Program...
Keywords: ['legislature minneapolis', 'implementation downtown', 'neighborhood nrps', 'improve citys', 'residents revitalize']
Scores: ['0.514', '0.525', '0.527', '0.537', '0.547']
--------------------------------------------------------------------------------

Document 3:
Title: Wenling City Deliberative Poll...
Keywords: ['democracyreferences', 'municipalities china', 'debate deliberation', 'chinese deliberative', 'polling project']
Scores: ['0.480', '0.481', '0.481', '0.486', '0.488']
------------------------

Embeddings:
if two documents have similar embeddings, they most likely share the same topic.

Given this query, what filter should be applied to the data?

reg system: embed the query.


go through natural language to