In [1]:
pip install keybert


Collecting keybert
  Using cached keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Collecting rich>=10.4.0 (from keybert)
  Using cached rich-14.2.0-py3-none-any.whl.metadata (18 kB)
Collecting markdown-it-py>=2.2.0 (from rich>=10.4.0->keybert)
  Using cached markdown_it_py-4.0.0-py3-none-any.whl.metadata (7.3 kB)
Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich>=10.4.0->keybert)
  Using cached mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB)
Using cached keybert-0.9.0-py3-none-any.whl (41 kB)
Using cached rich-14.2.0-py3-none-any.whl (243 kB)
Using cached markdown_it_py-4.0.0-py3-none-any.whl (87 kB)
Using cached mdurl-0.1.2-py3-none-any.whl (10.0 kB)
Installing collected packages: mdurl, markdown-it-py, rich, keybert
Successfully installed keybert-0.9.0 markdown-it-py-4.0.0 mdurl-0.1.2 rich-14.2.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer, util


In [3]:
# Load the processed question dataset (including cluster ids) into `df`.
# NOTE(review): the path is an absolute, machine-specific Windows location, so
# this cell only runs as-is where that file exists — consider a configurable
# data directory.
df = pd.read_csv(
    "D:/Projects/nlp_qa_platform/data/processed/final_dataset_with_clusters.csv",
    encoding="latin1",
    low_memory=False
)

# Preview the first rows as the cell output.
df.head()


Unnamed: 0,Id,Full_Text,Tags_List,CreationDate,Score,Processed_Text,cluster_id
0,16700310,"compare two excel files in a ssis ""Foreach Loo...",['sql-server' 'sql-server-2008' 'tsql' 'ssis'],2013-05-22T19:41:53Z,0,compare excel file ssis foreach loop container...,1
1,11504380,"Black screen appears, randomly after taking an...",['iphone' 'ios' 'uiimage' 'uiimagepickercontro...,2012-07-16T12:31:16Z,4,black screen appear randomly take image tap us...,4
2,27754690,Java Include file to jar file I'm using Intell...,['java' 'intellij-idea' 'jar'],2015-01-03T12:08:56Z,0,java include file jar file intellij idea devel...,5
3,3761040,Adding a summary column to a Reporting Service...,['reporting-services'],2010-09-21T13:58:04Z,0,add summary column reporting services matrix s...,6
4,37224170,When function with for loop is called it only ...,['python'],2016-05-14T07:55:53Z,-2,function loop call return value dict understan...,12


In [4]:
# Load the sentence-embedding model once and hand the same instance to KeyBERT,
# so keyword extraction and the direct `sbert_model.encode` calls elsewhere in
# the notebook share one embedding model.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(model=sbert_model)


In [7]:
# Peek at the raw tag column to see how the tags are serialised as text.
df["Tags_List"].dropna().head(5)


0       ['sql-server' 'sql-server-2008' 'tsql' 'ssis']
1    ['iphone' 'ios' 'uiimage' 'uiimagepickercontro...
2                       ['java' 'intellij-idea' 'jar']
3                               ['reporting-services']
4                                           ['python']
Name: Tags_List, dtype: object

In [8]:
import re
import pandas as pd

def parse_tags(tag_str):
    """Turn a serialised tag list into a list of lowercase tag strings.

    The dataset stores tags as text such as ``['sql-server' 'tsql' 'ssis']``;
    every single-quoted item is extracted and lowercased.

    Parameters
    ----------
    tag_str : str, list-like, or missing value
        Raw cell value. A list-like (list, tuple, array, ...) is treated as
        already parsed and is only normalised, which makes the function safe
        to apply twice. ``None``/``NaN`` yields an empty list.

    Returns
    -------
    list of str
        Lowercased tags in their original order (empty if none were found).
    """
    # Already-parsed input: pd.isna() would return an array here and make the
    # `if` below raise "truth value of an array is ambiguous".
    if pd.api.types.is_list_like(tag_str):
        return [str(tag).lower() for tag in tag_str]

    if pd.isna(tag_str):
        return []

    # Extract words inside single quotes
    return [tag.lower() for tag in re.findall(r"'([^']+)'", tag_str)]


In [9]:
# Parse every non-missing tag string into a list of tags.
all_tags = df["Tags_List"].dropna().apply(parse_tags)

# Distinct tags across all questions, sorted so the vocabulary order is stable.
tag_vocab = sorted({tag for row_tags in all_tags for tag in row_tags})

# Vocabulary size.
len(tag_vocab)


9549

In [10]:
# Spot-check the parsed tags produced by parse_tags.
all_tags.head(5)


0            [sql-server, sql-server-2008, tsql, ssis]
1    [iphone, ios, uiimage, uiimagepickercontroller...
2                           [java, intellij-idea, jar]
3                                 [reporting-services]
4                                             [python]
Name: Tags_List, dtype: object

In [11]:
# Embed the whole tag vocabulary once up front; generate_auto_tags scores
# keyword embeddings against this result and maps the winning positions back
# into tag_vocab, so it must be recomputed whenever tag_vocab changes.
tag_embeddings = sbert_model.encode(
    tag_vocab,
    show_progress_bar=True
)


Batches:   0%|          | 0/299 [00:00<?, ?it/s]

In [12]:
def generate_auto_tags(text, top_k_keywords=5, top_k_tags=3):
    """Suggest tags for a question by matching its keywords to known tags.

    Keywords are extracted with KeyBERT, embedded, and compared by cosine
    similarity against the embeddings of every tag in the vocabulary. Each tag
    is scored by its best-matching keyword and the highest-scoring tags win.

    Relies on notebook-level state: ``kw_model``, ``sbert_model``,
    ``tag_vocab`` and ``tag_embeddings`` (the embeddings of ``tag_vocab``).

    Parameters
    ----------
    text : str
        Question text to tag. Non-string values (e.g. NaN/None) and blank
        strings yield an empty list instead of raising.
    top_k_keywords : int, default 5
        Number of keywords requested from KeyBERT.
    top_k_tags : int, default 3
        Maximum number of tags to return.

    Returns
    -------
    list of str
        Up to ``top_k_tags`` entries of ``tag_vocab``, best match first; empty
        when there is nothing to match.
    """
    # Missing or blank text cannot be tagged. Bail out before touching the
    # models so applying this over a column with gaps does not crash.
    if not isinstance(text, str) or not text.strip() or top_k_tags <= 0:
        return []

    # Step 1: Extract keywords
    keywords = kw_model.extract_keywords(
        text,
        top_n=top_k_keywords,
        stop_words='english'
    )

    keyword_texts = [kw[0] for kw in keywords]

    # No keywords means an empty embedding batch, and the similarity/max
    # computation below cannot reduce over zero rows.
    if not keyword_texts:
        return []

    # Step 2: Embed keywords
    keyword_embeddings = sbert_model.encode(keyword_texts)

    # Step 3: Match keywords to tags
    scores = util.cos_sim(keyword_embeddings, tag_embeddings)
    # Collapse the keyword axis: each tag keeps its best score over all keywords.
    best_per_tag = scores.max(axis=0).values
    # Convert to plain ints so list indexing does not depend on tensor scalars.
    best_indices = best_per_tag.argsort(descending=True)[:top_k_tags].tolist()

    return [tag_vocab[i] for i in best_indices]


In [14]:
sample = df.sample(5, random_state=42)

# Print the stored tags next to the generated ones for a few random questions.
for question, original_tags in zip(sample["Processed_Text"], sample["Tags_List"]):
    print("\nQUESTION:")
    print(question[:150])
    print("ORIGINAL TAGS:", original_tags)
    print("AUTO TAGS    :", generate_auto_tags(question))



QUESTION:
vba adodb error value specify require parameter excel vba project access perform query etc system office english client system office spanish test mac
ORIGINAL TAGS: ['sql' 'vba' 'ms-access' 'adodb']
AUTO TAGS    : ['insert', 'vba', 'excel']

QUESTION:
print call way print activerecord operation call factory foo model foo run create database entry model foo entry different model bar car etc way conve
ORIGINAL TAGS: ['ruby-on-rails' 'ruby' 'rails-activerecord' 'factory-girl']
AUTO TAGS    : ['command', 'activerecord', 'factory']

QUESTION:
rubysdl ruby sdl ffi tell difference ruby gem rubysdl ruby sdl ffi like speed variance prefer wonder sake gem write rubydraw locate thank advance
ORIGINAL TAGS: ['ruby' 'rubygems' 'gem' 'sdl' 'ffi']
AUTO TAGS    : ['ruby', 'sdl', 'gem']

QUESTION:
nested template template infer error code compile line problem know fix template function alter little example run compiler say error matching functio
ORIGINAL TAGS: ['c++' 'templates']
AUTO TAGS 

In [15]:
def tag_overlap(true_tags, predicted_tags):
    """Return the fraction of distinct true tags that were also predicted.

    Duplicates on either side are ignored. When there are no true tags the
    denominator falls back to 1, so the result is 0.0 rather than an error.
    """
    expected = set(true_tags)
    matched = expected.intersection(predicted_tags)
    denominator = len(expected) or 1
    return len(matched) / denominator

# Evaluate the tagger on a fixed random sample of 1000 questions.
df_eval = df.sample(1000, random_state=42)

df_eval["Parsed_Tags"] = df_eval["Tags_List"].apply(parse_tags)
df_eval["Auto_Tags"] = df_eval["Processed_Text"].apply(generate_auto_tags)

# Per-question share of the true tags that the tagger recovered.
df_eval["overlap"] = [
    tag_overlap(true_tags, auto_tags)
    for true_tags, auto_tags in zip(df_eval["Parsed_Tags"], df_eval["Auto_Tags"])
]

# Mean overlap across the sample.
df_eval["overlap"].mean()


np.float64(0.328)