In [1]:
import numpy as np
import pandas as pd
from sentence_transformers import util


In [2]:
# Read the question table (the cells below use its Score, cluster_id and
# Processed_Text columns) from the processed-data CSV.
df = pd.read_csv(
    "D:/Projects/nlp_qa_platform/data/processed/final_dataset_with_clusters.csv",
    low_memory=False,
    encoding="latin1",
)

# Read the embedding matrix; later cells index it by row position, so it is
# expected to hold one row per row of `df`.
embeddings = np.load("D:/Projects/nlp_qa_platform/data/embeddings/question_embeddings.npy")

# Cell output: both shapes, to eyeball that the row counts agree.
(df.shape, embeddings.shape)


((25000, 7), (25000, 384))

In [3]:
def normalize(series):
    """Min-max scale ``series`` onto the closed interval [0, 1].

    The smallest value maps to exactly 0.0 and the largest to exactly 1.0.
    NaN entries are ignored when computing the minimum and maximum (pandas
    default) and stay NaN in the result.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        Float series with the same index as ``series``. When the value range
        is zero or undefined (constant, empty, or all-NaN input) every
        non-missing entry is 0.0 instead of dividing by zero.
    """
    lowest = series.min()
    span = series.max() - lowest
    # `not span > 0` is also true when span is NaN (empty / all-NaN input).
    if not span > 0:
        # Multiplying by 0.0 keeps the index, yields float zeros and leaves
        # NaN entries as NaN.
        return (series - lowest) * 0.0
    return (series - lowest) / span

# Community-vote signal: a missing Score is treated as 0 before the column is
# rescaled by normalize().
df["score_norm"] = df["Score"].fillna(0).pipe(normalize)


In [4]:
# Centroid (mean embedding vector) of every cluster, keyed by cluster id.
#
# Rows are picked with a boolean mask converted to NumPy, so the selection is
# positional: row k of `df` pairs with row k of `embeddings` whatever labels
# `df.index` carries. Index labels must not be used here, because they are only
# valid positions while `df` has a default 0..n-1 index. A mask whose length
# differs from `embeddings` raises an IndexError instead of silently
# mis-pairing rows.
#
# Missing cluster ids are skipped: `== NaN` matches no row, so they would only
# produce an empty-slice mean (all-NaN centroid plus a RuntimeWarning).
cluster_centroids = {
    cid: embeddings[(df["cluster_id"] == cid).to_numpy()].mean(axis=0)
    for cid in df["cluster_id"].dropna().unique()
}

def semantic_quality(i):
    """Cosine similarity between row ``i``'s embedding and its cluster centroid.

    Parameters
    ----------
    i : int
        Row *position*. The same ``i`` selects the row of ``embeddings`` and
        the row of ``df``, so the cluster id is read positionally (``.iloc``)
        to stay aligned with ``embeddings[i]`` even if ``df`` does not have a
        default 0..n-1 index.

    Returns
    -------
    The result of ``util.cos_sim`` for the two vectors, unwrapped to a plain
    Python number with ``.item()``.

    Raises
    ------
    KeyError
        If the row's cluster id has no entry in ``cluster_centroids``.
    """
    cid = df["cluster_id"].iloc[i]
    centroid = cluster_centroids[cid]
    return util.cos_sim(embeddings[i], centroid).item()

# Score every row by position, store the raw similarities, then rescale the
# column with normalize().
df["semantic_quality"] = list(map(semantic_quality, range(len(df))))
df["semantic_quality"] = df["semantic_quality"].pipe(normalize)


In [5]:
# If Auto_Tags were saved earlier for the full df, use them instead.
# Otherwise fall back to a proxy: the whitespace-token count of Processed_Text,
# scaled so that 100 tokens or more counts as fully "rich" (1.0).

def tag_richness(text, max_tokens=100):
    """Length-based proxy for tag richness, in the range [0.0, 1.0].

    Parameters
    ----------
    text : str
        Text to measure; it is split on whitespace and the tokens are counted.
        Any non-string value (``None``, or the float NaN that pandas produces
        for an empty CSV field) is treated as having no tokens.
    max_tokens : int, optional
        Token count at which the score saturates at 1.0. Must be positive.
        Defaults to 100.

    Returns
    -------
    float
        ``token_count / max_tokens`` capped at 1.0; 0.0 for empty or
        non-string input.
    """
    if not isinstance(text, str):
        # A missing value has no .split(); score it as "no content".
        return 0.0
    return min(len(text.split()) / max_tokens, 1.0)

# Element-wise richness score for each row's Processed_Text.
df["tag_richness"] = df["Processed_Text"].map(tag_richness)


In [6]:
# Blend weights for the final ranking (tunable).
W_SEM = 0.45   # weight on semantic_quality
W_TAG = 0.20   # weight on tag_richness
W_VOTE = 0.35  # weight on score_norm (community signal)

# Weighted sum of the three signal columns, accumulated in the order
# semantic -> tag -> vote.
df["final_rank_score"] = (
    df["semantic_quality"].mul(W_SEM)
    .add(df["tag_richness"].mul(W_TAG))
    .add(df["score_norm"].mul(W_VOTE))
)


In [7]:
# Ten highest-ranked rows, best first.
top = df.sort_values(by="final_rank_score", ascending=False).iloc[:10]

# Cell output: the final score next to the three signals it was built from,
# plus the text itself.
top.loc[:, [
    "final_rank_score",
    "semantic_quality",
    "tag_richness",
    "score_norm",
    "Processed_Text",
]]


Unnamed: 0,final_rank_score,semantic_quality,tag_richness,score_norm,Processed_Text
5959,0.674494,0.480142,1.0,0.738372,efficient elegant way parse flat table tree as...
10320,0.65814,1.0,1.0,0.023256,write form datum jquery modal popup box databa...
18235,0.647904,0.978762,1.0,0.021318,insert form datum pdo work multipage form like...
6633,0.642624,0.967027,1.0,0.021318,activity listview element know set activity la...
22156,0.642619,0.965509,1.0,0.023256,update database databound controls currently c...
14648,0.638979,0.958928,1.0,0.021318,add parameter post datum submit possible add p...
21095,0.636811,0.952603,1.0,0.023256,allow set table join spring data jpa specifica...
17142,0.635779,0.953326,1.0,0.01938,item detail learn android development problem ...
13039,0.634886,0.949832,1.0,0.021318,android addview linearlayout visible create si...
3601,0.634475,0.948919,1.0,0.021318,store selectedrow value session commandfield s...


In [8]:
# Persist the table, including the score columns added above, without writing
# the row index as an extra column.
# NOTE(review): the input was read with encoding="latin1" but this write uses
# the to_csv default encoding; confirm downstream readers expect that.
df.to_csv(
    path_or_buf="D:/Projects/nlp_qa_platform/data/processed/final_dataset_ranked.csv",
    index=False,
)
