In [19]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [20]:
# Load the labelled intent dataset; the code below reads its "text" and
# "intent" columns. `Path` is provided by the notebook's import cell.
DATA_PATH = Path("dataset.csv")
df = pd.read_csv(DATA_PATH)

# Drop rows with a missing text or label *before* casting to str. Otherwise
# astype(str) turns NaN into the literal string "nan", which would silently
# create bogus utterances or a bogus "nan" intent class.
df = df.dropna(subset=["text", "intent"])

# Shuffle rows reproducibly and rebuild a clean 0..n-1 index
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Cast to str and strip surrounding whitespace so labels compare exactly
df["text"] = df["text"].astype(str).str.strip()
df["intent"] = df["intent"].astype(str).str.strip()

# Label distribution (rendered as the cell's rich output)
label_counts = df["intent"].value_counts()
display(label_counts)


# Stratified 80/20 train/test split so every intent keeps its proportion
X = df["text"]
y = df["intent"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")


intent
escalation_policy     97
code_review_policy    96
onboarding_guide      96
deployment_process    95
team_structure        95
employees_info        87
jira_ticket_status    87
deployment_history    86
dev_env_setup         86
out_of_scope          80
Name: count, dtype: int64

Train: 724, Test: 181


In [5]:
# Character-level TF-IDF features.
#
# An earlier word-level configuration (unigrams + bigrams, English stop words,
# max_features=5000) was replaced by the character-level one below.
#
# analyzer="char_wb" builds n-grams from characters inside word boundaries, so
# ngram_range=(1, 3) means 1- to 3-character n-grams, not word trigrams.
# stop_words is deliberately not passed: scikit-learn only applies it when
# analyzer == "word", so it had no effect on this vectorizer.
vectorizer = TfidfVectorizer(
    lowercase=True,
    analyzer="char_wb",
    ngram_range=(1, 3),
    max_features=10000,
)

# Fit the vocabulary / IDF weights on training text only, then transform both
# splits with that fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [6]:
# Sentence-embedding model used to turn each utterance into a dense vector
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Embed both splits with the same settings (normalize_embeddings=True)
X_train_emb, X_test_emb = (
    embedder.encode(split.tolist(), normalize_embeddings=True)
    for split in (X_train, X_test)
)

# Integer-encode the intent labels; the encoder is fit on training labels only
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

In [7]:
# Logistic regression on the sentence embeddings (integer-encoded labels).
# `multi_class` is not passed: it is deprecated in recent scikit-learn, and
# with the lbfgs solver and more than two classes a multinomial model is
# already what gets fitted.
clf_sem = LogisticRegression(
    max_iter=1000,
    solver="lbfgs",
    random_state=42,
)
clf_sem.fit(X_train_emb, y_train_enc)



0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [10]:
# Predict encoded intents for the held-out embeddings, then report per-class
# metrics using the original intent names from the label encoder.
y_pred_sem = clf_sem.predict(X_test_emb)
sem_report = classification_report(
    y_true=y_test_enc,
    y_pred=y_pred_sem,
    target_names=le.classes_,
)
print(sem_report)

                    precision    recall  f1-score   support

code_review_policy       0.94      0.89      0.92        19
deployment_history       0.81      0.76      0.79        17
deployment_process       0.89      0.89      0.89        19
     dev_env_setup       0.94      1.00      0.97        17
    employees_info       0.67      0.67      0.67        18
 escalation_policy       0.95      1.00      0.97        19
jira_ticket_status       0.93      0.78      0.85        18
  onboarding_guide       0.86      0.95      0.90        19
      out_of_scope       0.84      1.00      0.91        16
    team_structure       0.76      0.68      0.72        19

          accuracy                           0.86       181
         macro avg       0.86      0.86      0.86       181
      weighted avg       0.86      0.86      0.86       181



# Combine TF-IDF features with embedding features

In [8]:
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
import numpy as np

# ---- Step 1: scale embeddings ----
# Each embedding dimension is divided by its standard deviation; no centering
# is done (with_mean=False). The scaler is fit on the training embeddings only.
# Author's stated rationale: put embeddings on a scale comparable with the
# normalized TF-IDF features.
scaler = StandardScaler(with_mean=False).fit(X_train_emb)
X_train_emb_scaled, X_test_emb_scaled = (
    scaler.transform(block) for block in (X_train_emb, X_test_emb)
)

# ---- Step 2: stack TF-IDF (sparse) and embeddings (dense) column-wise ----
# nan_to_num replaces any NaN in the scaled embeddings before stacking.
X_train_combined = hstack([X_train_vec, np.nan_to_num(X_train_emb_scaled)])
X_test_combined = hstack([X_test_vec, np.nan_to_num(X_test_emb_scaled)])

print(f"Combined train shape: {X_train_combined.shape}")
print(f"Combined test shape: {X_test_combined.shape}")


Combined train shape: (724, 2862)
Combined test shape: (181, 2862)


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Hybrid classifier trained on the stacked TF-IDF + embedding features, using
# the raw string intents as targets.
# `multi_class` is not passed: it is deprecated in recent scikit-learn, and
# with the lbfgs solver and more than two classes a multinomial model is
# already what gets fitted.
clf_hybrid = LogisticRegression(
    max_iter=1000,
    solver="lbfgs",
    random_state=42,
)
clf_hybrid.fit(X_train_combined, y_train)

# Per-class metrics on the held-out split
y_pred_hybrid = clf_hybrid.predict(X_test_combined)
print(classification_report(y_test, y_pred_hybrid))




                    precision    recall  f1-score   support

code_review_policy       0.94      0.79      0.86        19
deployment_history       0.83      0.88      0.86        17
deployment_process       0.89      0.89      0.89        19
     dev_env_setup       0.88      0.88      0.88        17
    employees_info       0.75      0.83      0.79        18
 escalation_policy       0.90      1.00      0.95        19
jira_ticket_status       0.94      0.83      0.88        18
  onboarding_guide       0.86      0.95      0.90        19
      out_of_scope       1.00      1.00      1.00        16
    team_structure       0.82      0.74      0.78        19

          accuracy                           0.88       181
         macro avg       0.88      0.88      0.88       181
      weighted avg       0.88      0.88      0.88       181



In [13]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

# Rescale every embedding row to unit L2 norm.
# NOTE(review): the embeddings were already requested with
# normalize_embeddings=True when they were encoded, so this step is expected
# to change them only by floating-point noise - confirm before removing.
train_norms = np.linalg.norm(X_train_emb, axis=1, keepdims=True)
test_norms = np.linalg.norm(X_test_emb, axis=1, keepdims=True)
X_train_norm = X_train_emb / train_norms
X_test_norm = X_test_emb / test_norms

# Similarity matrix: one row per test utterance, one column per training one
sims = cosine_similarity(X_test_norm, X_train_norm)

# Top-1 nearest neighbour: each test utterance takes the label of the most
# similar training utterance.
train_labels = y_train.values
y_pred_nn = [train_labels[row.argmax()] for row in sims]


print("üîç Embedding Similarity Search (Top-1) performance:")
print(classification_report(y_test, y_pred_nn))


üîç Embedding Similarity Search (Top-1) performance:
                    precision    recall  f1-score   support

code_review_policy       0.89      0.89      0.89        19
deployment_history       0.83      0.88      0.86        17
deployment_process       0.84      0.84      0.84        19
     dev_env_setup       0.78      0.82      0.80        17
    employees_info       0.56      0.50      0.53        18
 escalation_policy       0.89      0.89      0.89        19
jira_ticket_status       0.78      0.78      0.78        18
  onboarding_guide       0.80      0.84      0.82        19
      out_of_scope       1.00      0.94      0.97        16
    team_structure       0.63      0.63      0.63        19

          accuracy                           0.80       181
         macro avg       0.80      0.80      0.80       181
      weighted avg       0.80      0.80      0.80       181



Trying sentence embeddings with logistic regression, wrapped in a single scikit-learn pipeline

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer
import numpy as np

# Step 1: stratified 80/20 split of the raw texts and intents.
# Stratifying on the intent column preserves the intent distribution.
texts, intents = df["text"], df["intent"]
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    intents,
    test_size=0.2,
    stratify=intents,
    random_state=42,
)

# Step 2: load the sentence-embedding model by name
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 3: scikit-learn compatible wrapper around the sentence-embedding model
class Encoder(BaseEstimator, TransformerMixin):
    """Transformer that maps an iterable of texts to sentence embeddings.

    Parameters
    ----------
    embedding_model
        Object exposing ``encode(texts, normalize_embeddings=...)``; in this
        notebook a ``SentenceTransformer`` instance is passed in.
    """

    def __init__(self, embedding_model):
        self.embedding_model = embedding_model

    def fit(self, X, y=None):
        """Do nothing and return ``self``; no state is learned from ``X``."""
        return self

    def transform(self, X):
        """Return the model's embeddings for ``X`` with normalization enabled."""
        texts = list(X)  # materialize any iterable (e.g. a pandas Series)
        return self.embedding_model.encode(texts, normalize_embeddings=True)

# Step 4: build the pipeline - embed the raw text, then classify.
# `multi_class` is not passed to LogisticRegression: it is deprecated in
# recent scikit-learn, and with the lbfgs solver and more than two classes a
# multinomial model is already what gets fitted.
pipeline = Pipeline([
    ("encoder", Encoder(embedding_model)),
    ("clf", LogisticRegression(
        solver="lbfgs",
        max_iter=2000,
        random_state=42,
    )),
])

# Step 5: train on raw training texts and their string intents
pipeline.fit(X_train, y_train)

# Step 6: evaluate on the held-out texts
y_pred = pipeline.predict(X_test)
print("\nLangfuse-style Supervised Embedding + LR:")
print(classification_report(y_test, y_pred))





Langfuse-style Supervised Embedding + LR:
                    precision    recall  f1-score   support

code_review_policy       0.94      0.89      0.92        19
deployment_history       0.81      0.76      0.79        17
deployment_process       0.89      0.89      0.89        19
     dev_env_setup       0.94      1.00      0.97        17
    employees_info       0.67      0.67      0.67        18
 escalation_policy       0.95      1.00      0.97        19
jira_ticket_status       0.93      0.78      0.85        18
  onboarding_guide       0.86      0.95      0.90        19
      out_of_scope       0.84      1.00      0.91        16
    team_structure       0.76      0.68      0.72        19

          accuracy                           0.86       181
         macro avg       0.86      0.86      0.86       181
      weighted avg       0.86      0.86      0.86       181

