In [6]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sentence_transformers import SentenceTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_is_fitted


In [2]:
# Load the labelled utterances; the CSV must provide "text" and "intent" columns.
DATA_PATH = Path("dataset.csv")

# Build the cleaned frame in one idempotent chain so re-running the cell
# always starts from the file on disk rather than from a mutated `df`.
df = (
    pd.read_csv(DATA_PATH)
    # Drop rows with a missing text or label BEFORE the str cast below:
    # astype(str) would otherwise turn NaN into the literal string "nan",
    # creating a bogus sample/class (and a one-member class breaks the
    # stratified split).
    .dropna(subset=["text", "intent"])
    # Shuffle with a fixed seed so the row order is reproducible.
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    # Strip surrounding whitespace so labels such as "foo " and "foo" coincide.
    .assign(
        text=lambda frame: frame["text"].astype(str).str.strip(),
        intent=lambda frame: frame["intent"].astype(str).str.strip(),
    )
)

# Label distribution
label_counts = df["intent"].value_counts()
display(label_counts)


# Split dataset: 80/20, stratified on the intent label, fixed seed.
X = df["text"]
y = df["intent"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")

intent
escalation_policy     97
code_review_policy    96
onboarding_guide      96
deployment_process    95
team_structure        95
employees_info        87
jira_ticket_status    87
deployment_history    86
dev_env_setup         86
out_of_scope          80
Name: count, dtype: int64

Train: 724, Test: 181


In [13]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless transformer that lower-cases and strips every input text.

    Accepts any one-dimensional collection pandas can wrap in a Series
    (Series, list, array) and always returns a plain ``list[str]``.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return the cleaned texts as a list of strings, one per input entry."""
        # accept Series/list/array → return list[str]
        # dtype="object" keeps the entries as Python objects so _clean sees
        # each value exactly as it was passed in.
        return [self._clean(t) for t in pd.Series(X, dtype="object")]

    @staticmethod
    def _clean(t):
        """Lower-case and strip one entry.

        Strings are processed as-is. Missing values (``None``, ``pd.NA``,
        float NaN) become the empty string, and any other type is converted
        with ``str()`` first, so a stray non-string entry no longer raises
        ``AttributeError``.
        """
        if isinstance(t, str):
            return t.lower().strip()
        # `t != t` is only true for NaN.
        if t is None or t is pd.NA or (isinstance(t, float) and t != t):
            return ""
        return str(t).lower().strip()

In [14]:

class EmbeddingVectorizer(BaseEstimator, TransformerMixin):
    """Sentence-embedding features, variance-scaled and returned as sparse.

    Texts are encoded with a ``SentenceTransformer`` (normalised embeddings),
    scaled with ``StandardScaler(with_mean=False)`` and wrapped in a
    ``csr_matrix`` so the block can sit next to TF-IDF in a ``FeatureUnion``.

    Parameters
    ----------
    model_name : str, default="all-MiniLM-L6-v2"
        Model identifier passed to ``SentenceTransformer``.

    Attributes
    ----------
    model_ : SentenceTransformer
        Encoder loaded during ``fit`` for the current ``model_name``.
    scaler_ : StandardScaler
        Scaler fitted on the training embeddings.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        # Only store the hyper-parameter here. Loading the model in __init__
        # would leave a stale encoder behind after set_params(model_name=...)
        # (which is what clone() + set_params() in a grid search does).
        self.model_name = model_name

    def _ensure_model(self):
        """Load the encoder, reloading only if ``model_name`` has changed."""
        if getattr(self, "_loaded_model_name", None) != self.model_name:
            self.model_ = SentenceTransformer(self.model_name)
            self._loaded_model_name = self.model_name

    def _encode(self, X):
        """Encode texts into a dense array of normalised embeddings."""
        # Convert to a plain list of str first; the original code notes that
        # handing other containers to encode() raised a KeyError.
        texts = pd.Series(X).astype(str).tolist()
        return self.model_.encode(texts, normalize_embeddings=True, convert_to_numpy=True)

    def _fit_embeddings(self, X):
        """Load the model, fit a fresh scaler on X, return the raw embeddings."""
        self._ensure_model()
        emb = self._encode(X)
        # with_mean=False: scale by variance only (keeps sparse-compatible).
        self.scaler_ = StandardScaler(with_mean=False).fit(emb)
        return emb

    def _scale(self, emb):
        """Apply the fitted scaler and return a sparse matrix."""
        scaled = self.scaler_.transform(emb)
        # return sparse to hstack with TF-IDF; nan_to_num guards against
        # non-finite values coming out of the scaling step.
        return csr_matrix(np.nan_to_num(scaled))

    def fit(self, X, y=None):
        """Fit the scaler on embeddings of the training texts."""
        self._fit_embeddings(X)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit and transform in one pass.

        The inherited ``fit_transform`` calls ``fit`` then ``transform``,
        which would run the (expensive) encoder twice on the same texts.
        """
        return self._scale(self._fit_embeddings(X))

    def transform(self, X):
        """Embed and scale new texts; raises ``NotFittedError`` before ``fit``."""
        check_is_fitted(self, ["model_", "scaler_"])
        return self._scale(self._encode(X))


# Word-level TF-IDF over 1- to 3-grams with English stop words removed
# (good baseline); vocabulary capped at 20,000 features.
word_tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    ngram_range=(1, 3),
    max_features=20000,
)


# Build the combined feature maker: TF-IDF columns and embedding columns are
# stacked side by side for every sample.
features = FeatureUnion(
    [
        ("tfidf", word_tfidf),  # a character-level TF-IDF could be swapped in here
        ("embed", EmbeddingVectorizer()),
    ],
    n_jobs=1,  # keep 1 if GPU/torch is in play
)

# Full pipeline = text cleaning + features + classifier.
# `multi_class` is intentionally not passed: the argument is deprecated in
# recent scikit-learn releases, and with solver="lbfgs" a multiclass target
# is already fitted with the multinomial loss.
pipeline = Pipeline(
    [
        ("prep", TextPreprocessor()),
        ("features", features),
        ("clf", LogisticRegression(max_iter=1000, solver="lbfgs", random_state=42)),
    ]
)


In [15]:
# Fit on the training split only; the held-out split is used purely for
# evaluation.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Per-intent precision / recall / F1 on the held-out split.
# (classification_report is imported in the notebook's import cell.)
print(classification_report(y_test, y_pred))



                    precision    recall  f1-score   support

code_review_policy       0.94      0.79      0.86        19
deployment_history       0.83      0.88      0.86        17
deployment_process       0.89      0.89      0.89        19
     dev_env_setup       0.88      0.88      0.88        17
    employees_info       0.75      0.83      0.79        18
 escalation_policy       0.90      1.00      0.95        19
jira_ticket_status       0.94      0.83      0.88        18
  onboarding_guide       0.86      0.95      0.90        19
      out_of_scope       1.00      1.00      1.00        16
    team_structure       0.82      0.74      0.78        19

          accuracy                           0.88       181
         macro avg       0.88      0.88      0.88       181
      weighted avg       0.88      0.88      0.88       181

