In [53]:
# Download datasets

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

import re
import joblib

from sklearn.model_selection import StratifiedKFold, cross_validate

In [2]:
# Download datasets

dataset = pd.read_csv('Train_dataset_building/Train_dataset.csv')


In [25]:
# Drop dublicates

dataset = dataset.drop_duplicates(subset="Item")

In [56]:
# Transform Items into vectors

vectorizer = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(3, 5),
    lowercase=True
)

X = vectorizer.fit_transform(dataset['Item'])
y = dataset['Tag']

joblib.dump(vectorizer, "vectorizer.joblib")

['vectorizer.joblib']

In [33]:
# Raw columns
# Raw columns
X = dataset["Item"].astype(str)
y = dataset["Tag"].astype(str)

# Split RAW text (not vectorized yet)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Pipeline: vectorize -> classify
clf = Pipeline([
    ("tfidf", TfidfVectorizer(
        analyzer="char_wb",     # great for names with punctuation/typos
        ngram_range=(3, 5),
        lowercase=True
    )),
    ("lr", LogisticRegression(
        max_iter=2000
    ))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

   companies       1.00      1.00      1.00      4000
       dates       1.00      1.00      1.00      1236
       goods       0.99      0.92      0.95      1119
    location       0.97      0.90      0.93      1988
  random_str       0.95      1.00      0.97      4000

    accuracy                           0.98     12343
   macro avg       0.98      0.96      0.97     12343
weighted avg       0.98      0.98      0.98     12343



### Model diagnostic

In [43]:
# Leakage test

def leakage_rate(df):
    hits = 0
    for item, tag in zip(df["Item"].astype(str), df["Tag"].astype(str)):
        if re.search(rf"\b{re.escape(tag)}\b", item, flags=re.I):
            hits += 1
    return hits / len(df)

print("Leakage rate:", leakage_rate(dataset))

Leakage rate: 3.240913289364943e-05


In [30]:
# Any overlapping between test-train sets

train_items = set(X_train)
test_items  = set(X_test)
overlap = train_items & test_items

print("Exact overlap count:", len(overlap))
print("Exact overlap % of test:", len(overlap) / len(test_items))


Exact overlap count: 0
Exact overlap % of test: 0.0


In [31]:
# Suffle lables 

y_train_shuffled = np.random.permutation(y_train)
clf.fit(X_train, y_train_shuffled)

print("Accuracy with shuffled labels:", clf.score(X_test, y_test))

Accuracy with shuffled labels: 0.3807016122498582


In [41]:
# Class separation

labels = y.unique()
X_vec = vectorizer.fit_transform(X)
for cls in labels:
    y_bin = (y == cls).astype(int)
    clf = LogisticRegression(max_iter=2000)
    clf.fit(X_vec, y_bin)
    probs = clf.predict_proba(X_vec)[:, 1]
    auc = roc_auc_score(y_bin, probs)
    print(cls, "AUC:", round(auc, 3))



companies AUC: 1.0
dates AUC: 1.0
goods AUC: 0.999
random_str AUC: 1.0
location AUC: 0.998


#### Cross-validation test

In [52]:
# Run cross-val test pipelines

X = dataset["Item"].astype(str)
y = dataset["Tag"].astype(str)

pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        analyzer="char_wb",
        ngram_range=(3, 5),
        lowercase=True
    )),
    ("lr", LogisticRegression(max_iter=2000))
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_validate(
    pipe, X, y,
    cv=cv,
    scoring=["accuracy", "f1_macro", "f1_weighted"],
    n_jobs=-1,
    return_train_score=False
)

for k, v in scores.items():
    if k.startswith("test_"):
        print(k, "mean=", np.mean(v).round(4), "std=", np.std(v).round(4))


test_accuracy mean= 0.9728 std= 0.0007
test_f1_macro mean= 0.9672 std= 0.0011
test_f1_weighted mean= 0.9725 std= 0.0008


In [55]:
joblib.dump(pipe, "model.joblib")

['model.joblib']

Exception ignored in: <function ResourceTracker.__del__ at 0x102e21bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x102519bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x106bb1bc0>
Traceback (most recent call last