In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier
from tqdm import tqdm
import joblib
import os
from datetime import datetime

In [None]:
# ------------------ Load Base Data ------------------
# Read the raw table, strip surrounding whitespace from the column names, and
# drop every row that lacks a sample name or a test code.
df = (
    pd.read_csv("Data_Final.csv")
    .rename(columns=str.strip)
    .dropna(subset=["SampleName", "MasterTestCode"])
)

In [None]:
# ------------------ Feature Engineering: time & count weights + SampleName text ------------------
# This cell only adds or overwrites columns on `df` (no merges), so it can be
# re-run on the same kernel without duplicating columns, adding a stray
# `Test1` column, or resetting the row index.
physical_flags = ["FeBase", "Destruct", "IsLarge"]

# Treat the physical attributes as integer flags; missing values become 0.
for col in physical_flags:
    df[col] = df[col].fillna(0).astype(int)

# Dependency info: number of rows in strong_test_dependencies.csv per `Test1`
# code, looked up by each row's MasterTestCode. Codes without an entry get 0.
strong_deps = pd.read_csv("strong_test_dependencies.csv")
strong_counts = strong_deps.groupby("Test1").size().reset_index(name="StrongDepCount")
strong_count_by_test = strong_counts.set_index("Test1")["StrongDepCount"]
df["StrongDepCount"] = df["MasterTestCode"].map(strong_count_by_test).fillna(0)

# Mean of each physical flag over all rows sharing a MasterTestCode, stored
# next to the raw flags as `<flag>_mean`.
avg_phys = df.groupby("MasterTestCode")[physical_flags].mean().reset_index()
avg_phys_by_test = avg_phys.set_index("MasterTestCode")
for col in physical_flags:
    df[f"{col}_mean"] = df["MasterTestCode"].map(avg_phys_by_test[col])

# --- Time-based features ---
# Unparseable dates become NaT. Age is clipped to at least 1 day so the
# reciprocal below never divides by zero or by a negative age.
# NOTE(review): age is measured against the wall clock, so these features
# change from run to run — consider a fixed reference date.
df["MaxDate"] = pd.to_datetime(df["MaxDate"], errors="coerce")
now = pd.Timestamp.now()
df["TestAgeDays"] = (now - df["MaxDate"]).dt.days.clip(lower=1)
df["TimeWeight"] = 1 / df["TestAgeDays"]

# --- Count-based features ---
df["TestImportance"] = df["TestCount"] * df["TimeWeight"]
df["LogTotalCount"] = np.log1p(df["TotalCount"])

# --- Weighted feature columns: each attribute scaled by TestImportance ---
df["WF_FeBase"] = df["FeBase"] * df["TestImportance"]
df["WF_Destruct"] = df["Destruct"] * df["TestImportance"]
df["WF_IsLarge"] = df["IsLarge"] * df["TestImportance"]
df["WF_StrongDep"] = df["StrongDepCount"] * df["TestImportance"]

# --- Group by SampleName ---
# One row per sample: its test codes joined into a single space-separated
# string, plus the mean of every numeric feature over the sample's rows.
sample_features = (
    df.groupby("SampleName")
    .agg({
        "MasterTestCode": lambda x: " ".join(str(i) for i in x.dropna()),
        "WF_FeBase": "mean",
        "WF_Destruct": "mean",
        "WF_IsLarge": "mean",
        "WF_StrongDep": "mean",
        "FeBase_mean": "mean",
        "Destruct_mean": "mean",
        "IsLarge_mean": "mean",
        "LogTotalCount": "mean",
    })
    .rename(columns={
        "WF_FeBase": "FeBase_weighted",
        "WF_Destruct": "Destruct_weighted",
        "WF_IsLarge": "IsLarge_weighted",
        "WF_StrongDep": "StrongDepCount_weighted",
    })
)

# --- TF-IDF of SampleName (word uni-/bi-grams, so that similar names such as
# "bolt m24" and "bolt m42" share features) ---
sample_features["SampleName"] = sample_features.index.astype(str)
sample_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2))
X_sample_name_text = sample_vectorizer.fit_transform(sample_features["SampleName"])

print("✅ فیچرهای نهایی با زمان، تکرار و متن SampleName ساخته شدند.")


In [None]:
# ------------------ Vectorize SampleName ------------------
# Character n-gram TF-IDF (3 to 5 characters, within word boundaries) over the
# SampleName of every row in `df`, capped at 1000 features.
sample_name_corpus = df["SampleName"].astype(str)
vectorizer = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(3, 5),
    max_features=1000,
)
X_text = vectorizer.fit_transform(sample_name_corpus)

In [None]:
from scipy.sparse import hstack, csr_matrix

# 1. TF-IDF of the MasterTestCode documents (one space-joined string of test
#    codes per sample). A dedicated vectorizer is fitted on these documents:
#    the shared `vectorizer` is fitted on SampleName character n-grams, so
#    transforming test codes with it scores them against the wrong vocabulary.
#    Codes are split on whitespace only and kept case-sensitive.
# NOTE(review): MasterTestCode is also the source of the label matrix in this
# notebook, so these columns expose the target — confirm this is intended.
test_code_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\S+", lowercase=False)
X_text = test_code_vectorizer.fit_transform(sample_features["MasterTestCode"])

# 2. Final numeric features (missing values replaced by 0), as a sparse matrix
#    so they can be stacked with the TF-IDF blocks.
numeric_feature_cols = [
    "FeBase_weighted", "Destruct_weighted", "IsLarge_weighted",
    "StrongDepCount_weighted",
    "FeBase_mean", "Destruct_mean", "IsLarge_mean",
    "LogTotalCount",
]
X_numeric = sample_features[numeric_feature_cols].fillna(0).values
X_numeric_sparse = csr_matrix(X_numeric)

# 3. SampleName text features (TF-IDF): `sample_vectorizer` and
#    `X_sample_name_text` are expected to exist from the feature-engineering cell.

# 4. Column-wise combination of all feature blocks; every block has one row
#    per entry of `sample_features`.
X_all = hstack([X_sample_name_text, X_text, X_numeric_sparse])

print(f"🔢 شکل نهایی فیچرها: {X_all.shape}")


In [None]:
# ------------------ Prepare Labels ------------------
# Collect the set of test codes seen for each sample, then binarize those sets
# into the multi-label target matrix (one column per distinct test code).
tests_by_sample = df.groupby("SampleName")["MasterTestCode"].apply(set)
grouped = tests_by_sample.reset_index()

mlb = MultiLabelBinarizer()
y_all = mlb.fit_transform(grouped["MasterTestCode"])

In [None]:
# ------------------ Filter Rare Labels Before Training ------------------
# A label is "rare" when it occurs in fewer than this many distinct samples.
min_samples_per_label = 15

label_counts = df.groupby("MasterTestCode")["SampleName"].nunique()
rare_labels = label_counts[label_counts < min_samples_per_label].index

# Build the filtered frame before reporting on it; printing `df_filtered`
# ahead of this assignment raises NameError on a fresh kernel.
# NOTE(review): the label/feature cells that follow in this notebook group
# `df`, not `df_filtered` — confirm whether the filter should reach training.
df_filtered = df[~df["MasterTestCode"].isin(rare_labels)].copy()

# Each message is paired with the quantity it names: rows kept after the
# filter, then the number of labels removed.
print(f"✅ تعداد ردیف‌های باقی‌مانده: {len(df_filtered)}")
print(f"✅ تعداد لیبل‌های حذف‌شده: {len(rare_labels)}")

In [None]:
# ------------------ Group the data ------------------
# Build one row per SampleName holding the set of its test codes, and encode
# those sets as the multi-label target matrix.
# NOTE(review): this groups the full `df`; `df_filtered` from the rare-label
# cell is not used here — confirm that is intended.
sample_label_sets = (
    df.groupby("SampleName")["MasterTestCode"]
    .apply(set)
)
grouped = sample_label_sets.reset_index()

mlb = MultiLabelBinarizer()
label_sets = grouped["MasterTestCode"]
y_all = mlb.fit_transform(label_sets)

In [None]:
# ------------------ Select unique SampleNames and extract features ------------------
# One row per SampleName, ordered exactly like `grouped` (and therefore like
# the rows of `y_all`, which is built from `grouped`).
df_grouped = (
    df.drop_duplicates(subset="SampleName")
    .set_index("SampleName")
    .loc[grouped["SampleName"]]
    .reset_index()
)

# TF-IDF of SampleName (word tokens, at most 100 features). This rebinds
# `vectorizer`, which is the object saved alongside the models.
vectorizer = TfidfVectorizer(max_features=100)
X_text = vectorizer.fit_transform(df_grouped["SampleName"].astype(str))

# Time-weighted numeric features per sample. Rows are selected by SampleName
# rather than taken positionally, so they are guaranteed to line up with
# `X_text` and `y_all`; a sample missing from `sample_features` raises KeyError
# instead of silently misaligning features and labels.
numeric_feature_cols = [
    "FeBase_weighted", "Destruct_weighted", "IsLarge_weighted",
    "StrongDepCount_weighted",
    "FeBase_mean", "Destruct_mean", "IsLarge_mean",
    "LogTotalCount",
]
X_numeric = (
    sample_features.loc[grouped["SampleName"].to_numpy(), numeric_feature_cols]
    .fillna(0)
    .values
)

# Combine the (densified) text features with the numeric features.
X_all = np.hstack([X_text.toarray(), X_numeric])
assert X_all.shape[0] == len(grouped), "feature rows must match the grouped samples"


In [None]:
# ------------------ Train / test split ------------------
# Hold out 10% of the samples; the fixed seed keeps the split reproducible.
split_parts = train_test_split(
    X_all,
    y_all,
    test_size=0.1,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# ------------------ Train one model per label ------------------
# Each column of `y_train` gets its own independent binary LightGBM
# classifier; the fitted models are kept in label-column order.
lgbm_params = dict(n_estimators=100, random_state=42, n_jobs=-1)
label_indices = range(y_train.shape[1])

print("🎯 شروع آموزش مدل LightGBM برای هر برچسب...")
models = []
for i in tqdm(label_indices, desc="Training labels"):
    y_label = y_train[:, i]
    model = LGBMClassifier(**lgbm_params).fit(X_train, y_label)
    models.append(model)
    
# ------------------ Save Artifacts ------------------
# Persist the per-label models, the SampleName vectorizer, and the label
# binarizer, creating the output directory if it does not exist yet.
os.makedirs("model_output_new", exist_ok=True)

artifacts = {
    "model_output_new/lightgbm_models.pkl": models,
    "model_output_new/vectorizer.pkl": vectorizer,
    "model_output_new/label_binarizer.pkl": mlb,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)