In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier
from tqdm import tqdm
import joblib
import os
from datetime import datetime

In [None]:
# ------------------ Load Base Data ------------------
# Read the raw table, trim stray whitespace from the header names, and keep
# only rows that carry both a sample name and a test code.
df = (
    pd.read_csv("Data_Final.csv")
    .rename(columns=str.strip)
    .dropna(subset=["SampleName", "MasterTestCode"])
)

In [None]:
# ------------------ Feature Engineering: Final with Time & Count Weights + SampleName Text ------------------
# Binary flags: missing values become 0 and the columns are stored as integers.
binary_cols = ["FeBase", "Destruct", "IsLarge"]
df = df.assign(**{flag: df[flag].fillna(0).astype(int) for flag in binary_cols})

# Dependency info: number of rows in the dependency file per Test1 code,
# attached to every row whose MasterTestCode equals that Test1 (0 when absent).
strong_deps = pd.read_csv("strong_test_dependencies.csv")
strong_counts = (
    strong_deps.groupby("Test1")
    .size()
    .reset_index(name="StrongDepCount")
)
df = pd.merge(
    df,
    strong_counts,
    how="left",
    left_on="MasterTestCode",
    right_on="Test1",
)
df["StrongDepCount"] = df["StrongDepCount"].fillna(0)

# Mean of the binary flags per MasterTestCode, merged back as *_mean columns.
avg_phys = (
    df.groupby("MasterTestCode")[binary_cols]
    .mean()
    .reset_index()
)
df = df.merge(avg_phys, on="MasterTestCode", suffixes=("", "_mean"))

# --- Time-based features ---
# Age in days is measured against the wall clock at execution time and is
# floored at 1 so the reciprocal below never divides by zero.
df["MaxDate"] = pd.to_datetime(df["MaxDate"], errors="coerce")
now = pd.Timestamp.now()
age_in_days = (now - df["MaxDate"]).dt.days.clip(lower=1)
df["TestAgeDays"] = age_in_days
df["TimeWeight"] = 1 / age_in_days

# --- Count-based features ---
df["TestImportance"] = df["TestCount"] * df["TimeWeight"]
df["LogTotalCount"] = np.log1p(df["TotalCount"])

# --- Weighted feature columns: each source column scaled by TestImportance ---
weighted_sources = {
    "WF_FeBase": "FeBase",
    "WF_Destruct": "Destruct",
    "WF_IsLarge": "IsLarge",
    "WF_StrongDep": "StrongDepCount",
}
for weighted_col, source_col in weighted_sources.items():
    df[weighted_col] = df[source_col] * df["TestImportance"]

# --- Group by SampleName ---
# Test codes are joined into one space-separated string per sample; every
# numeric column is averaged over the sample's rows.
aggregations = {
    "MasterTestCode": lambda codes: " ".join(map(str, codes.dropna())),
}
aggregations.update(
    dict.fromkeys(
        [
            "WF_FeBase",
            "WF_Destruct",
            "WF_IsLarge",
            "WF_StrongDep",
            "FeBase_mean",
            "Destruct_mean",
            "IsLarge_mean",
            "LogTotalCount",
        ],
        "mean",
    )
)
sample_features = (
    df.groupby("SampleName")
    .agg(aggregations)
    .rename(
        columns={
            "WF_FeBase": "FeBase_weighted",
            "WF_Destruct": "Destruct_weighted",
            "WF_IsLarge": "IsLarge_weighted",
            "WF_StrongDep": "StrongDepCount_weighted",
        }
    )
)

# --- TF-IDF of SampleName (word uni/bi-grams; meant to relate similar names, e.g. bolt M24 vs M42) ---
sample_features["SampleName"] = sample_features.index.astype(str)
sample_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2))
X_sample_name_text = sample_vectorizer.fit_transform(sample_features["SampleName"])

print("✅ فیچرهای نهایی با زمان، تکرار و متن SampleName ساخته شدند.")


In [None]:
# ------------------ Vectorize SampleName ------------------
# Character n-gram TF-IDF (3 to 5 characters, word-boundary aware), capped at
# 1000 features, fitted on the SampleName of every row in df.
row_sample_names = df["SampleName"].astype(str)
vectorizer = TfidfVectorizer(
    max_features=1000,
    ngram_range=(3, 5),
    analyzer="char_wb",
)
X_text = vectorizer.fit_transform(row_sample_names)

In [None]:
from scipy.sparse import hstack, csr_matrix

# 1. TF-IDF projection of each sample's joined MasterTestCode string.
# NOTE(review): `vectorizer` is the object currently bound in the kernel; in
# the preceding cell it is fitted on SampleName, not on test codes - confirm
# that reusing that vocabulary here is intended.
# NOTE(review): MasterTestCode is also what the label cells binarize as the
# target - confirm this feature does not leak the labels.
X_text = vectorizer.transform(sample_features["MasterTestCode"])

# 2. Final numeric features (missing values replaced by 0), also kept as a
#    sparse matrix so they can be stacked with the TF-IDF blocks.
numeric_columns = [
    "FeBase_weighted",
    "Destruct_weighted",
    "IsLarge_weighted",
    "StrongDepCount_weighted",
    "FeBase_mean",
    "Destruct_mean",
    "IsLarge_mean",
    "LogTotalCount",
]
X_numeric = sample_features[numeric_columns].fillna(0).values
X_numeric_sparse = csr_matrix(X_numeric)

# 3. SampleName text features: sample_vectorizer / X_sample_name_text are
#    expected to exist already from the feature-engineering cell.

# 4. Column-wise concatenation of all feature blocks.
feature_blocks = [X_sample_name_text, X_text, X_numeric_sparse]
X_all = hstack(feature_blocks)

print(f"🔢 شکل نهایی فیچرها: {X_all.shape}")


In [None]:
# ------------------ Prepare Labels ------------------
# One row per SampleName holding the set of its MasterTestCode values, then a
# multi-hot indicator matrix with one column per distinct test code.
grouped = (
    df.groupby("SampleName")["MasterTestCode"]
    .apply(set)
    .reset_index()
)
mlb = MultiLabelBinarizer()
y_all = mlb.fit(grouped["MasterTestCode"]).transform(grouped["MasterTestCode"])

In [None]:
# ------------------ Group the data ------------------
# Collect the set of test codes per SampleName and binarize it into y_all.
label_sets = df.groupby("SampleName")["MasterTestCode"].apply(set)
grouped = label_sets.reset_index()
mlb = MultiLabelBinarizer()
y_all = mlb.fit_transform(grouped["MasterTestCode"])

In [None]:
# ------------------ Select unique SampleNames and extract features ------------------
# One representative row per SampleName, ordered exactly like `grouped`
# (and therefore like the rows of y_all).
df_grouped = (
    df.drop_duplicates(subset="SampleName")
    .set_index("SampleName")
    .loc[grouped["SampleName"]]
    .reset_index()
)

# TF-IDF of SampleName (default word analyzer, at most 100 terms).
vectorizer = TfidfVectorizer(max_features=100)
X_text = vectorizer.fit_transform(df_grouped["SampleName"].astype(str))

# Time/count-weighted numeric features aggregated per sample.
numeric_feature_columns = [
    "FeBase_weighted", "Destruct_weighted", "IsLarge_weighted",
    "StrongDepCount_weighted",
    "FeBase_mean", "Destruct_mean", "IsLarge_mean",
    "LogTotalCount",
]
# Rows are selected by SampleName so the numeric block lines up with X_text
# and y_all even if sample_features is ever ordered differently.
X_numeric = (
    sample_features.loc[grouped["SampleName"], numeric_feature_columns]
    .fillna(0)
    .to_numpy()
)

# X_numeric is already a NumPy array here; calling .to_numpy() on it again
# (as the previous version did) raises AttributeError.
X_all = np.hstack([X_text.toarray(), X_numeric])

In [None]:
# ------------------ Train / test split ------------------
# Hold out 10% of the samples; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    random_state=42,
    test_size=0.1,
)

In [None]:
# ------------------ Train one model per label ------------------
# Binary relevance: an independent LightGBM classifier is fitted on each
# column of the label matrix; models[k] corresponds to label column k.
models = []
print("🎯 شروع آموزش مدل LightGBM برای هر برچسب...")
n_labels = y_train.shape[1]
for label_idx in tqdm(range(n_labels), desc="Training labels"):
    classifier = LGBMClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    models.append(classifier.fit(X_train, y_train[:, label_idx]))

# ------------------ Save Artifacts ------------------
os.makedirs("model_output_new", exist_ok=True)
for artifact_path, artifact in (
    ("model_output_new/lightgbm_models.pkl", models),
    ("model_output_new/vectorizer.pkl", vectorizer),
    ("model_output_new/label_binarizer.pkl", mlb),
):
    joblib.dump(artifact, artifact_path)

In [None]:
# ------------------ Save Artifacts ------------------
# Persist the per-label models, the SampleName vectorizer and the label
# binarizer under model_output/ (the directory is created if missing).
os.makedirs("model_output", exist_ok=True)
artifacts = {
    "model_output/lightgbm_models.pkl": models,
    "model_output/vectorizer.pkl": vectorizer,
    "model_output/label_binarizer.pkl": mlb,
}
for target_path, payload in artifacts.items():
    joblib.dump(payload, target_path)

In [None]:
df = pd.read_csv("Data_Final.csv")
df.columns = df.columns.str.strip()

# Drop records without a sample name or a test code.
df = df.dropna(subset=["SampleName", "MasterTestCode"])

# Randomly pick 10% of the *SampleNames* and keep every row that belongs to
# them. Sampling individual rows would keep only a fraction of each sample's
# tests, so the label sets rebuilt per SampleName during evaluation would be
# incomplete.
# NOTE(review): these samples still come from the same file the models were
# trained on, so metrics computed on this subset are optimistic - consider
# evaluating on the X_test / y_test hold-out instead.
sampled_names = df["SampleName"].drop_duplicates().sample(frac=0.1, random_state=42)
df_sampled = df[df["SampleName"].isin(sampled_names)]

# Save the separated test file.
df_sampled.to_csv("test_subset.csv", index=False)

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, hamming_loss, label_ranking_average_precision_score
from tqdm import tqdm
import joblib

# ------------------ Load Models & Tools ------------------
# NOTE: joblib.load unpickles arbitrary objects - only load artifacts that
# were written by this notebook / a trusted source.
print("📦 بارگذاری مدل‌ها و ابزارها...")
models = joblib.load("model_output/lightgbm_models.pkl")
vectorizer = joblib.load("model_output/vectorizer.pkl")  # fitted on SampleName
mlb = joblib.load("model_output/label_binarizer.pkl")
print(f"✅ {len(models)} مدل بارگذاری شد.")

# ------------------ Load Test Data ------------------
print("📄 بارگذاری داده‌ی جدید...")
df = pd.read_csv("test_subset.csv")
df.columns = df.columns.str.strip()

# ------------------ Feature Engineering ------------------
# The steps below mirror the training feature-engineering cell so that the
# matrix handed to the models has the same columns, in the same order, as the
# one they were fitted on: [TF-IDF(SampleName) | 8 numeric aggregates].
print("🧠 ساخت فیچر برای هر SampleName...")

for col in ["FeBase", "Destruct", "IsLarge"]:
    df[col] = df[col].fillna(0).astype(int)

strong_deps = pd.read_csv("strong_test_dependencies.csv")
strong_counts = strong_deps.groupby("Test1").size().reset_index(name="StrongDepCount")
df = df.merge(strong_counts, left_on="MasterTestCode", right_on="Test1", how="left")
df["StrongDepCount"] = df["StrongDepCount"].fillna(0)

avg_phys = df.groupby("MasterTestCode")[["FeBase", "Destruct", "IsLarge"]].mean().reset_index()
df = df.merge(avg_phys, on="MasterTestCode", suffixes=("", "_mean"))

# Time / count weights, as in training.
# NOTE(review): the age is measured against the current wall clock, so the
# weights differ slightly from the ones seen at training time - consider
# persisting the training reference date alongside the models.
df["MaxDate"] = pd.to_datetime(df["MaxDate"], errors="coerce")
now = pd.Timestamp.now()
df["TestAgeDays"] = (now - df["MaxDate"]).dt.days.clip(lower=1)
df["TimeWeight"] = 1 / df["TestAgeDays"]
df["TestImportance"] = df["TestCount"] * df["TimeWeight"]
df["LogTotalCount"] = np.log1p(df["TotalCount"])

df["WF_FeBase"] = df["FeBase"] * df["TestImportance"]
df["WF_Destruct"] = df["Destruct"] * df["TestImportance"]
df["WF_IsLarge"] = df["IsLarge"] * df["TestImportance"]
df["WF_StrongDep"] = df["StrongDepCount"] * df["TestImportance"]

# One row per SampleName (index = SampleName) holding the averaged features.
df_grouped = df.groupby("SampleName").agg({
    "WF_FeBase": "mean",
    "WF_Destruct": "mean",
    "WF_IsLarge": "mean",
    "WF_StrongDep": "mean",
    "FeBase_mean": "mean",
    "Destruct_mean": "mean",
    "IsLarge_mean": "mean",
    "LogTotalCount": "mean",
}).rename(columns={
    "WF_FeBase": "FeBase_weighted",
    "WF_Destruct": "Destruct_weighted",
    "WF_IsLarge": "IsLarge_weighted",
    "WF_StrongDep": "StrongDepCount_weighted",
})

# ------------------ Build the final feature matrix ------------------
numeric_feature_columns = [
    "FeBase_weighted", "Destruct_weighted", "IsLarge_weighted",
    "StrongDepCount_weighted",
    "FeBase_mean", "Destruct_mean", "IsLarge_mean",
    "LogTotalCount",
]
X_text = vectorizer.transform(df_grouped.index.astype(str))
X_numeric = df_grouped[numeric_feature_columns].fillna(0).values
X_numeric_sparse = csr_matrix(X_numeric)
X_new = hstack([X_text, X_numeric_sparse]).tocsr()
print(f"🔢 شکل نهایی فیچرها: {X_new.shape}")

# Fail fast on a train/inference feature mismatch. Previously each model's
# ValueError was caught inside the loop, which left every probability at 0
# and silently produced meaningless predictions and metrics.
expected_n_features = getattr(models[0], "n_features_", None) if models else None
if expected_n_features is not None and X_new.shape[1] != expected_n_features:
    raise ValueError(
        f"Feature mismatch: models expect {expected_n_features} columns "
        f"but the inference matrix has {X_new.shape[1]}."
    )

# ------------------ Predict ------------------
print("🔍 شروع پیش‌بینی...")
y_pred_proba = np.zeros((X_new.shape[0], len(models)))
for i, model in tqdm(enumerate(models), total=len(models), desc="پیش‌بینی مدل‌ها"):
    # Column 1 of predict_proba is the probability of the positive class.
    y_pred_proba[:, i] = model.predict_proba(X_new)[:, 1]

# ------------------ Thresholding ------------------
# A label is predicted only when its probability exceeds the threshold.
threshold = 0.75
y_pred_bin = (y_pred_proba > threshold).astype(int)
predicted_labels = mlb.inverse_transform(y_pred_bin)

df_output = pd.DataFrame({
    "SampleName": df_grouped.index,
    "Predicted_Tests": predicted_labels
})
df_output.to_csv("predictions.csv", index=False)
print("✅ پیش‌بینی‌ها در predictions.csv ذخیره شد.")

# ------------------ Evaluation (when ground truth is available) ------------------
if "MasterTestCode" in df.columns:
    print("📊 شروع ارزیابی مدل...")
    # True label list per SampleName, reordered to match the prediction rows;
    # samples without any label fall back to an empty list.
    label_series = df.groupby("SampleName")["MasterTestCode"].apply(lambda codes: list(set(codes)))
    y_true = mlb.transform(label_series.reindex(df_grouped.index).fillna("").apply(lambda x: [] if x == "" else x))

    print(f"🔍 LRAP: {label_ranking_average_precision_score(y_true, y_pred_proba):.4f}")
    print(f"🎯 Micro F1: {f1_score(y_true, y_pred_bin, average='micro'):.4f}")
    print(f"🎯 Micro Precision: {precision_score(y_true, y_pred_bin, average='micro'):.4f}")
    print(f"🎯 Micro Recall: {recall_score(y_true, y_pred_bin, average='micro'):.4f}")
    print(f"🎯 Hamming Loss: {hamming_loss(y_true, y_pred_bin):.4f}")


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd
import matplotlib.pyplot as plt
import joblib

# Per-label metrics. `mlb`, `y_true` and `y_pred_bin` are taken from the
# kernel state left by the evaluation cell.
per_label_options = dict(average=None, zero_division=0)
precisions = precision_score(y_true, y_pred_bin, **per_label_options)
recalls = recall_score(y_true, y_pred_bin, **per_label_options)
f1s = f1_score(y_true, y_pred_bin, **per_label_options)

# One row per label with its three scores.
label_metrics = pd.DataFrame(
    {
        "Label": mlb.classes_,
        "Precision": precisions,
        "Recall": recalls,
        "F1 Score": f1s,
    }
)

# Best F1 first; the ten first and ten last rows are kept for plotting.
label_metrics_sorted = (
    label_metrics
    .sort_values(by="F1 Score", ascending=False)
    .reset_index(drop=True)
)
top_labels = label_metrics_sorted.iloc[:10]
bottom_labels = label_metrics_sorted.iloc[-10:]

def plot_label_metrics(df, title):
    """Show a horizontal bar chart of F1 score per label.

    Parameters
    ----------
    df : frame with "Label" and "F1 Score" columns; rows are drawn top to
        bottom in the order given.
    title : text placed above the chart.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(df["Label"], df["F1 Score"])
    ax.set_xlabel("F1 Score")
    ax.set_title(title)
    # barh draws the first row at the bottom; flip so it appears on top.
    ax.invert_yaxis()
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# Draw the ten best and the ten weakest labels by F1, one chart each.
for metrics_frame, chart_title in (
    (top_labels, "🎯 بهترین 10 برچسب از نظر F1"),
    (bottom_labels, "⚠️ ضعیف‌ترین 10 برچسب از نظر F1"),
):
    plot_label_metrics(metrics_frame, chart_title)
