In [None]:
import gc
import os
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from lightgbm import LGBMClassifier, early_stopping
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    f1_score, precision_score, recall_score, hamming_loss,
    label_ranking_average_precision_score
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm

In [None]:
# ------------------ Load & Clean Data ------------------
# Read the raw table, coerce TotalCount to a numeric dtype (values that cannot
# be parsed become NaN) and keep only the rows where TotalCount is present.
df = pd.read_csv("Data_Final.csv")
df = (
    df.assign(TotalCount=pd.to_numeric(df["TotalCount"], errors="coerce"))
    .dropna(subset=["TotalCount"])
)

In [None]:
# ------------------ Filter Based on TotalCount ------------------
# One TotalCount value per MasterTestCode (the first row of each group).
group_total = df.groupby("MasterTestCode")["TotalCount"].first()

# Codes below the 20th percentile of TotalCount are discarded; adjust as needed.
threshold = group_total.quantile(0.2)  # Customize as needed
print(f"\n🎯 آستانه فیلتر: {threshold:.0f}")

# NOTE: this cell overwrites `df`, so re-running it recomputes the quantile on
# the already-filtered frame and filters again.
keep_code = group_total >= threshold
valid_codes = group_total.loc[keep_code].index
df = df.loc[df["MasterTestCode"].isin(valid_codes)].copy()

In [None]:
# ------------------ Filter ------------------
# Filter with an adjustable threshold (e.g. the 20% quantile, to drop classes
# with few samples). This reuses `group_total` from the previous cell and
# stores the result in `df_filtered`; none of the later cells visible in this
# notebook reference `df_filtered`.
threshold = group_total.quantile(0.2)
print(f"\n🎯 آستانه فیلتر: {threshold:.0f}")

keep_code = group_total >= threshold
valid_codes = group_total.loc[keep_code].index
df_filtered = df.loc[df["MasterTestCode"].isin(valid_codes)].copy()

In [None]:
# ------------------ Feature Engineering ------------------
phys_cols = ["FeBase", "Destruct", "IsLarge"]
mean_cols = [f"{col}_mean" for col in phys_cols]

# Missing physical flags are treated as 0 and stored as integers.
for col in phys_cols:
    df[col] = df[col].fillna(0).astype(int)

# Load dependency info: number of rows in the dependency file per Test1 value.
strong_deps = pd.read_csv("strong_test_dependencies.csv")
strong_counts = strong_deps.groupby("Test1").size().reset_index(name="StrongDepCount")

# Make the cell safe to re-run: if a previous run already merged these columns
# into `df`, drop them first. Otherwise the merges below would create suffixed
# duplicates (StrongDepCount_x / StrongDepCount_y) and the fillna line would
# fail with a KeyError.
if "StrongDepCount" in df.columns:
    df = df.drop(columns=["Test1", "StrongDepCount", *mean_cols], errors="ignore")

# Left merge keeps every row of df; codes without a dependency entry get 0.
df = df.merge(strong_counts, left_on="MasterTestCode", right_on="Test1", how="left")
df["StrongDepCount"] = df["StrongDepCount"].fillna(0)

# Mean physical attributes by MasterTestCode, attached as <col>_mean columns.
avg_phys = df.groupby("MasterTestCode")[phys_cols].mean().reset_index()
df = df.merge(avg_phys, on="MasterTestCode", suffixes=("", "_mean"))

In [None]:
# Time-based features: age of each test in days relative to the moment the cell
# runs (clipped to at least 1 day) and its reciprocal as a recency weight.
df["MaxDate"] = pd.to_datetime(df["MaxDate"], errors="coerce")
now = pd.Timestamp.now()
age_in_days = (now - df["MaxDate"]).dt.days.clip(lower=1)
df["TestAgeDays"] = age_in_days
df["TimeWeight"] = 1 / df["TestAgeDays"]

# Count-based features.
df["TestImportance"] = df["TestCount"] * df["TimeWeight"]
df["LogTotalCount"] = np.log1p(df["TotalCount"])

# Weighted features: each attribute scaled by TestImportance.
weighted_sources = {
    "WF_FeBase": "FeBase",
    "WF_Destruct": "Destruct",
    "WF_IsLarge": "IsLarge",
    "WF_StrongDep": "StrongDepCount",
}
for weighted_col, source_col in weighted_sources.items():
    df[weighted_col] = df[source_col] * df["TestImportance"]

In [None]:
# Grouping: one row per SampleName holding the list of its (non-null)
# MasterTestCode values and the mean of every numeric feature column.
averaged_cols = [
    "WF_FeBase",
    "WF_Destruct",
    "WF_IsLarge",
    "WF_StrongDep",
    "FeBase_mean",
    "Destruct_mean",
    "IsLarge_mean",
    "LogTotalCount",
]
agg_spec = {"MasterTestCode": lambda codes: list(codes.dropna())}
agg_spec.update({col: "mean" for col in averaged_cols})

weighted_names = {
    "WF_FeBase": "FeBase_weighted",
    "WF_Destruct": "Destruct_weighted",
    "WF_IsLarge": "IsLarge_weighted",
    "WF_StrongDep": "StrongDepCount_weighted",
}

sample_features = df.groupby("SampleName").agg(agg_spec)
sample_features = sample_features.rename(columns=weighted_names).reset_index()

# Drop the rows whose label list is empty.
has_labels = sample_features["MasterTestCode"].map(len) > 0
sample_features = sample_features[has_labels].reset_index(drop=True)

In [None]:
# Make sure SampleName is text before it is fed to the TF-IDF vectorizer.
sample_features = sample_features.astype({"SampleName": str})

In [None]:
# 2. Build TF-IDF features from the sample names: word unigrams and bigrams,
#    vocabulary capped at 20000 terms.
sample_vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    max_features=20000,
)

sample_name_text = sample_features["SampleName"].astype(str)
X_text = sample_vectorizer.fit_transform(sample_name_text)

In [None]:
# 3. Extract the numeric features: every column except the label list and the
#    sample name.
non_feature_cols = ["MasterTestCode", "SampleName"]
X_numeric = sample_features.drop(non_feature_cols, axis=1)

In [None]:
# 4. Convert the numeric feature table to a sparse CSR matrix so it can be
#    stacked with the TF-IDF matrix.
numeric_values = X_numeric.to_numpy()
X_numeric_sparse = csr_matrix(numeric_values)

In [None]:
# 5. Combine the features: numeric columns first, then the TF-IDF columns.
#    hstack returns a COO matrix by default, which does not support row
#    indexing; every later train_test_split call (one per class during
#    training) would have to convert it again. Request CSR once here instead.
X_final = hstack([X_numeric_sparse, X_text], format="csr")

In [None]:
# ------------------ Binarize Labels ------------------
# Turn each sample's list of MasterTestCode values into a row of a 0/1
# indicator matrix with one column per distinct code.
mlb = MultiLabelBinarizer()
label_lists = sample_features["MasterTestCode"]
y = mlb.fit_transform(label_lists)

In [None]:
# 6. Train/Test Split
# Guard against non-list label entries: anything that is not a list is
# replaced by an empty list.
sample_features["MasterTestCode"] = [
    codes if isinstance(codes, list) else []
    for codes in sample_features["MasterTestCode"]
]

# Build the multi-label encoder and binarize the labels.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(sample_features["MasterTestCode"])

# ------------------ Train/Test Split ------------------
# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# ------------------ Setting min_data_in_leaf ------------------
# Heuristic: 20% of the average number of positives per label (over labels that
# have at least one positive), limited to the range [50, 500].
label_pos_counts = y.sum(axis=0)
valid_mask = label_pos_counts > 0
avg_positives_per_class = np.mean(label_pos_counts[valid_mask])
scaled_positives = avg_positives_per_class * 0.2
capped_positives = min(500, scaled_positives)
min_data_leaf_value = int(max(50, capped_positives))
print(f"🔧 مقدار تنظیم‌شده min_data_in_leaf: {min_data_leaf_value}")

# ------------------ Train a model for a single class ------------------
# Number of label columns, i.e. how many per-class models may be trained.
n_classes = y.shape[1]

def train_model_for_class(i):
    """Train a binary LightGBM classifier for label column ``i`` of ``y``.

    Reads the notebook-level ``X_final``, ``y`` and ``min_data_leaf_value``.

    Parameters
    ----------
    i : int
        Column index into the label indicator matrix ``y``.

    Returns
    -------
    LGBMClassifier or None
        The fitted model, or ``None`` when the label has fewer than 5 positive
        samples in ``y``.
    """
    y_i = y[:, i]
    if y_i.sum() < 5:
        return None

    # NOTE(review): this uses the same test_size and random_state as the
    # notebook-level train/test split on the same rows, so the validation rows
    # used for early stopping appear to coincide with X_test - confirm whether
    # that is intended before trusting test metrics.
    # Local names are distinct from the notebook-level X_train / y_train so the
    # function does not shadow them.
    X_fit, X_val, y_fit_i, y_val_i = train_test_split(
        X_final, y_i, test_size=0.2, random_state=42
    )

    model = LGBMClassifier(
        n_estimators=300,
        learning_rate=0.05,
        random_state=42,
        n_jobs=2,
        is_unbalance=True,
        min_data_in_leaf=min_data_leaf_value,
        # Verbosity belongs on the estimator: `fit(verbose=...)` was removed in
        # LightGBM 4.x and raises a TypeError there.
        verbose=-1,
    )

    model.fit(
        X_fit, y_fit_i,
        eval_set=[(X_val, y_val_i)],
        eval_metric="binary_logloss",
        # Stop when the validation metric has not improved for 20 rounds;
        # verbose=False keeps the callback from printing per-model messages.
        callbacks=[early_stopping(stopping_rounds=20, verbose=False)],
    )

    return model

# ------------------ Parallel training ------------------
print("🚀 شروع آموزش موازی مدل‌ها...")
all_models = Parallel(n_jobs=2)(  # can be raised depending on available RAM and CPU
    delayed(train_model_for_class)(i) for i in tqdm(range(n_classes))
)

# ------------------ Drop classes that have no model ------------------
# train_model_for_class returns None for labels it skips. Filtering those out
# alone would lose the link between a model and its label column, so the column
# indices (and the matching labels) of the trained models are recorded as well.
trained_class_indices = [
    idx for idx, fitted in enumerate(all_models) if fitted is not None
]
models = [all_models[idx] for idx in trained_class_indices]
trained_classes = mlb.classes_[np.asarray(trained_class_indices, dtype=int)]

# ------------------ Save the models and tools ------------------
os.makedirs("model_output_new", exist_ok=True)
joblib.dump(models, "model_output_new/lightgbm_models.pkl")
joblib.dump(sample_vectorizer, "model_output_new/vectorizer.pkl")
joblib.dump(mlb, "model_output_new/label_binarizer.pkl")
# models[k] predicts the label trained_classes[k].
joblib.dump(trained_classes, "model_output_new/trained_classes.pkl")

print("✅ آموزش مدل‌ها با موفقیت به پایان رسید.")


In [None]:
# Model Training
#model = LogisticRegression(max_iter=1000)
#model.fit(X_train, y_train)

In [None]:
# Evaluation
# No notebook-level `model` exists (it is only a local inside
# train_model_for_class), so the scores are assembled from the per-class
# `models` list instead.
#
# `models` holds, in column order, one classifier for every label with at least
# 5 positives in `y` (the skip rule in train_model_for_class); recover those
# column indices so each model is written to its own label column.
trained_idx = np.flatnonzero(y.sum(axis=0) >= 5)
if len(trained_idx) != len(models):
    raise ValueError(
        f"Expected {len(trained_idx)} per-class models but found {len(models)}; "
        "re-run the training cell before evaluating."
    )

# Labels without a model keep a score of 0 and are therefore never predicted.
y_score = np.zeros(y_test.shape, dtype=float)
for label_col, clf in zip(trained_idx, models):
    class_proba = clf.predict_proba(X_test)
    # Pick the probability column of the positive class (label value 1).
    positive_cols = np.flatnonzero(np.asarray(clf.classes_) == 1)
    if positive_cols.size:
        y_score[:, label_col] = class_proba[:, positive_cols[0]]

y_pred = (y_score >= 0.5).astype(int)

micro_f1 = f1_score(y_test, y_pred, average="micro")
micro_precision = precision_score(y_test, y_pred, average="micro")
micro_recall = recall_score(y_test, y_pred, average="micro")
lrap = label_ranking_average_precision_score(y_test, y_score)
hamming = hamming_loss(y_test, y_pred)

In [None]:
# Report the evaluation metrics computed in the previous cell, one per line.
report_lines = [
    "\n✅ نتایج مدل:",
    f"Micro F1: {micro_f1:.4f}",
    f"Micro Precision: {micro_precision:.4f}",
    f"Micro Recall: {micro_recall:.4f}",
    f"LRAP: {lrap:.4f}",
    f"Hamming Loss: {hamming:.4f}",
]
print("\n".join(report_lines))

In [None]:
# Plot Histogram
# Distribution of the per-code TotalCount (first value of each MasterTestCode
# group) with a logarithmic count axis.
fig, ax = plt.subplots(figsize=(14, 6))
totals_per_code = df.groupby("MasterTestCode")["TotalCount"].first()
totals_per_code.hist(bins=30, log=True, ax=ax)
ax.set_xlabel("TotalCount")
ax.set_ylabel("تعداد کلاس‌ها (log)")
ax.set_title("پراکندگی TotalCount بر اساس MasterTestCode (بعد از فیلتر)")
ax.grid(True)
fig.tight_layout()
plt.show()