In [None]:
!pip install datasets scikit-learn lime

In [None]:
!pip install nltk


In [None]:
# Relation-id -> relation-name lookup. Only the two Cause-Effect ids keep a
# direction suffix; every other pair of ids shares one undirected name.
# NOTE(review): the direction suffixes on ids 0 and 1 may be swapped relative
# to the dataset's own label names — confirm before relying on them.
label_mapping = {
    0: 'Cause-Effect-e2-e1',
    1: 'Cause-Effect-e1-e2',
    2: 'Component-Whole',
    3: 'Component-Whole',
    4: 'Content-Container',
    5: 'Content-Container',
    6: 'Entity-Destination',
    7: 'Entity-Destination',
    8: 'Entity-Origin',
    9: 'Entity-Origin',
    10: 'Instrument-Agency',
    11: 'Instrument-Agency',
    12: 'Member-Collection',
    13: 'Member-Collection',
    14: 'Message-Topic',
    15: 'Message-Topic',
    16: 'Product-Producer',
    17: 'Product-Producer',
    18: 'Other',
}

In [None]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import re
import os
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from datasets import load_dataset
from openpyxl import load_workbook

# ========== تنظیمات NLTK ==========
def setup_nltk():
    """Download the NLTK resources used by this file and register their folder.

    Resources are stored under ``~/nltk_data``. Any failure is reported on
    stdout and otherwise ignored, so callers must tolerate missing resources.
    """
    nltk_data_path = os.path.expanduser('~/nltk_data')
    # 'punkt_tab' is the tokenizer data recent NLTK releases load for
    # word_tokenize; the older 'punkt' is kept for installations that still
    # expect it.
    resources = (
        'punkt',
        'punkt_tab',
        'wordnet',
        'omw-1.4',
        'averaged_perceptron_tagger',
    )
    try:
        os.makedirs(nltk_data_path, exist_ok=True)

        # Register the folder before downloading so it is searched even when a
        # download fails, and only once so re-running the cell does not keep
        # growing the search path.
        if nltk_data_path not in nltk.data.path:
            nltk.data.path.append(nltk_data_path)

        for resource in resources:
            nltk.download(resource, download_dir=nltk_data_path, quiet=True)
    except Exception as e:
        print(f"خطا در تنظیم NLTK: {str(e)}")

# Fetch the NLTK resources once, when this cell runs, before any text is processed.
setup_nltk()

# ========== توابع پردازش متن ==========
class TextPreprocessor:
    """Normalize raw text and reduce its tokens to WordNet lemmas."""

    # Compiled once: both patterns run for every piece of text processed.
    _DIGITS = re.compile(r'\d+')
    _PUNCTUATION = re.compile(r'[^\w\s]')
    # Part-of-speech tags tried in sequence on every token; the output of one
    # pass is the input of the next.
    _POS_ORDER = ('v', 'n', 'a')

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Return *text* lower-cased with digits and punctuation removed.

        Args:
            text: Any value; it is converted with ``str()`` before cleaning.

        Returns:
            str: The cleaned text, or ``""`` when *text* is missing
            (``pd.isna``) or contains only whitespace.
        """
        if pd.isna(text) or not str(text).strip():
            return ""
        text = self._DIGITS.sub('', str(text))
        text = self._PUNCTUATION.sub('', text)
        return text.lower().strip()

    def lemmatize_text(self, text):
        """Clean *text*, tokenize it and join the lemmatized tokens with spaces.

        Args:
            text: Any value accepted by :meth:`clean_text`.

        Returns:
            str: Space-separated lemmas, or ``""`` when nothing is left after
            cleaning.
        """
        text = self.clean_text(text)
        if not text:
            return ""

        try:
            tokens = nltk.word_tokenize(text)
        except Exception:
            # Best-effort fallback (e.g. tokenizer data not installed). Unlike
            # a bare ``except`` this lets KeyboardInterrupt/SystemExit through.
            tokens = text.split()

        lemmas = []
        for token in tokens:
            for pos in self._POS_ORDER:
                token = self.lemmatizer.lemmatize(token, pos=pos)
            lemmas.append(token)

        return " ".join(lemmas)

# ========== پردازش داده‌ها ==========
def process_data():
    """Build the training corpus, the cleaned test frame and the ANOVA terms.

    Reads the ``train`` split of ``SemEvalWorkshop/sem_eval_2010_task_8``, the
    workbook ``contextual_paired_entities_V040129.xlsx`` (test sentences) and
    the workbook ``Anova Terms Class Labels.xlsx`` (terms per class).

    Returns:
        tuple: ``(train_texts, train_labels, test_df, important_terms)``

        * ``train_texts`` – lemmatized text found between the two entity tags
          of each kept training sentence.
        * ``train_labels`` – the mapped relation name for each kept sentence.
        * ``test_df`` – the test workbook with a ``cleaned_text`` column added;
          rows whose cleaned text is empty are removed.
        * ``important_terms`` – dict mapping a class name derived from the
          sheet name to a set of up to 35 lemmatized terms.
    """
    # NOTE: force_redownload fetches the dataset again on every call.
    ds = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8", download_mode="force_redownload")
    train_data = ds["train"]

    # Test sentences
    test_df = pd.read_excel('contextual_paired_entities_V040129.xlsx')
    test_df['Sentence'] = test_df['Sentence'].fillna('')

    txt_prs = TextPreprocessor()

    # Local mapping: training rows whose relation id is not listed are skipped.
    label_mapping = {
        0: 'Cause-Effect',
        1: 'Cause-Effect',
        2: 'Component-Whole',
        3: 'Component-Whole',
        18: 'Other'
    }

    train_texts = []
    train_labels = []
    for row in train_data:
        label = row['relation']
        if label not in label_mapping:
            continue
        # lemmatize_text returns "" for empty input, so one check covers both
        # "nothing between the entities" and "nothing left after cleaning".
        processed_text = txt_prs.lemmatize_text(get_terms_between_entities(row['sentence']))
        if processed_text:
            train_texts.append(processed_text)
            train_labels.append(label_mapping[label])

    test_df['cleaned_text'] = test_df['Sentence'].apply(
        lambda x: txt_prs.lemmatize_text(get_terms_between_entities(x))
    )
    # .copy() makes the result an independent frame, so columns can be added
    # to it later without pandas treating it as a slice of the original.
    test_df = test_df[test_df['cleaned_text'] != ""].copy()

    # ANOVA terms: a list of sheet names makes read_excel return one frame per
    # sheet from a single read of the workbook.
    sheet_names = ['anova_terms_cause_effect', 'anova_terms_component_whole']
    sheets = pd.read_excel('Anova Terms Class Labels.xlsx', sheet_name=sheet_names)
    important_terms = {}

    for sheet_name in sheet_names:
        class_name = sheet_name.replace('anova_terms_', '').replace('_', '-')

        # First 35 distinct lemmatized entries of the Lemma column
        lemma_terms = sheets[sheet_name]['Lemma'].dropna().apply(txt_prs.lemmatize_text).unique()[:35]
        important_terms[class_name] = set(lemma_terms)

    return train_texts, train_labels, test_df, important_terms

def get_terms_between_entities(sentence):
    """Return the text located between the ``<e1>`` and ``<e2>`` spans.

    Args:
        sentence: Text expected to contain ``<e1>…</e1>`` followed by
            ``<e2>…</e2>``.

    Returns:
        str: The stripped text between ``</e1>`` and ``<e2>``; ``""`` when
        *sentence* is not a string or the tags are not found in that order.
    """
    # Explicit type guard instead of a bare ``except``: non-string cells
    # (None, NaN, numbers) yield "" without hiding unrelated errors.
    if not isinstance(sentence, str):
        return ""
    match = re.search(r'<e1>.*?</e1>(.*?)<e2>.*?</e2>', sentence)
    return match.group(1).strip() if match else ""

# ========== آموزش مدل ==========
def train_and_predict(train_texts, train_labels, test_df, important_terms,
                      output_path='output_040225.xlsx'):
    """Train a soft-voting ensemble on the ANOVA-term counts and label the test set.

    Args:
        train_texts: Training texts.
        train_labels: Label for each entry of *train_texts*.
        test_df: DataFrame with a ``cleaned_text`` column. It is modified in
            place: ``Predicted_Label`` and ``Prediction_Probability`` columns
            are added.
        important_terms: Dict whose values are collections of terms; their
            union is the fixed vocabulary of the count vectorizer.
        output_path: Excel file the labelled test frame is written to.

    Returns:
        The same *test_df* object, with the two prediction columns added.
    """
    train_df = pd.DataFrame({
        'Text': train_texts,
        'Label': train_labels
    })

    # Balance the training data by down-sampling every class to the size of
    # the smallest one.
    min_class_samples = train_df['Label'].value_counts().min()
    balanced_train_df = pd.concat([
        train_df[train_df['Label'] == label].sample(min_class_samples, random_state=42)
        for label in train_df['Label'].unique()
    ])

    # Report the training/test sizes and the training class distribution
    print("\nاطلاعات داده‌های آموزشی و تست:")
    print("="*50)
    print(f"تعداد نمونه‌های آموزشی: {len(balanced_train_df)}")
    print(f"تعداد نمونه‌های تست: {len(test_df)}")
    print("\nتوزیع کلاس‌ها در داده آموزشی:")
    print(balanced_train_df['Label'].value_counts())

    all_terms = set()
    for terms in important_terms.values():
        all_terms.update(terms)

    # Sorted so the feature columns have the same order on every run; set
    # iteration order for strings is not stable across interpreter sessions,
    # which would make the fitted models vary despite the fixed random_state.
    vectorizer = CountVectorizer(vocabulary=sorted(all_terms), ngram_range=(1, 2))
    X_train = vectorizer.fit_transform(balanced_train_df['Text']).toarray()
    y_train = balanced_train_df['Label']

    clf1 = RandomForestClassifier(random_state=42)
    clf2 = GradientBoostingClassifier(random_state=42)
    clf3 = LogisticRegression(random_state=42)
    voting_clf = VotingClassifier(estimators=[('rf', clf1), ('gb', clf2), ('lr', clf3)], voting='soft')
    voting_clf.fit(X_train, y_train)

    # Soft voting picks the class with the highest averaged probability, so a
    # single predict_proba pass yields both the label and its probability.
    X_test = vectorizer.transform(test_df['cleaned_text']).toarray()
    probabilities = voting_clf.predict_proba(X_test)
    predicted = voting_clf.classes_[probabilities.argmax(axis=1)]
    test_df['Predicted_Label'] = ['others' if label == 'Other' else label for label in predicted]
    test_df['Prediction_Probability'] = probabilities.max(axis=1)

    # Report the predicted class distribution
    print("\nتوزیع کلاس‌ها در پیش‌بینی داده تست:")
    print(test_df['Predicted_Label'].value_counts())
    print("="*50)

    test_df.to_excel(output_path, index=False)
    return test_df

# اجرای اصلی
# Entry point: build the datasets, then train the ensemble and write its predictions.
if __name__ == "__main__":
    train_texts, train_labels, test_df, important_terms = process_data()
    train_and_predict(train_texts, train_labels, test_df, important_terms)

In [None]:
import pandas as pd
import inflect
import re

# Engine used further down to turn plural entity names into singular ones
p = inflect.engine()

# Read the workbook holding the predictions
excel_file = 'output_040225.xlsx'
df = pd.read_excel(excel_file)

# Mirror e1/e2 into working columns, then keep only the rows where both
# entities are present
df = df.assign(Entity1=df['e1'], Entity2=df['e2'])
df = df.dropna(subset=['Entity1', 'Entity2'])

# Normalize entities to singular form
def normalize_entity(entity):
    """Return *entity* with a single-word plural reduced to its singular form.

    Args:
        entity: Entity name taken from the data.

    Returns:
        The singular form when *entity* is one word and the inflect engine
        produces a string for it; otherwise *entity* unchanged. Multi-word
        names, blank strings and non-string values are always returned as-is.
    """
    # Blank or non-string cells (e.g. numbers read from Excel) cannot be
    # singularized; pass them through instead of letting .split() or the
    # inflect call raise.
    if not isinstance(entity, str) or not entity.strip():
        return entity
    # Names made of several words are kept untouched.
    if len(entity.split()) > 1:
        return entity
    singular = p.singular_noun(entity)
    # A non-string result means no singular form was produced.
    return singular if isinstance(singular, str) else entity

# Singularize single-word names in both entity columns
for entity_col in ('Entity1', 'Entity2'):
    df[entity_col] = df[entity_col].apply(normalize_entity)


def _strip_entity_tags(sentence):
    """Remove <e>/<eN> opening and closing tags; missing values pass through."""
    if pd.notna(sentence):
        return re.sub(r'</?e\d?>', '', sentence)
    return sentence


# Sentence text without the entity markup
df['Cleaned_Sentence'] = df['Sentence'].apply(_strip_entity_tags)

# Keep the four columns Cytoscape needs, under the names it expects
cytoscape_df = df[['Cleaned_Sentence', 'Entity1', 'Entity2', 'Predicted_Label']].rename(
    columns={
        'Cleaned_Sentence': 'sentence',
        'Entity1': 'source',
        'Entity2': 'target',
        'Predicted_Label': 'label',
    }
)

# Drop the edges labelled 'others'
keep_mask = cytoscape_df['label'] != 'others'
cytoscape_df = cytoscape_df[keep_mask]

# Write the edge list
cytoscape_df.to_csv('data_drugs_Betweens_twoClass_V040225.csv', index=False)

In [None]:
import json
import openai
import time
import pandas as pd

import os

# Prefer the OPENAI_API_KEY environment variable so the secret does not have
# to be written into this file; the literal placeholder stays as the fallback
# when the variable is unset or empty.
client = openai.OpenAI(
    api_key=os.environ.get('OPENAI_API_KEY') or 'my-key'
)

# تابع اصلی برای تشخیص رابطه
def detection_with_llm(ent1, ent2, sentence, mode='2class', delay_seconds=1):
    """Ask the chat model which relation holds between two entities in a sentence.

    Args:
        ent1: First entity, inserted into the prompt as "Entity 1 (e1)".
        ent2: Second entity, inserted into the prompt as "Entity 2 (e2)".
        sentence: Sentence that mentions both entities.
        mode: ``'2class'`` asks for cause-effect vs. component-whole;
            ``'binary_causal'`` asks for cause-effect vs. others.
        delay_seconds: Pause after each request, in seconds.

    Returns:
        str: The model's answer, stripped of surrounding whitespace, quotes
        and full stops and lower-cased; ``"error"`` when the request fails or
        the response cannot be read.

    Raises:
        ValueError: If *mode* is not one of the two supported values.
    """
    if mode == '2class':
        prompt = f"""
Determine which of the two following semantic relationships best describes the relationship between the two highlighted drug entities in the sentence.

Available relation types:

1. cause-effect:
   - Definition: One drug causes, leads to, or affects the function or effectiveness of the other.
   - Example: "Combining warfarin with aspirin can increase the risk of bleeding."
     → warfarin <cause-effect> aspirin

2. component-whole:
   - Definition: One drug is a part or ingredient of a drug combination or formulation.
   - Example: "Vitamin B complex includes B1, B2, and B6."
     → B1 <component-whole> Vitamin B complex

Sentence: "{sentence}"
Entity 1 (e1): "{ent1}"
Entity 2 (e2): "{ent2}"

Choose the most suitable label: cause-effect or component-whole.
Only return one of these labels. No explanation needed.
"""
    elif mode == 'binary_causal':
        prompt = f"""
Check if the following sentence suggests a causal relationship between the two highlighted drug entities.

Definition:
- cause-effect: One drug causes, leads to, or affects the function, activity, or effect of the other.
  Example: "Taking rifampin can reduce the effectiveness of oral contraceptives."
    → rifampin <cause-effect> oral contraceptives

If such a causal relationship exists, return "cause-effect".
Otherwise, return "others".

Sentence: "{sentence}"
Entity 1 (e1): "{ent1}"
Entity 2 (e2): "{ent2}"

Return only: cause-effect or others. No explanation.
"""
    else:
        raise ValueError("Invalid mode selected. Choose '2class' or 'binary_causal'.")

    llm_label = "Error"
    try:
        response = client.chat.completions.create(
            model='gpt-4o-mini',
            messages=[
                {"role": "system", "content": '''
                You are an expert in biomedical language understanding. You carefully assess relationships between pharmaceutical entities based on the context of the sentence and your knowledge of drug interactions and classifications.
            '''},
                {"role": "user", "content": prompt}
            ],
            max_tokens=100,
            temperature=0,
            timeout=120,
        )
    except Exception as e:
        # Per-row boundary of a batch job: a failed request (timeout, rate
        # limit, ...) is reported and labelled "error" so that one bad row
        # does not abort the run and discard the labels already collected.
        print(f"Error calling LLM: {e}")
    else:
        try:
            llm_label = response.choices[0].message.content.strip()
        except Exception as e:
            print(f"Error parsing response: {e}")

    # Drop decoration the model sometimes adds around the bare label,
    # e.g. '"cause-effect".' -> 'cause-effect'.
    llm_label = llm_label.strip('"\'`. ')

    print(prompt + '\n')
    time.sleep(delay_seconds)
    return llm_label.lower()

# تابع اجرای اصلی
def apply_stance_detection(input_path, output_path, mode):
    """Label every row of a CSV with the LLM and write the result to a new CSV.

    Args:
        input_path: CSV with ``source``, ``target`` and ``sentence`` columns.
        output_path: Destination CSV; it receives the input columns plus
            ``llm_label``.
        mode: Passed through to ``detection_with_llm``.
    """
    df = pd.read_csv(input_path)
    df['llm_label'] = None

    # Walk the three input columns in lockstep with the row index
    records = zip(df.index, df['source'], df['target'], df['sentence'])
    for idx, src, dst, text in records:
        answer = detection_with_llm(src, dst, text, mode=mode)
        df.at[idx, 'llm_label'] = answer
        print(f'Row {idx}: {answer}\n')

    df.to_csv(output_path, index=False)
    print(f"\n✔ Results saved to: {output_path}")

In [None]:
# Label the exported edge list with the LLM in '2class' mode and save the result as a new CSV.
apply_stance_detection("data_drugs_Betweens_twoClass_V040225.csv", "result_data_drugs_Betweens_twoClass_V040225.csv", "2class")

In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import os

def plot_confusion_matrix(
    csv_path,
    label_col='label',
    pred_col='llm_label',
    classes=None,
    title='Confusion Matrix',
    figsize=(10, 8),
    output_dir='./outputs',
    save_plot=True,
    save_report=True,
):
    """
    Plot a confusion matrix from a CSV file containing labels and predictions.

    Both label columns are lower-cased before comparison, and rows whose true
    or predicted label is not in ``classes`` are ignored.

    Args:
        csv_path (str): Path to the CSV file.
        label_col (str): Name of the column containing true labels.
        pred_col (str): Name of the column containing predicted labels.
        classes (list): List of possible class labels (if None, inferred from data).
        title (str): Title of the plot.
        figsize (tuple): Size of the figure (width, height).
        output_dir (str): Directory to save outputs.
        save_plot (bool): Whether to save the plot as PNG.
        save_report (bool): Whether to save the classification report as TXT.

    Returns:
        pandas.DataFrame: The confusion matrix, with ``classes`` as both the
        index (true labels) and the columns (predicted labels). It is also
        written to ``confusion_matrix.csv`` inside ``output_dir``.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Read CSV and preprocess labels
    df = pd.read_csv(csv_path)
    df[label_col] = df[label_col].str.lower()
    df[pred_col] = df[pred_col].str.lower()

    # Define classes if not provided
    if classes is None:
        # Missing values are left out: NaN cannot be sorted against strings.
        observed = set(df[label_col].dropna()) | set(df[pred_col].dropna())
        classes = sorted(observed)
    else:
        classes = list(classes)

    # Filter out invalid labels
    valid_labels = set(classes)
    df = df[df[label_col].isin(valid_labels) & df[pred_col].isin(valid_labels)]

    # Generate confusion matrix
    cm = confusion_matrix(df[label_col], df[pred_col], labels=classes)

    # Plot heatmap
    fig = plt.figure(figsize=figsize)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes,
    )
    plt.title(title)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()

    # Save plot
    if save_plot:
        plot_path = os.path.join(output_dir, 'confusion_matrix.png')
        plt.savefig(plot_path, bbox_inches='tight')

    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    # Generate and save classification report
    if save_report:
        # labels= pins the report rows to `classes` in the given order.
        # Without it the report uses the sorted labels present in the data,
        # so target_names could be attached to the wrong class, or the call
        # could fail when a class does not occur.
        report = classification_report(
            df[label_col],
            df[pred_col],
            labels=classes,
            target_names=classes,
            zero_division=0,
        )
        report_path = os.path.join(output_dir, 'classification_report.txt')
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(report)
        print(f"✅ Classification report saved to: {report_path}")

    # Save confusion matrix as CSV
    cm_df = pd.DataFrame(cm, index=classes, columns=classes)
    cm_csv_path = os.path.join(output_dir, 'confusion_matrix.csv')
    cm_df.to_csv(cm_csv_path)
    print(f"✅ Confusion matrix saved to: {cm_csv_path}\n")

    return cm_df

In [None]:
import pandas as pd

# کلاس‌های شما
CLASSES = ['cause-effect', 'component-whole']

# Build the confusion matrix (the LLM label is passed as the reference column)
cm_df = plot_confusion_matrix(
    csv_path='result_data_drugs_Betweens_twoClass_V040225.csv',
    label_col='llm_label',
    pred_col='label',
    classes=CLASSES,
    title='Drug Relations Confusion Matrix (LLM vs Model)',
    output_dir='./confusion_matrix_drug_betweens_twoClass_V040225',
)

# Turn the matrix into an edge list, one row of the matrix at a time;
# zero-count cells are left out
edges = []
for row_pos, source in enumerate(CLASSES):
    counts = cm_df.iloc[row_pos]
    edges.extend(
        [source, target, counts.iloc[col_pos]]
        for col_pos, target in enumerate(CLASSES)
        if counts.iloc[col_pos] > 0
    )

# Store the edge list as CSV for Cytoscape
edges_df = pd.DataFrame(edges, columns=["source", "target", "label"])
edges_df.to_csv("confusion_matrix_twoClass_V040225.csv", index=False)

In [None]:
import pandas as pd

# خواندن فایل CSV
file_path = "result_data_drugs_Betweens_twoClass_V040225.csv"
df = pd.read_csv(file_path)

# Rows where the model label and the LLM label disagree, one frame per direction
model_label = df["label"]
llm_label = df["llm_label"]
df_cause_effect = df[model_label.eq("Cause-Effect") & llm_label.eq("component-whole")]
df_others = df[model_label.eq("Component-Whole") & llm_label.eq("cause-effect")]

# One workbook, one sheet per model label
output_path = "reports_twoClass_filtered_withoutner_all_3_V040225.xlsx"
sheets = {
    "Cause-Effect": df_cause_effect,
    "Component-Whole": df_others,
}

with pd.ExcelWriter(output_path) as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"✅ فایل {output_path} ایجاد شد!")