In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datasets/alaareda12/eng-data/lowercase_all_cleaned_output_no_additional_info (6) (4).xlsx


In [3]:
import os

# بيدور على الفايل في كل الـ input
for root, dirs, files in os.walk("/kaggle/input"):
    for file in files:
        print(os.path.join(root, file))

/kaggle/input/datasets/alaareda12/eng-data/lowercase_all_cleaned_output_no_additional_info (6) (4).xlsx


In [13]:
# ======================================================
# 1. Installing Libraries
# ======================================================
!pip install -q transformers datasets accelerate scikit-learn openpyxl

import pandas as pd
import numpy as np
import torch
import os
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
import datasets

warnings.filterwarnings('ignore')

# ======================================================
# 2. Data Loading
# ======================================================
FILE_PATH = '/kaggle/input/datasets/alaareda12/eng-data/lowercase_all_cleaned_output_no_additional_info (6) (4).xlsx'

print(f"Loading Data from: {FILE_PATH}")
raw_df = pd.read_excel(FILE_PATH)

raw_df.dropna(subset=['Full_name', 'Symptoms', 'DonorID'], inplace=True)
raw_df['Symptoms'] = raw_df['Symptoms'].astype(str).str.strip()
raw_df['Full_name'] = raw_df['Full_name'].astype(str).str.strip().str.lower()

# ======================================================
# 3. Timeline Aggregation — ✅ أضفنا Type
# ======================================================
def create_patient_timeline(group):
    group = group.sort_values('Years_before_death', ascending=False)
    first_row = group.iloc[0]

    sex       = first_row['Sex']  if pd.notna(first_row['Sex'])  else "Unknown"
    age       = first_row['Age']  if pd.notna(first_row['Age'])  else "Unknown"
    typ       = first_row['Type'] if pd.notna(first_row['Type']) else "Unknown"  # ✅ جديد
    diagnosis = first_row['Full_name']

    events = []
    for _, row in group.iterrows():
        years = row['Years_before_death']
        sym   = row['Symptoms']
        if pd.notna(years):
            events.append(f"At {int(years)} years pre-death: {sym}")
        else:
            events.append(f"Observed: {sym}")

    # ✅ أضفنا Type في النص
    full_text = f"Patient ({sex}, Age {age}, Type {typ}). History: " + " -> ".join(events)
    return pd.Series({'text': full_text, 'label_name': diagnosis})

print("Aggregating patient timelines by DonorID...")
df_grouped = raw_df.groupby('DonorID').apply(create_patient_timeline).reset_index()

targets = ["alzheimer's disease", "control brain", "parkinson's disease",
           "progressive supranuclear palsy", "multiple system atrophy"]
df_final = df_grouped[df_grouped['label_name'].isin(targets)].copy()
print(f"Final Unique Patients: {len(df_final)}")
print(df_final['label_name'].value_counts())

le = LabelEncoder()
df_final['label'] = le.fit_transform(df_final['label_name'])

train_df, test_df = train_test_split(
    df_final, test_size=0.2, stratify=df_final['label'], random_state=0
)
print(f"Train size: {len(train_df)} | Test size: {len(test_df)}")

# ======================================================
# 4. Tokenization
# ======================================================
MODEL_NAME = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

ds_train = datasets.Dataset.from_pandas(train_df).map(preprocess, batched=True)
ds_test  = datasets.Dataset.from_pandas(test_df).map(preprocess, batched=True)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(le.classes_))

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

# ======================================================
# 5. Training Arguments
# ======================================================
args = TrainingArguments(
    output_dir="/kaggle/working/timeline_model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=20,
    fp16=True,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=1,
    report_to="none"
)

# ======================================================
# 6. Trainer
# ======================================================
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_test,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=6)]
)

print("Starting Training...")
trainer.train()

# ======================================================
# 7. Final Evaluation
# ======================================================
print("\n===== Final Evaluation Report =====")
preds = trainer.predict(ds_test)
y_pred = np.argmax(preds.predictions, axis=1)
y_true = preds.label_ids

print(f"Final Accuracy: {accuracy_score(y_true, y_pred)*100:.2f}%")
print(classification_report(y_true, y_pred, target_names=le.classes_))

Loading Data from: /kaggle/input/datasets/alaareda12/eng-data/lowercase_all_cleaned_output_no_additional_info (6) (4).xlsx
Aggregating patient timelines by DonorID...
Final Unique Patients: 1329
label_name
alzheimer's disease               703
control brain                     341
parkinson's disease               133
progressive supranuclear palsy     91
multiple system atrophy            61
Name: count, dtype: int64
Train size: 1063 | Test size: 266


Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

Map:   0%|          | 0/266 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.decoder.weight             | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.decoder.bias               | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect id

Starting Training...


Epoch,Training Loss,Validation Loss,Accuracy
1,3.483384,0.97107,0.838346
2,1.638865,0.8165,0.830827
3,1.431054,0.721066,0.87594
4,1.415827,0.682151,0.909774
5,1.215608,0.602142,0.913534
6,0.976646,0.577086,0.921053
7,0.794073,0.595576,0.921053
8,0.698344,0.577579,0.932331
9,0.625562,0.564204,0.928571
10,0.563874,0.641355,0.906015


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La


===== Final Evaluation Report =====


Final Accuracy: 93.23%
                                precision    recall  f1-score   support

           alzheimer's disease       0.96      0.98      0.97       141
                 control brain       1.00      1.00      1.00        68
       multiple system atrophy       0.73      0.67      0.70        12
           parkinson's disease       0.74      0.93      0.82        27
progressive supranuclear palsy       1.00      0.50      0.67        18

                      accuracy                           0.93       266
                     macro avg       0.88      0.81      0.83       266
                  weighted avg       0.94      0.93      0.93       266

