<a href="https://colab.research.google.com/github/biorsd/VIM-Polyp/blob/main/GUST_pathoIHC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d refikasultandogan/gust-colonpatho -p /content/Kaggle/
print('Data source import complete.')

!unzip /content/Kaggle/gust-colonpatho.zip -d /content/Kaggle/
print('Data source unzip complete.')



Dataset URL: https://www.kaggle.com/datasets/refikasultandogan/gust-colonpatho
License(s): unknown
Downloading gust-colonpatho.zip to /content/Kaggle
100% 7.29G/7.29G [06:08<00:00, 25.7MB/s]
100% 7.29G/7.29G [06:08<00:00, 21.3MB/s]
Data source import complete.
Archive:  /content/Kaggle/gust-colonpatho.zip
  inflating: /content/Kaggle/10x/10x/patho-patient1-polyp1-descending-nonneoplastic-hyperplastic-slideX10.tiff  
  inflating: /content/Kaggle/10x/10x/patho-patient1-polyp2-descending-nonneoplastic-hyperplastic-slideX10.tiff  
  inflating: /content/Kaggle/10x/10x/patho-patient1-polyp3-descending-nonneoplastic-hyperplastic-slideX10.tiff  
  inflating: /content/Kaggle/10x/10x/patho-patient10-polyp1-rectum-neoplastic-tubulovillous-slideX10_1.tiff  
  inflating: /content/Kaggle/10x/10x/patho-patient10-polyp1-rectum-neoplastic-tubulovillous-slideX10_2.tiff  
  inflating: /content/Kaggle/10x/10x/patho-patient10-polyp1-rectum-neoplastic-tubulovillous-slideX10_3.tiff  
  inflating: /content/Ka

In [None]:
# ===============================
# 📌 1. Gerekli Kütüphaneleri Yükleme
# ===============================
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision import models
from PIL import Image
import numpy as np
import pandas as pd
import os
import glob
import cv2
from transformers import ViTModel, ViTConfig, ViTFeatureExtractor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import Dataset, DataLoader


In [None]:
# ===============================
# 📌 2. Verileri Yükleme
# ===============================

# Google Colab için dosya yolları
ihc_data_path = "/content/Kaggle/ihc_data.xlsx"
patho_image_dir = "/content/Kaggle/10x/10x"

# Excel dosyasını oku
ihc_data = pd.ExcelFile(ihc_data_path)
ihc_df = ihc_data.parse("Sheet1")


In [None]:
# ===============================
# 📌 3. Hasta-Polip ID'yi Doğru Şekilde Oluştur (Hata Düzeltildi)
# ===============================

def create_patient_id(row):
    """
    Eğer 'polyp' değeri varsa 'patient-polyp' formatında döndür.
    Eğer 'polyp' boşsa, 'type' sütununun içeriğine göre 'normal' 'control' veya 'carcinom' olarak işaretle.
    """
    patient = str(row['patient']).strip() if pd.notna(row['patient']) else ""
    polyp = str(row['polyp']).strip() if pd.notna(row['polyp']) else ""
    type_value = str(row['type']).strip().lower() if pd.notna(row['type']) else ""

    if polyp:
        return f"{patient}-{polyp}"  # Hasta + Polip ID
    elif "normal" in type_value:
        return f"{patient}-normal"
    elif "control" in type_value:
        return f"{patient}-control"
    elif "carcinom" in type_value:
        return f"{patient}-carcinom"
    return None  # Eğer geçerli bir değer yoksa None döndür

# Yeni 'patient_id' sütununu oluştur
ihc_df["patient_id"] = ihc_df.apply(create_patient_id, axis=1)

# None (geçersiz) olan satırları sil
ihc_df = ihc_df.dropna(subset=["patient_id"])


# Patho görüntüleri için desteklenen tüm uzantılar
image_extensions = ["*.tiff", "*.tif", "*.jpg", "*.jpeg", "*.png"]

# Tüm dosya yollarını al
patho_images = []
for ext in image_extensions:
    patho_images.extend(glob.glob(os.path.join(patho_image_dir, "**", ext), recursive=True))

# Sonuçları yazdıralım
print(f"Toplam {len(patho_images)} görüntü bulundu.")
print("Örnek dosyalar: ", (patho_images[:5]) if len(patho_images) > 0 else "Görüntü bulunamadı!")


Toplam 400 görüntü bulundu.
Örnek dosyalar:  ['/content/Kaggle/10x/10x/patho-patient66-polyp1-descending-nonneoplastic-hamartomatosis-slideX10.tiff', '/content/Kaggle/10x/10x/patho-patient142-polyp1-walf-neoplastic-tubular-slideX10.tiff', '/content/Kaggle/10x/10x/patho-patient174-polyp2-colon-neoplastic-tubular-slideX10.tiff', '/content/Kaggle/10x/10x/patho-patient164-carcinom-sigmoid-carcinom-carcinom-slideX10.tiff', '/content/Kaggle/10x/10x/patho-patient5-polyp5-sigmoid-neoplastic-tubular-slideX10.tiff']


In [None]:
# Benzersiz type değerlerini ve kaç kez tekrar ettiklerini kontrol edelim
unique_types = ihc_df["polyp"].dropna().str.lower().str.strip().value_counts()

# Sonuçları yazdıralım
print("🔍 IHC Verisinde Bulunan Farklı 'type' Değerleri:")
print(unique_types)


🔍 IHC Verisinde Bulunan Farklı 'type' Değerleri:
polyp
polyp1      193
control1     71
polyp2       63
polyp3       26
carcinom     14
polyp4       13
polyp5        5
control2      5
polyp6        2
polyp7        1
polyp8        1
polyp9        1
polyp10       1
polyp11       1
polyp12       1
control3      1
Name: count, dtype: int64


In [None]:
# ===============================
# 📌 4. Patho Görüntülerinden Hasta ve Polip Bilgisini Çıkar
# ===============================

patho_data = []
for img_path in patho_images:
    filename = os.path.basename(img_path)
    parts = filename.split("-")

    if len(parts) >= 3:
        patient_id = str(parts[1]).strip()
        polyp_id = str(parts[2]).strip() if "polyp" in parts[2] else None
        status = None

        # Patho dosya adında açıkça "normal" veya "carcinom" geçiyorsa bunu belirle
        if "normal" in filename.lower():
            status = "normal"
        elif "carcinom" in filename.lower():
            status = "carcinom"
        elif "control" in filename.lower():
            status = "control"

        # 1️⃣ Polyp içeren vakalar için eşleşme kontrolü
        if polyp_id:
            patient_polyp = f"{patient_id}-{polyp_id}"
            if patient_polyp in ihc_df["patient_id"].values:
                patho_data.append({"patient_id": patient_polyp, "image_path": img_path})

        # 2️⃣ Normal ve Carcinom olanları IHC verisiyle eşleştirme
        elif status:
            patient_status = f"{patient_id}-{status}"
            if patient_status in ihc_df["patient_id"].values:
                patho_data.append({"patient_id": patient_status, "image_path": img_path})

        # 3️⃣ Eğer patho görüntüsünde "normal" yazmıyorsa ama IHC verisinde hasta "normal" olarak geçiyorsa yine ekleyelim
       # elif f"{patient_id}-normal" in ihc_df["patient_id"].values:
        #    patho_data.append({"patient_id": f"{patient_id}-normal", "image_path": img_path})

# Patho görüntülerini DataFrame'e çevir
patho_df = pd.DataFrame(patho_data)
print(len(patho_images))
len(patho_df)

400


316

In [None]:
print(len(patho_images))
len(patho_df)

400


316

In [None]:
patho_df

Unnamed: 0,patient_id,image_path
0,patient66-polyp1,/content/Kaggle/10x/10x/patho-patient66-polyp1...
1,patient142-polyp1,/content/Kaggle/10x/10x/patho-patient142-polyp...
2,patient174-polyp2,/content/Kaggle/10x/10x/patho-patient174-polyp...
3,patient164-carcinom,/content/Kaggle/10x/10x/patho-patient164-carci...
4,patient5-polyp5,/content/Kaggle/10x/10x/patho-patient5-polyp5-...
...,...,...
311,patient145-polyp1,/content/Kaggle/10x/10x/patho-patient145-polyp...
312,patient198-polyp1,/content/Kaggle/10x/10x/patho-patient198-polyp...
313,patient113-polyp2,/content/Kaggle/10x/10x/patho-patient113-polyp...
314,patient167-polyp1,/content/Kaggle/10x/10x/patho-patient167-polyp...


In [None]:



# ===============================
# 📌 4. Patho Görüntülerinden Hasta ve Polip Bilgisini Çıkar
# ===============================

patho_data = []
for img_path in patho_images:
    filename = os.path.basename(img_path)
    parts = filename.split("-")

    if len(parts) >= 3:
        patient_id = str(parts[1]).strip()
        polyp_id = str(parts[2]).strip() if "polyp" in parts[2] else None
        status = None

        # Patho dosya adında açıkça "normal" veya "carcinom" geçiyorsa bunu belirle
        if "normal" in filename.lower():
            status = "normal"
        elif "carcinom" in filename.lower():
            status = "carcinom"

        # 1️⃣ Polyp içeren vakalar için eşleşme kontrolü
        if polyp_id:
            patient_polyp = f"{patient_id}-{polyp_id}"
            if patient_polyp in ihc_df["patient_id"].values:
                patho_data.append({"patient_id": patient_polyp, "image_path": img_path})

        # 2️⃣ Normal ve Carcinom olanları IHC verisiyle eşleştirme
        elif status:
            patient_status = f"{patient_id}-{status}"
            if patient_status in ihc_df["patient_id"].values:
                patho_data.append({"patient_id": patient_status, "image_path": img_path})

        # 3️⃣ Eğer patho görüntüsünde "normal" yazmıyorsa ama IHC verisinde hasta "normal" olarak geçiyorsa yine ekleyelim
        elif f"{patient_id}-normal" in ihc_df["patient_id"].values:
            patho_data.append({"patient_id": f"{patient_id}-normal", "image_path": img_path})

# Patho görüntülerini DataFrame'e çevir
patho_df = pd.DataFrame(patho_data)

# ===============================
# 📌 5. Sadece EŞLEŞEN KAYITLARI ALMA
# ===============================

# Eşleşme yapmadan önce patient_id'lerin boşluklarını temizleyelim
ihc_df["patient_id"] = ihc_df["patient_id"].str.strip()
patho_df["patient_id"] = patho_df["patient_id"].str.strip()

# Merge işlemi: SADECE EŞLEŞENLER
merged_df = patho_df.merge(ihc_df, on="patient_id", how="inner")

# ===============================
# 📌 6. NaN Kalan Değerleri Temizleme
# ===============================

# NaN değerlerini kontrol et
print("🔍 NaN değerler kontrol ediliyor...")
print(merged_df.isnull().sum())

# Eğer NaN varsa, onları kaldır
merged_df = merged_df.dropna()


# ===============================
# 📌 7. CSV Olarak Kaydetme
# ===============================

csv_output_path = "/content/matched_patho_ihc_data.csv"
merged_df.to_csv(csv_output_path, index=False)

print(f"✅ Eşleştirilmiş veriler CSV olarak kaydedildi: {csv_output_path}")



🔍 NaN değerler kontrol ediliyor...
patient_id                       0
image_path                       0
ID                               0
colon                            0
patient                          0
polyp                            0
gender                           0
age                             56
location                         0
type                             0
subtype                          0
Ki-67(clone30-9)                 7
BRAF(cloneV600E)                 5
PD-L1epithelium(clone SP142)     5
PD-L1lymphocyte(clone SP142)     5
VEGF(clone SP125)                5
CD34(cloneQBend/10)              5
CD34(cloneQBend/10)skor          5
p53(clonebp53-11)                5
dtype: int64
✅ Eşleştirilmiş veriler CSV olarak kaydedildi: /content/matched_patho_ihc_data.csv


In [None]:
patho_data = []
for img_path in patho_images:
    filename = os.path.basename(img_path)
    parts = filename.split("-")

    if len(parts) >= 3:
        patient_id = str(parts[1]).strip()
        polyp_id = str(parts[2]).strip() if "polyp" in parts[2] else None
        status = None

        # Patho dosya adında açıkça "normal" veya "carcinom" geçiyorsa bunu belirle
        if "normal" in filename.lower():
            status = "normal"
        elif "carcinom" in filename.lower():
            status = "carcinom"
        elif "control" in filename.lower():
            status = "control"

        # 1️⃣ Polyp içeren vakalar için eşleşme kontrolü
        if polyp_id:
            patient_polyp = f"{patient_id}-{polyp_id}"
            if patient_polyp in ihc_df["patient_id"].values:
                patho_data.append({"patient_id": patient_polyp, "image_path": img_path})

        # 2️⃣ Normal ve Carcinom olanları IHC verisiyle eşleştirme
        elif status:
            patient_status = f"{patient_id}-{status}"
            if patient_status in ihc_df["patient_id"].values:
                patho_data.append({"patient_id": patient_status, "image_path": img_path})

        # 3️⃣ Eğer patho görüntüsünde "normal" yazmıyorsa ama IHC verisinde hasta "normal" olarak geçiyorsa yine ekleyelim
        elif f"{patient_id}-normal" in ihc_df["patient_id"].values:
            patho_data.append({"patient_id": f"{patient_id}-normal", "image_path": img_path})


In [None]:
parts

['patho', 'patient58', 'normal', 'normal', 'normal', 'normal', 'slideX10.tiff']

In [None]:
print(polyp_id)

None


In [None]:
patient_status

'patient58-normal'

In [None]:
 ihc_df["patient_id"].values

array(['patient1-polyp1', 'patient1-polyp2', 'patient1-polyp3',
       'patient2-polyp1', 'patient2-polyp2', 'patient3-polyp1',
       'patient4-polyp1', 'patient5-polyp1', 'patient5-polyp2',
       'patient5-polyp3', 'patient5-polyp4', 'patient5-polyp5',
       'patient6-polyp1', 'patient7-polyp1', 'patient8-polyp1',
       'patient9-polyp1', 'patient10-polyp1', 'patient11-polyp1',
       'patient12-polyp1', 'patient13-polyp1', 'patient13-polyp2',
       'patient13-polyp3', 'patient14-polyp1', 'patient15-polyp1',
       'patient15-polyp2', 'patient15-polyp3', 'patient16-polyp1',
       'patient17-polyp1', 'patient18-polyp1', 'patient19-carcinom',
       'patient20-polyp1', 'patient20-polyp2', 'patient20-polyp3',
       'patient20-polyp4', 'patient21-polyp1', 'patient22-polyp1',
       'patient22-polyp2', 'patient22-polyp3', 'patient22-polyp4',
       'patient23-polyp1', 'patient24-polyp1', 'patient25-polyp1',
       'patient26-polyp1', 'patient26-control1', 'patient27-polyp1',
       

In [None]:
polyp_id = str(parts[2]).strip() if "polyp" in parts[2] else None

In [None]:
patho_images

['/content/Kaggle/10x/10x/patho-patient188-polyp1-transverse-neoplastic-tubular-slideX10.tiff',
 '/content/Kaggle/10x/10x/patho-patient173-polyp2-splenicflexure-neoplastic-tubular-slideX10.tiff',
 '/content/Kaggle/10x/10x/patho-patient89-normal-ascending-normal-normal-slideX10.tiff',
 '/content/Kaggle/10x/10x/patho-patient167-carcinom-descending-carcinom-carcinom-slideX10.tiff',
 '/content/Kaggle/10x/10x/patho-patient113-polyp2-sigmoid-neoplastic-tubulovillous-slideX10.tiff',
 '/content/Kaggle/10x/10x/patho-patient46-polyp1-walf-nonneoplastic-hyperplastic-slideX10.tiff',
 '/content/Kaggle/10x/10x/patho-patient89-polyp9-descending-neoplastic-tubular-slideX10.tiff',
 '/content/Kaggle/10x/10x/patho-patient38-polyp2-rectum-neoplastic-tubular-slideX10.tiff',
 '/content/Kaggle/10x/10x/patho-patient203-carcinom-rectum-carcinom-carcinom-slideX10.tiff',
 '/content/Kaggle/10x/10x/patho-patient101-polyp1-sigmoid-neoplastic-tubular-slideX10.tiff',
 '/content/Kaggle/10x/10x/patho-patient20-polyp1-c

In [None]:
ihc_df

Unnamed: 0,ID,colon,patient,polyp,gender,age,location,type,subtype,Ki-67(clone30-9),BRAF(cloneV600E),PD-L1epithelium(clone SP142),PD-L1lymphocyte(clone SP142),VEGF(clone SP125),CD34(cloneQBend/10),CD34(cloneQBend/10)skor,p53(clonebp53-11),patient_id
0,1,colon,patient1,polyp1,M,53.0,descending,nonneoplastic,hyperplastic,50.0,negative,negative,negative,20.0,47.0,2.0,15.0,patient1-polyp1
1,1,colon,patient1,polyp2,M,53.0,descending,nonneoplastic,hyperplastic,40.0,negative,negative,negative,30.0,39.0,2.0,10.0,patient1-polyp2
2,1,colon,patient1,polyp3,M,53.0,descending,nonneoplastic,hyperplastic,50.0,negative,negative,negative,50.0,50.0,3.0,20.0,patient1-polyp3
3,2,colon,patient2,polyp1,M,49.0,ascending,nonneoplastic,hyperplastic,50.0,negative,negative,weak,40.0,63.0,3.0,5.0,patient2-polyp1
4,2,colon,patient2,polyp2,M,49.0,descending,neoplastic,villous,60.0,negative,negative,weak,70.0,57.0,2.0,25.0,patient2-polyp2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,204,colon,patient204,polyp1,M,,colon,neoplastic,tubular,70.0,negative,negative,negative,10.0,30.0,1.0,40.0,patient204-polyp1
403,205,colon,patient205,polyp1,M,62.0,descending,neoplastic,tubular,75.0,negative,negative,moderate,55.0,37.0,2.0,30.0,patient205-polyp1
404,205,colon,patient205,polyp2,F,,descending,neoplastic,tubulovillous,80.0,negative,negative,weak,60.0,40.0,2.0,35.0,patient205-polyp2
405,205,colon,patient205,polyp3,F,,transverse,neoplastic,tubular,70.0,negative,negative,moderate,50.0,36.0,2.0,35.0,patient205-polyp3


In [None]:
merged_df

Unnamed: 0,patient_id,image_path,ID,colon,patient,polyp,gender,age,location,type,subtype,Ki-67(clone30-9),BRAF(cloneV600E),PD-L1epithelium(clone SP142),PD-L1lymphocyte(clone SP142),VEGF(clone SP125),CD34(cloneQBend/10),CD34(cloneQBend/10)skor,p53(clonebp53-11)
0,patient188-polyp1,/content/Kaggle/10x/10x/patho-patient188-polyp...,188,colon,patient188,polyp1,F,65.0,transverse,neoplastic,tubular,80.0,negative,weak,moderate,20.0,29.0,1.0,30.0
3,patient46-polyp1,/content/Kaggle/10x/10x/patho-patient46-polyp1...,46,colon,patient46,polyp1,F,73.0,cecum,nonneoplastic,hyperplastic,20.0,negative,negative,negative,40.0,72.0,3.0,5.0
4,patient89-polyp9,/content/Kaggle/10x/10x/patho-patient89-polyp9...,89,colon,patient89,polyp9,F,56.0,descending,neoplastic,tubular,60.0,negative,negative,weak,50.0,40.0,2.0,50.0
5,patient38-polyp2,/content/Kaggle/10x/10x/patho-patient38-polyp2...,38,colon,patient38,polyp2,M,48.0,rectum,neoplastic,tubular,60.0,negative,weak,weak,0.0,30.0,1.0,30.0
6,patient203-carcinom,/content/Kaggle/10x/10x/patho-patient203-carci...,203,colon,patient203,carcinom,F,61.0,rectum,carcinom,carcinom,1.0,negative,negative,negative,10.0,42.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,patient7-polyp1,/content/Kaggle/10x/10x/patho-patient7-polyp1-...,7,colon,patient7,polyp1,M,89.0,descending,neoplastic,tubular,30.0,negative,negative,weak,30.0,49.0,2.0,15.0
311,patient90-polyp3,/content/Kaggle/10x/10x/patho-patient90-polyp3...,90,colon,patient90,polyp3,M,61.0,sigmoid,neoplastic,tubulovillous,50.0,negative,negative,weak,20.0,48.0,2.0,20.0
312,patient118-polyp1,/content/Kaggle/10x/10x/patho-patient118-polyp...,118,colon,patient118,polyp1,F,72.0,rectum,neoplastic,tubular,60.0,negative,negative,negative,30.0,55.0,2.0,15.0
314,patient151-polyp1,/content/Kaggle/10x/10x/patho-patient151-polyp...,151,colon,patient151,polyp1,F,78.0,sigmoid,neoplastic,tubulovillous,85.0,negative,negative,weak,50.0,56.0,2.0,40.0


In [None]:

# ===============================
# 📌 3. Görselleri İşleme (224x224)
# ===============================

# Görselleri dönüştürmek için ViT Feature Extractor
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

# Veri Seti
class PathoDataset(Dataset):
    def __init__(self, dataframe, feature_extractor, target_columns):
        self.dataframe = dataframe
        self.feature_extractor = feature_extractor
        self.target_columns = target_columns

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image_path = row["image_path"]
        image = Image.open(image_path).convert("RGB")

        # Görseli işle
        inputs = self.feature_extractor(images=image, return_tensors="pt")
        pixel_values = inputs["pixel_values"].squeeze(0)

        # Hedef değişkenleri al
        targets = row[self.target_columns].values.astype(np.float32)

        return pixel_values, torch.tensor(targets)

# Hedef değişkenler (IHC biyobelirteçleri)
target_columns = ["Ki-67(clone30-9)", "VEGF(clone SP125)", "p53(clonebp53-11)"]
merged_df[target_columns] = merged_df[target_columns].astype(float)

# Eğitim & Test Ayrımı
train_df, test_df = train_test_split(merged_df, test_size=0.2, random_state=42)

# Veri setleri ve DataLoader
train_dataset = PathoDataset(train_df, feature_extractor, target_columns)
test_dataset = PathoDataset(test_df, feature_extractor, target_columns)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



In [None]:

# ===============================
# 📌 4. ViT Modelini Tanımlama
# ===============================

class ViTRegressor(nn.Module):
    def __init__(self, num_outputs):
        super(ViTRegressor, self).__init__()
        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
        self.regressor = nn.Linear(self.vit.config.hidden_size, num_outputs)

    def forward(self, pixel_values):
        outputs = self.vit(pixel_values=pixel_values)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # CLS token'ı
        return self.regressor(cls_embedding)

# Modeli oluştur
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ViTRegressor(num_outputs=len(target_columns)).to(device)


config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [None]:

# ===============================
# 📌 5. Modeli Eğitme
# ===============================

# Kayıp fonksiyonu ve optimizer
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-4)

# Eğitim döngüsü
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for pixel_values, targets in train_loader:
        pixel_values, targets = pixel_values.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(pixel_values)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")


KeyboardInterrupt: 

In [None]:

# ===============================
# 📌 6. Modeli Test Etme
# ===============================

model.eval()
test_loss = 0

with torch.no_grad():
    for pixel_values, targets in test_loader:
        pixel_values, targets = pixel_values.to(device), targets.to(device)
        outputs = model(pixel_values)
        loss = criterion(outputs, targets)
        test_loss += loss.item()

print(f"Test Loss: {test_loss/len(test_loader):.4f}")

# ===============================
# 📌 7. Modelin Tahmin Yapması
# ===============================

def predict(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = feature_extractor(images=image, return_tensors="pt").to(device)
    model.eval()
    with torch.no_grad():
        output = model(inputs["pixel_values"])
    return output.cpu().numpy()

# Örnek bir görsel için tahmin
sample_image = test_df["image_path"].iloc[0]
predictions = predict(sample_image)
print(f"Tahmin Edilen IHC Değerleri: {predictions}")



Epoch [1/10], Loss: 1761.3371
Epoch [2/10], Loss: 1740.3820
Epoch [3/10], Loss: 1678.7583
Epoch [4/10], Loss: 1660.6949
Epoch [5/10], Loss: 1644.4491
Epoch [6/10], Loss: 1575.7351
Epoch [7/10], Loss: 1598.3503
Epoch [8/10], Loss: 1512.4255
Epoch [9/10], Loss: 1458.1289
Epoch [10/10], Loss: 1395.1377
Test Loss: 1608.5999
Tahmin Edilen IHC Değerleri: [[8.292288  7.1114264 6.0691967]]


In [None]:
# ===============================
# 📌 1. Modeli Eğitme (Geliştirilmiş Versiyon)
# ===============================

# Daha küçük öğrenme oranı + L2 regularization
optimizer = optim.Adam(model.parameters(), lr=1e-7, weight_decay=1e-5)

# Daha Güçlü Loss Fonksiyonu (Huber Loss - SmoothL1Loss)
criterion = nn.SmoothL1Loss()

# Küçük Batch Size (4 → 2)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# Modeli kaydetme fonksiyonu
model_save_path = "/content/vit_ihc_model_best.pth"

num_epochs = 30  # Daha fazla epoch (10 yerine 30)
best_loss = float("inf")

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for pixel_values, targets in train_loader:
        pixel_values, targets = pixel_values.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(pixel_values)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    epoch_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

    # Eğer en iyi loss'u elde edersek modeli kaydet
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        torch.save(model.state_dict(), model_save_path)
        print(f"✅ Yeni en iyi model kaydedildi: {model_save_path}")



Epoch [1/30], Loss: 27.7504
✅ Yeni en iyi model kaydedildi: /content/vit_ihc_model_best.pth
Epoch [2/30], Loss: 27.7475
✅ Yeni en iyi model kaydedildi: /content/vit_ihc_model_best.pth
Epoch [3/30], Loss: 27.7449
✅ Yeni en iyi model kaydedildi: /content/vit_ihc_model_best.pth
Epoch [4/30], Loss: 27.7426
✅ Yeni en iyi model kaydedildi: /content/vit_ihc_model_best.pth
Epoch [5/30], Loss: 27.7405
✅ Yeni en iyi model kaydedildi: /content/vit_ihc_model_best.pth
Epoch [6/30], Loss: 27.7384
✅ Yeni en iyi model kaydedildi: /content/vit_ihc_model_best.pth
Epoch [7/30], Loss: 27.7364
✅ Yeni en iyi model kaydedildi: /content/vit_ihc_model_best.pth
Epoch [8/30], Loss: 27.7345
✅ Yeni en iyi model kaydedildi: /content/vit_ihc_model_best.pth
Epoch [9/30], Loss: 27.7326
✅ Yeni en iyi model kaydedildi: /content/vit_ihc_model_best.pth
Epoch [10/30], Loss: 27.7307
✅ Yeni en iyi model kaydedildi: /content/vit_ihc_model_best.pth


In [None]:

# ===============================
# 📌 2. Modelin Tahminleri Gerçek Değerlere Çekilsin!
# ===============================

# Modeli yükleme fonksiyonu
def load_model(model_path):
    model = ViTRegressor(num_outputs=len(target_columns)).to(device)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    return model

# Modeli yükle
model = load_model(model_save_path)

def predict(image_path):
    """
    Tek bir patho görüntüsü için IHC biyobelirteçlerini tahmin et.
    """
    image = Image.open(image_path).convert("RGB")
    inputs = feature_extractor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model(inputs["pixel_values"])

    # Modelin tahminlerini tersine normalize et
    predictions = output.cpu().numpy().flatten()
    predictions = scaler.inverse_transform(predictions.reshape(1, -1))  # Özgün değerlere çek

    # NumPy formatına çevir ve sonucu dön
    return predictions

# Örnek bir görsel için tahmin
sample_image = test_df["image_path"].iloc[0]
predictions = predict(sample_image)

# Tahmini yazdır (Gerçek Ölçeğe Çekildi!)
predictions_dict = {target_columns[i]: predictions[0][i] for i in range(len(target_columns))}
print(f"Gerçek Ölçeğe Çevrilen Tahmin Edilen IHC Değerleri: {predictions_dict}")

In [None]:

# Excel dosyasını oku
ihc_data = pd.ExcelFile(ihc_data_path)
ihc_df = ihc_data.parse("Sheet1")

# Patient-Polyp ID oluştur (patient-polyp, patient-normal, patient-carcinom dahil)
ihc_df["patient_polyp"] = ihc_df["patient"] + "-" + ihc_df["polyp"].fillna("")
ihc_df["patient_normal"] = ihc_df["patient"] + "-normal"
ihc_df["patient_carcinom"] = ihc_df["patient"] + "-carcinom"

# Patho görüntülerini listele
patho_images = glob.glob(os.path.join(patho_image_dir, "*.tiff"))

# Görsel dosyalarından patient-polyp, patient-normal, patient-carcinom ID çıkar
patho_data = []
for img_path in patho_images:
    filename = os.path.basename(img_path)
    parts = filename.split("-")

    if len(parts) >= 3:
        patient_id = parts[1]
        polyp_id = parts[2] if "polyp" in parts[2] else None
        status = "normal" if "normal" in filename else "carcinom" if "carcinom" in filename else None

        if polyp_id:
            patient_polyp = patient_id + "-" + polyp_id
            patho_data.append({"patient_polyp": patient_polyp, "image_path": img_path})
        elif status:
            patient_status = patient_id + "-" + status
            patho_data.append({"patient_polyp": patient_status, "image_path": img_path})

patho_df = pd.DataFrame(patho_data)

# IHC verileriyle birleştirme
merged_df = ihc_df.merge(patho_df, on="patient_polyp", how="inner")

# ===============================
# 📌 3. NaN Temizleme ve Normalizasyon
# ===============================

# Hedef değişkenler (IHC biyobelirteçleri)
target_columns = ["Ki-67(clone30-9)", "VEGF(clone SP125)", "p53(clonebp53-11)"]

# NaN değerleri kontrol et
print("NaN Değerler:")
print(merged_df[target_columns].isnull().sum())

# NaN olan değerleri median ile doldur
merged_df[target_columns] = merged_df[target_columns].fillna(merged_df[target_columns].median())

# Min-Max Scaling ile 0-1 arasına getirme
scaler = MinMaxScaler()
merged_df[target_columns] = scaler.fit_transform(merged_df[target_columns])

# ===============================
# 📌 4. Görselleri İşleme (224x224)
# ===============================

# ViT Feature Extractor
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

# Veri Seti
class PathoDataset(Dataset):
    def __init__(self, dataframe, feature_extractor, target_columns):
        self.dataframe = dataframe
        self.feature_extractor = feature_extractor
        self.target_columns = target_columns

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image_path = row["image_path"]
        image = Image.open(image_path).convert("RGB")

        # Görseli işle
        inputs = self.feature_extractor(images=image, return_tensors="pt")
        pixel_values = inputs["pixel_values"].squeeze(0)

        # Hedef değişkenleri al
        targets = row[self.target_columns].values.astype(np.float32)

        return pixel_values, torch.tensor(targets)

# Eğitim & Test Ayrımı
train_df, test_df = train_test_split(merged_df, test_size=0.2, random_state=42)

# Veri setleri ve DataLoader
train_dataset = PathoDataset(train_df, feature_extractor, target_columns)
test_dataset = PathoDataset(test_df, feature_extractor, target_columns)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# ===============================
# 📌 5. ViT Modelini Tanımlama
# ===============================

class ViTRegressor(nn.Module):
    def __init__(self, num_outputs):
        super(ViTRegressor, self).__init__()
        self.vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
        self.regressor = nn.Linear(self.vit.config.hidden_size, num_outputs)

    def forward(self, pixel_values):
        outputs = self.vit(pixel_values=pixel_values)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # CLS token'ı
        return self.regressor(cls_embedding)

# Modeli oluştur
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ViTRegressor(num_outputs=len(target_columns)).to(device)

# ===============================
# 📌 6. Modeli Eğitme
# ===============================

# Küçük Learning Rate Kullan
optimizer = optim.AdamW(model.parameters(), lr=1e-5)  # Daha düşük learning rate

# Kayıp fonksiyonu
criterion = nn.MSELoss()

# Eğitim döngüsü
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for pixel_values, targets in train_loader:
        pixel_values, targets = pixel_values.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(pixel_values)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")

# ===============================
# 📌 7. Modeli Test Etme
# ===============================

model.eval()
test_loss = 0

with torch.no_grad():
    for pixel_values, targets in test_loader:
        pixel_values, targets = pixel_values.to(device), targets.to(device)
        outputs = model(pixel_values)
        loss = criterion(outputs, targets)
        test_loss += loss.item()

print(f"Test Loss: {test_loss/len(test_loader):.4f}")

# ===============================
# 📌 8. Modelin Tahmin Yapması
# ===============================

def predict(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = feature_extractor(images=image, return_tensors="pt").to(device)
    model.eval()
    with torch.no_grad():
        output = model(inputs["pixel_values"])
    return output.cpu().numpy()

# Örnek bir görsel için tahmin
sample_image = test_df["image_path"].iloc[0]
predictions = predict(sample_image)
print(f"Tahmin Edilen IHC Değerleri: {predictions}")


NaN Değerler:
Ki-67(clone30-9)     7
VEGF(clone SP125)    5
p53(clonebp53-11)    5
dtype: int64


NameError: name 'MinMaxScaler' is not defined

In [None]:
import os
import glob

# 📌 Dizin yolları
patho_image_dir = "/content/Kaggle/10x/10x"

# 📌 Desteklenen tüm görüntü uzantıları
extensions = ['*.tiff', '*.jpg', '*.jpeg', '*.png']

# 📌 Kaç tane dosya olduğunu say
patho_images = []

for ext in extensions:
    patho_images.extend(glob.glob(os.path.join(patho_image_dir, '**', ext), recursive=True))

print(f"Patoloji görüntüleri: {len(patho_images)} adet")




Patoloji görüntüleri: 400 adet


In [None]:
df_patho = pd.DataFrame({"patho_image": [os.path.basename(img) for img in patho_images]})
print(df_patho.head())  # İlk birkaç satırı göster
print(df_patho.columns)  # Sütun isimlerini göster


                                         patho_image
0  patho-patient45-polyp1-sigmoid-neoplastic-tubu...
1  patho-patient37-normal-ascending-normal-normal...
2  patho-patient164-carcinom-sigmoid-carcinom-car...
3  patho-patient125-polyp1-transverse-nonneoplast...
4  patho-patient27-polyp1-transverse-nonneoplasti...
Index(['patho_image'], dtype='object')


In [None]:
import pandas as pd

# 📌 Kolon ve patoloji görüntülerini topla
patho_images = []

for ext in extensions:
    patho_images.extend(glob.glob(os.path.join(patho_image_dir, '**', ext), recursive=True))

# 📌 Sadece dosya adlarını içeren DataFrame oluştur
df_patho = pd.DataFrame({"patho_image": [os.path.basename(img) for img in patho_images]})

# 📌 Kaggle'dan IHC Excel verisini yükle
ihc_data = pd.read_excel('/content/Kaggle/ihc_data.xlsx')

# 📌 Patient-Polyp ve Patient-Normal Kimliklerini Çıkart
def extract_patient_id(filename):
    parts = filename.split("-")
    if "normal" in parts:
        return parts[1] + "-normal"
    else:
        return "-".join(parts[1:3])  # "patientX-polypY" formatında

df_patho['patient_id'] = df_patho['patho_image'].apply(extract_patient_id)

# 📌 IHC dosyasında patient_polyp_id veya normal ID oluştur
ihc_data['patient_id'] = ihc_data.apply(lambda row: f"{row['patient']}-normal" if pd.isna(row['polyp']) else f"{row['patient']}-{row['polyp']}", axis=1)

# 📌 Patoloji görüntüleriyle IHC verilerini birleştir
merged_patho = pd.merge(df_patho, ihc_data, on="patient_id", how="left")

# 📌 Sonuçları göster
print("\nPatoloji Görselleri ve IHC Verileri:")
print(merged_patho.head())


Patoloji Görselleri ve IHC Verileri:
                                         patho_image           patient_id  \
0  patho-patient45-polyp1-sigmoid-neoplastic-tubu...     patient45-polyp1   
1  patho-patient37-normal-ascending-normal-normal...     patient37-normal   
2  patho-patient164-carcinom-sigmoid-carcinom-car...  patient164-carcinom   
3  patho-patient125-polyp1-transverse-nonneoplast...    patient125-polyp1   
4  patho-patient27-polyp1-transverse-nonneoplasti...     patient27-polyp1   

      ID  colon     patient     polyp gender   age    location           type  \
0   45.0  colon   patient45    polyp1      F  63.0     sigmoid     neoplastic   
1    NaN    NaN         NaN       NaN    NaN   NaN         NaN            NaN   
2  164.0  colon  patient164  carcinom      M  83.0      rectum       carcinom   
3  125.0  colon  patient125    polyp1      F  63.0  transverse  nonneoplastic   
4   27.0  colon   patient27    polyp1      M  63.0  transverse  nonneoplastic   

        subt

In [None]:
df_patho


Unnamed: 0,patho_image,patient_id
0,patho-patient45-polyp1-sigmoid-neoplastic-tubu...,patient45-polyp1
1,patho-patient37-normal-ascending-normal-normal...,patient37-normal
2,patho-patient164-carcinom-sigmoid-carcinom-car...,patient164-carcinom
3,patho-patient125-polyp1-transverse-nonneoplast...,patient125-polyp1
4,patho-patient27-polyp1-transverse-nonneoplasti...,patient27-polyp1
...,...,...
395,patho-patient189-polyp3-descending-nonneoplast...,patient189-polyp3
396,patho-patient35-polyp2-descending-nonneoplasti...,patient35-polyp2
397,patho-patient5-polyp2-ascending-nonneoplastic-...,patient5-polyp2
398,patho-patient178-polyp1-hepaticflexura-neoplas...,patient178-polyp1


In [None]:
import pandas as pd

# Öncelikle her iki DataFrame'i yükleyelim.

# df_patho DataFrame'ini yüklemeye yönelik kod parçanız.
# İçeriği düzenleyip, daha kapsamlı bir patient_id çıkarma işlevi ekleyelim.
def extract_patient_id(filename):
    parts = filename.split("-")[1:]  # "patho" kısmını atlayıp, geri kalanı al
    if "normal" in parts:
        return f"{parts[0]}-normal"
    elif "carcinom" in parts:
        return f"{parts[0]}-carcinom"
    else:
        # Genel durum: "patientX-polypY"
        return "-".join(parts[:2])  # Sadece ilk iki parçayı al


df_patho['patient_id'] = df_patho['patho_image'].apply(extract_patient_id)

# Kaggle'dan IHC Excel verisini yükleyelim
ihc_data = pd.read_excel('/content/Kaggle/ihc_data.xlsx')

# IHC verilerinde patient_polyp_id veya normal ID oluşturalım
ihc_data['patient_id'] = ihc_data.apply(
    lambda row: f"{row['patient']}-normal" if pd.isna(row['polyp']) else f"{row['patient']}-{row['polyp']}", axis=1)

# df_patho ve ihc_data DataFrame'lerini 'patient_id' üzerinden birleştirelim
merged_df = pd.merge(df_patho, ihc_data, on="patient_id", how="left")

# Sonuçların birkaçını gösterelim
print(merged_df.head())


                                         patho_image           patient_id  \
0  patho-patient45-polyp1-sigmoid-neoplastic-tubu...     patient45-polyp1   
1  patho-patient37-normal-ascending-normal-normal...     patient37-normal   
2  patho-patient164-carcinom-sigmoid-carcinom-car...  patient164-carcinom   
3  patho-patient125-polyp1-transverse-nonneoplast...    patient125-polyp1   
4  patho-patient27-polyp1-transverse-nonneoplasti...     patient27-polyp1   

      ID  colon     patient     polyp gender   age    location           type  \
0   45.0  colon   patient45    polyp1      F  63.0     sigmoid     neoplastic   
1    NaN    NaN         NaN       NaN    NaN   NaN         NaN            NaN   
2  164.0  colon  patient164  carcinom      M  83.0      rectum       carcinom   
3  125.0  colon  patient125    polyp1      F  63.0  transverse  nonneoplastic   
4   27.0  colon   patient27    polyp1      M  63.0  transverse  nonneoplastic   

        subtype  Ki-67(clone30-9) BRAF(cloneV600E)

In [None]:
# patho_images_df için 'patient_id' oluşturma
def extract_patient_id_from_patho_image(filename):
    parts = filename.split("-")
    # Hastanın ve polip bilgisinin çıkarılması
    if "normal" in parts:
        patient_id = f"{parts[1]}-normal"
    else:
        patient_part = parts[1]
        polyp_part = parts[2] if "polyp" in parts[2] else "unknown"
        patient_id = f"{patient_part}-{polyp_part}"
    return patient_id

df_patho['patient_id'] = df_patho['patho_image'].apply(extract_patient_id_from_patho_image)

# ihc_data_df için 'patient_id' oluşturma, birden fazla polip durumunu ele alacak şekilde
ihc_data['patient_id'] = ihc_data.apply(
    lambda row: f"{row['patient']}-{row['polyp']}" if pd.notna(row['polyp']) else f"{row['patient']}-normal",
    axis=1
)

# Yeniden birleştirme işlemi
revised_merged_df = pd.merge(df_patho, ihc_data, on="patient_id", how="inner")

# Yeniden birleştirilmiş DataFrame boyutu ve içeriği
revised_merged_length = len(revised_merged_df)
revised_merged_df.head(), revised_merged_length


(                                         patho_image         patient_id   ID  \
 0  patho-patient45-polyp1-sigmoid-neoplastic-tubu...   patient45-polyp1   45   
 1  patho-patient125-polyp1-transverse-nonneoplast...  patient125-polyp1  125   
 2  patho-patient27-polyp1-transverse-nonneoplasti...   patient27-polyp1   27   
 3  patho-patient90-polyp2-descending-neoplastic-t...   patient90-polyp2   90   
 4  patho-patient156-polyp1-sigmoid-neoplastic-tub...  patient156-polyp1  156   
 
    colon     patient   polyp gender   age    location           type  \
 0  colon   patient45  polyp1      F  63.0     sigmoid     neoplastic   
 1  colon  patient125  polyp1      F  63.0  transverse  nonneoplastic   
 2  colon   patient27  polyp1      M  63.0  transverse  nonneoplastic   
 3  colon   patient90  polyp2      M  61.0  descending     neoplastic   
 4  colon  patient156  polyp1      M  50.0  descending     neoplastic   
 
         subtype  Ki-67(clone30-9) BRAF(cloneV600E)  \
 0       tubular 

In [None]:
from IPython.display import display
print("\nPatoloji Görselleri ve IHC Verileri:")
display(merged_df)

# Eksik değerlerin sayısını ve oranını hesaplayalım
missing_values = merged_df.isnull().sum()
missing_values_percentage = (missing_values / len(merged_df)) * 100

# Eksik değerler hakkında özet bir tablo oluşturma
missing_summary = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage': missing_values_percentage
})

# Sadece eksik değeri olan sütunları filtreleme
missing_summary = missing_summary[missing_summary['Missing Values'] > 0]

print(missing_summary)


Patoloji Görselleri ve IHC Verileri:


Unnamed: 0,patho_image,patient_id,ID,colon,patient,polyp,gender,age,location,type,subtype,Ki-67(clone30-9),BRAF(cloneV600E),PD-L1epithelium(clone SP142),PD-L1lymphocyte(clone SP142),VEGF(clone SP125),CD34(cloneQBend/10),CD34(cloneQBend/10)skor,p53(clonebp53-11)
0,patho-patient45-polyp1-sigmoid-neoplastic-tubu...,patient45-polyp1,45.0,colon,patient45,polyp1,F,63.0,sigmoid,neoplastic,tubular,30.0,negative,negative,weak,40.0,30.0,1.0,15.0
1,patho-patient37-normal-ascending-normal-normal...,patient37-normal,,,,,,,,,,,,,,,,,
2,patho-patient164-carcinom-sigmoid-carcinom-car...,patient164-carcinom,164.0,colon,patient164,carcinom,M,83.0,rectum,carcinom,carcinom,90.0,weak,negative,weak,90.0,64.0,3.0,95.0
3,patho-patient125-polyp1-transverse-nonneoplast...,patient125-polyp1,125.0,colon,patient125,polyp1,F,63.0,transverse,nonneoplastic,inflamatuar,40.0,negative,weak,weak,10.0,55.0,2.0,15.0
4,patho-patient27-polyp1-transverse-nonneoplasti...,patient27-polyp1,27.0,colon,patient27,polyp1,M,63.0,transverse,nonneoplastic,hyperplastic,35.0,negative,negative,weak,40.0,34.0,2.0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,patho-patient189-polyp3-descending-nonneoplast...,patient189-polyp3,189.0,colon,patient189,polyp3,M,,descending,nonneoplastic,hyperplastic,40.0,negative,weak,weak,10.0,46.0,2.0,10.0
396,patho-patient35-polyp2-descending-nonneoplasti...,patient35-polyp2,35.0,colon,patient35,polyp2,F,62.0,descending,nonneoplastic,hyperplastic,40.0,negative,negative,weak,20.0,44.0,2.0,25.0
397,patho-patient5-polyp2-ascending-nonneoplastic-...,patient5-polyp2,5.0,colon,patient5,polyp2,F,65.0,ascending,nonneoplastic,hyperplastic,10.0,negative,negative,negative,0.0,34.0,1.0,5.0
398,patho-patient178-polyp1-hepaticflexura-neoplas...,patient178-polyp1,178.0,colon,patient178,polyp1,M,66.0,transverse,neoplastic,tubulovillous,70.0,negative,negative,weak,50.0,53.0,2.0,30.0


                              Missing Values  Percentage
ID                                        87       21.75
colon                                     87       21.75
patient                                   87       21.75
polyp                                     87       21.75
gender                                    87       21.75
age                                      143       35.75
location                                  87       21.75
type                                      87       21.75
subtype                                   87       21.75
Ki-67(clone30-9)                          94       23.50
BRAF(cloneV600E)                          92       23.00
PD-L1epithelium(clone SP142)              92       23.00
PD-L1lymphocyte(clone SP142)              92       23.00
VEGF(clone SP125)                         92       23.00
CD34(cloneQBend/10)                       92       23.00
CD34(cloneQBend/10)skor                   92       23.00
p53(clonebp53-11)              

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Eksik değerleri tahmin etmek için type ve subtype değişkenlerini dummy değişkenlere çevirelim
merged_df_dummies = pd.get_dummies(merged_patho[['type']], drop_first=True)

# Eksik değerlerin olduğu sütunları belirleyelim
markers = [
    'Ki-67(clone30-9)', 'BRAF(cloneV600E)',
    'PD-L1epithelium(clone SP142)', 'PD-L1lymphocyte(clone SP142)',
    'VEGF(clone SP125)', 'CD34(cloneQBend/10)',
    'CD34(cloneQBend/10)skor', 'p53(clonebp53-11)']

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

merged_df=merged_patho
# Veri çerçevesini dummy değişkenlerle genişletme
df_dummies = pd.get_dummies(merged_df[['type']], drop_first=True)

# Korelasyon analizi (Pearson)
for marker in markers:
    # Sadece sayısal marker değerlerini kullanıyoruz
    if merged_df[marker].dtype in ['float64', 'int64']:
        for dummy_col in df_dummies.columns:
            try:
                correlation, p_value = pearsonr(merged_df[marker].dropna(), df_dummies[dummy_col].loc[merged_df[marker].dropna().index])
                print(f"{marker} ve {dummy_col} için Pearson korelasyon katsayısı: {correlation:.3f}, p-değeri: {p_value:.3f}")

                # Scatter plot ve trendline
                valid_indices = merged_df[marker].dropna().index.intersection(df_dummies[dummy_col].dropna().index)
                sns.lmplot(x=marker, y=dummy_col, data=merged_df.loc[valid_indices], aspect=2)
                plt.title(f"{marker} ile {dummy_col} arasındaki ilişki")
                plt.show()
            except KeyError:
                print(f"'{dummy_col}' dummy değişkeni mevcut değil, atlanıyor.")

Ki-67(clone30-9) ve type_neoplastic için Pearson korelasyon katsayısı: 0.419, p-değeri: 0.000
'type_neoplastic' dummy değişkeni mevcut değil, atlanıyor.
Ki-67(clone30-9) ve type_nonneoplastic için Pearson korelasyon katsayısı: -0.568, p-değeri: 0.000
'type_nonneoplastic' dummy değişkeni mevcut değil, atlanıyor.
Ki-67(clone30-9) ve type_normal için Pearson korelasyon katsayısı: -0.124, p-değeri: 0.031
'type_normal' dummy değişkeni mevcut değil, atlanıyor.
VEGF(clone SP125) ve type_neoplastic için Pearson korelasyon katsayısı: 0.154, p-değeri: 0.007
'type_neoplastic' dummy değişkeni mevcut değil, atlanıyor.
VEGF(clone SP125) ve type_nonneoplastic için Pearson korelasyon katsayısı: -0.237, p-değeri: 0.000
'type_nonneoplastic' dummy değişkeni mevcut değil, atlanıyor.
VEGF(clone SP125) ve type_normal için Pearson korelasyon katsayısı: -0.109, p-değeri: 0.056
'type_normal' dummy değişkeni mevcut değil, atlanıyor.
CD34(cloneQBend/10) ve type_neoplastic için Pearson korelasyon katsayısı: -0.03

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Sayısal ve kategorik markerları belirleme
numeric_markers = ['Ki-67(clone30-9)', 'VEGF(clone SP125)', 'CD34(cloneQBend/10)', 'CD34(cloneQBend/10)skor', 'p53(clonebp53-11)']
categorical_markers = ['BRAF(cloneV600E)', 'PD-L1epithelium(clone SP142)', 'PD-L1lymphocyte(clone SP142)']

# Tüm mümkün 'type' kategorileri için dummy değişkenler oluşturma
all_types_dummies = pd.get_dummies(merged_df['type'], drop_first=True)

for marker in numeric_markers:
    non_missing_data = merged_df.dropna(subset=[marker])
    # Eğitim verisindeki 'type' kategorileri için dummies oluştur
    X = all_types_dummies.loc[non_missing_data.index]
    y = non_missing_data[marker]

    model = LinearRegression()
    model.fit(X, y)

    # Eksik verileri tahmin etme
    missing_data = merged_df[merged_df[marker].isnull()]
    if not missing_data.empty:
        X_missing = all_types_dummies.loc[missing_data.index]
        merged_df.loc[merged_df[marker].isnull(), marker] = model.predict(X_missing)

# Kategorik veriler için eksik veri doldurma (Mod)
for marker in categorical_markers:
    most_frequent_value = merged_df[marker].mode()[0]
    merged_df[marker] = merged_df[marker].fillna(most_frequent_value)

# Eksik veri kontrolü
missing_values_after = merged_df.isnull().sum()

# Sadece eksik değeri olan sütunları gösterme
missing_summary_after = missing_values_after[missing_values_after > 0]

if missing_summary_after.empty:
    print("Tüm eksik veriler başarıyla doldurulmuştur.")
else:
    print("Aşağıdaki sütunlarda hala eksik veriler mevcut:")
    print(missing_summary_after)


Aşağıdaki sütunlarda hala eksik veriler mevcut:
ID           87
colon        87
patient      87
polyp        87
gender       87
age         143
location     87
type         87
subtype      87
dtype: int64


In [None]:
from transformers import ViTForImageClassification
from transformers import ViTFeatureExtractor
import torch
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import os

In [None]:
class PathologyImagesDataset(torch.utils.data.Dataset):
    def __init__(self, img_paths, labels, feature_extractor):
        self.img_paths = img_paths
        self.labels = labels
        self.feature_extractor = feature_extractor

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        image = Image.open(self.img_paths[idx]).convert("RGB")
        image = self.feature_extractor(images=image, return_tensors="pt").pixel_values.squeeze()
        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return image, label

# Özellik çıkarıcı yükleme
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')

# Veri yükleyicilerinizi (dataloaders) tanımlayın
# Bu örnekte, 'img_paths' ve 'labels' uygun şekilde tanımlanmalıdır.
train_dataset = PathologyImagesDataset(img_paths=train_img_paths, labels=train_labels, feature_extractor=feature_extractor)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



NameError: name 'train_img_paths' is not defined

In [None]:
# ViT modelini özelleştirilmiş sınıf sayısıyla yükleyin
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224',
                                                  num_labels=len(unique_ihc_labels))

# Eğitim döngüsü - Bu örnek, eğitim sürecini basitleştirir
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
loss_fn = torch.nn.BCEWithLogitsLoss()  # Çoklu etiket görevi için uygun kayıp fonksiyonu

def train(model, data_loader, loss_fn, optimizer):
    model.train()
    for images, labels in data_loader:
        outputs = model(images).logits
        loss = loss_fn(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"Loss: {loss.item()}")

# Modeli Eğitin
train(model, train_loader, loss_fn, optimizer)


NameError: name 'unique_ihc_labels' is not defined

In [None]:
import glob
import os

# Dizin yolu
patho_image_dir = "/content/Kaggle/10x/10x"

# Desteklenen görüntü uzantıları
extensions = ['*.tiff', '*.jpg', '*.jpeg', '*.png']

# Görüntü dosyalarını toplama
patho_images = []
for ext in extensions:
    patho_images.extend(glob.glob(os.path.join(patho_image_dir, '**', ext), recursive=True))

print(f"Toplam patoloji görüntü sayısı: {len(patho_images)}")


Toplam patoloji görüntü sayısı: 400


In [None]:
import pandas as pd

# IHC veri dosyasını yükleme
ihc_data_df = pd.read_excel('/content/Kaggle/ihc_data.xlsx')

# İlk birkaç satırı göstererek veri yapısnı kontrol et
print(ihc_data_df.head())


   ID  colon   patient   polyp gender   age    location           type  \
0   1  colon  patient1  polyp1      M  53.0  descending  nonneoplastic   
1   1  colon  patient1  polyp2      M  53.0  descending  nonneoplastic   
2   1  colon  patient1  polyp3      M  53.0  descending  nonneoplastic   
3   2  colon  patient2  polyp1      M  49.0   ascending  nonneoplastic   
4   2  colon  patient2  polyp2      M  49.0  descending     neoplastic   

        subtype  Ki-67(clone30-9) BRAF(cloneV600E)  \
0  hyperplastic              50.0         negative   
1  hyperplastic              40.0         negative   
2  hyperplastic              50.0         negative   
3  hyperplastic              50.0         negative   
4       villous              60.0         negative   

  PD-L1epithelium(clone SP142) PD-L1lymphocyte(clone SP142)  \
0                     negative                     negative   
1                     negative                     negative   
2                     negative           

In [None]:
# Görüntü isimlerinden gerekli bilgileri ayıklama
def extract_info_from_filename(filename):
    parts = filename.split('-')
    patient_polyp = '-'.join(parts[1:3])  # Örneğin 'patient1-polyp1'
    return patient_polyp

# Dosya yollarından bilgi çıkarma
patho_info = [extract_info_from_filename(os.path.basename(path)) for path in patho_images]

# IHC verilerini 'patient_polyp' sütunu ile genişletme
ihc_data_df['patient_polyp'] = ihc_data_df['patient'].astype(str) + '-' + ihc_data_df['polyp'].astype(str)

# Verileri eşleştirme
merged_df = pd.DataFrame(patho_images, columns=['image_path'])
merged_df['patient_polyp'] = [extract_info_from_filename(os.path.basename(path)) for path in patho_images]
merged_df = merged_df.merge(ihc_data_df, on='patient_polyp', how='inner')

print("Eşleştirilmiş veri setinin boyutu:", merged_df.shape)
print(merged_df.head())


Eşleştirilmiş veri setinin boyutu: (316, 19)
                                          image_path        patient_polyp  \
0  /content/Kaggle/10x/10x/patho-patient45-polyp1...     patient45-polyp1   
1  /content/Kaggle/10x/10x/patho-patient164-carci...  patient164-carcinom   
2  /content/Kaggle/10x/10x/patho-patient125-polyp...    patient125-polyp1   
3  /content/Kaggle/10x/10x/patho-patient27-polyp1...     patient27-polyp1   
4  /content/Kaggle/10x/10x/patho-patient90-polyp2...     patient90-polyp2   

    ID  colon     patient     polyp gender   age    location           type  \
0   45  colon   patient45    polyp1      F  63.0     sigmoid     neoplastic   
1  164  colon  patient164  carcinom      M  83.0      rectum       carcinom   
2  125  colon  patient125    polyp1      F  63.0  transverse  nonneoplastic   
3   27  colon   patient27    polyp1      M  63.0  transverse  nonneoplastic   
4   90  colon   patient90    polyp2      M  61.0  descending     neoplastic   

        subtype  

In [None]:
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import torch

class PathologyImagesDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Görüntü yolu
        img_path = self.dataframe.iloc[idx]['image_path']
        image = Image.open(img_path).convert('RGB')

        # Eğer bir dönüşüm varsa uygula
        if self.transform:
            image = self.transform(image)

        # Etiketler (IHC test sonuçları)
        labels = self.dataframe.iloc[idx][3:].values.astype(float)  # İlk 3 sütun dışındaki tüm sütunları al
        labels = torch.tensor(labels, dtype=torch.float32)

        return image, labels

# Görüntüleri dönüştürmek için transformasyonlar
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Veri seti ve veri yükleyici oluşturma
dataset = PathologyImagesDataset(merged_df, transform=transform)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


In [None]:
from transformers import ViTForImageClassification
import torch.optim as optim

# Modeli yükleme ve ayarlama
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224', num_labels=merged_df.shape[1] - 3)

# Optimizer ve kayıp fonksiyonu
optimizer = optim.Adam(model.parameters(), lr=1e-4)
loss_fn = torch.nn.BCEWithLogitsLoss()

def train_model(model, dataloader, optimizer, loss_fn, epochs=10):
    model.train()
    for epoch in range(epochs):
        for images, labels in dataloader:
            outputs = model(images).logits
            loss = loss_fn(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Model eğitimi
train_model(model, dataloader, optimizer, loss_fn)


config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

RuntimeError: Error(s) in loading state_dict for ViTForImageClassification:
	size mismatch for classifier.weight: copying a param with shape torch.Size([1000, 768]) from checkpoint, the shape in current model is torch.Size([16, 768]).
	size mismatch for classifier.bias: copying a param with shape torch.Size([1000]) from checkpoint, the shape in current model is torch.Size([16]).
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.