<a href="https://colab.research.google.com/github/bsenursahin/UYB455YAPAYZEKA/blob/main/BERTFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#K√ºt√ºphaneleri kuruyoruz.

!pip install transformers accelerate datasets scikit-learn openpyxl -q

In [None]:
#BERT MODELƒ∞

import os
import gc
import warnings
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from datasets import Dataset

# Uyarƒ±larƒ± kapat
warnings.filterwarnings('ignore')
os.environ["WANDB_DISABLED"] = "true"


torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

class Config:
    """Training Configuration - A100 OPTIMIZED & EXCEL FIX"""

    FILE_PATH = "/content/temiz_veri_seti.xlsx" # Excel dosyan

    MODEL_NAME = "dbmdz/bert-base-turkish-cased"
    OUTPUT_DIR = "./turkish_sentiment_model"

    TEXT_COLUMN = "Yorum"
    LABEL_COLUMN = "Duygu"

    # --- A100 PERFORMANS AYARLARI ---
    MAX_LENGTH = 128
    TRAIN_BATCH_SIZE = 256
    EVAL_BATCH_SIZE = 512
    NUM_EPOCHS = 3
    LEARNING_RATE = 2e-5
    WARMUP_RATIO = 0.1
    WEIGHT_DECAY = 0.01
    TEST_SIZE = 0.10
    VAL_SIZE = 0.10
    RANDOM_STATE = 42
    GRADIENT_ACCUMULATION_STEPS = 1
    FP16 = True
    GRADIENT_CHECKPOINTING = False
    MAX_SAMPLES = None
    NUM_WORKERS = 8


def load_and_prepare_data(config):
    """Load and preprocess dataset (Excel & CSV Support)"""

    print("Data Loading")
    print("-" * 70)

    try:
        print(f"\nLoading: {config.FILE_PATH}")

        # --- D√úZELTME BURADA: Dosya uzantƒ±sƒ±na g√∂re okuma ---
        if config.FILE_PATH.endswith('.xlsx') or config.FILE_PATH.endswith('.xls'):
            print("Excel formatƒ± algƒ±landƒ±, pd.read_excel kullanƒ±lƒ±yor...")
            df = pd.read_excel(config.FILE_PATH)
        else:
            # CSV okuma denemeleri (eskisi gibi)
            print("CSV formatƒ± varsayƒ±lƒ±yor, pd.read_csv kullanƒ±lƒ±yor...")
            try:
                df = pd.read_csv(config.FILE_PATH, sep=';', on_bad_lines='skip', encoding='utf-8')
            except:
                try:
                    df = pd.read_csv(config.FILE_PATH, sep=',', on_bad_lines='skip', encoding='utf-8')
                except:
                    df = pd.read_csv(config.FILE_PATH, sep=';', on_bad_lines='skip', encoding='windows-1254')

        # S√ºtun isim temizliƒüi
        df.columns = df.columns.str.replace('√Ø¬ª¬ø', '', regex=False).str.strip()

        # Bo≈ülarƒ± temizle
        df = df.drop_duplicates(subset=[config.TEXT_COLUMN])
        df = df.dropna(subset=[config.TEXT_COLUMN, config.LABEL_COLUMN])

        print(f"Total Rows: {len(df):,}")

        # √ñrneklem (opsiyonel)
        if config.MAX_SAMPLES and len(df) > config.MAX_SAMPLES:
            df = df.sample(n=config.MAX_SAMPLES, random_state=config.RANDOM_STATE)

        print("\nMapping labels...")
        # Eƒüer etiketler zaten 0,1,2 ise dokunmaz, string ise √ßevirir
        label_map = {
            'Olumsuz': 0, 'Negative': 0, 'negative': 0,
            'Olumlu': 1, 'Positive': 1, 'positive': 1,
            'N√∂tr': 2, 'Neutral': 2, 'neutral': 2, 'Notr': 2
        }

        # Etiket s√ºtunu string ise map'le, deƒüilse olduƒüu gibi bƒ±rak (integer kontrol√º)
        if df[config.LABEL_COLUMN].dtype == 'object':
             df['label'] = df[config.LABEL_COLUMN].astype(str).str.strip().map(label_map)
        else:
             df['label'] = df[config.LABEL_COLUMN]

        # Temizlik
        df = df.dropna(subset=['label'])
        df['label'] = df['label'].astype(int)

        print("Class Distribution:")
        print(df['label'].value_counts().sort_index())

        return df

    except FileNotFoundError:
        raise FileNotFoundError(f"File not found: {config.FILE_PATH}")
    except Exception as e:
        raise Exception(f"Error loading data: {str(e)}")


def split_data(df, config):
    print("\nData Splitting")
    print("-" * 70)
    train_val, test = train_test_split(
        df, test_size=config.TEST_SIZE,
        random_state=config.RANDOM_STATE, stratify=df['label']
    )
    val_size_adjusted = config.VAL_SIZE / (1 - config.TEST_SIZE)
    train, val = train_test_split(
        train_val, test_size=val_size_adjusted,
        random_state=config.RANDOM_STATE, stratify=train_val['label']
    )
    print(f"Train: {len(train):,}")
    print(f"Val:   {len(val):,}")
    print(f"Test:  {len(test):,}")
    return train.reset_index(drop=True), val.reset_index(drop=True), test.reset_index(drop=True)


def create_datasets(train_df, val_df, test_df, tokenizer, config):
    print("\nTokenization")
    print("-" * 70)
    def tokenize_function(examples):
        return tokenizer(
            examples[config.TEXT_COLUMN],
            padding="max_length",
            truncation=True,
            max_length=config.MAX_LENGTH
        )
    train_dataset = Dataset.from_pandas(train_df[[config.TEXT_COLUMN, 'label']])
    val_dataset = Dataset.from_pandas(val_df[[config.TEXT_COLUMN, 'label']])
    test_dataset = Dataset.from_pandas(test_df[[config.TEXT_COLUMN, 'label']])

    train_dataset = train_dataset.map(tokenize_function, batched=True, batch_size=5000)
    val_dataset = val_dataset.map(tokenize_function, batched=True, batch_size=5000)
    test_dataset = test_dataset.map(tokenize_function, batched=True, batch_size=5000)

    train_dataset = train_dataset.remove_columns([config.TEXT_COLUMN])
    val_dataset = val_dataset.remove_columns([config.TEXT_COLUMN])
    test_dataset = test_dataset.remove_columns([config.TEXT_COLUMN])

    train_dataset.set_format("torch")
    val_dataset.set_format("torch")
    test_dataset.set_format("torch")
    gc.collect()
    torch.cuda.empty_cache()
    return train_dataset, val_dataset, test_dataset


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1_macro": f1_score(labels, predictions, average='macro'),
        "precision_macro": precision_score(labels, predictions, average='macro'),
        "recall_macro": recall_score(labels, predictions, average='macro')
    }


def train_model(config):
    print("Turkish BERT Sentiment Analysis Training (A100 Optimized)")
    print("-" * 70)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")

    df = load_and_prepare_data(config)
    train_df, val_df, test_df = split_data(df, config)
    del df
    gc.collect()

    print("\nModel Loading")
    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        config.MODEL_NAME, num_labels=3,
        id2label={0: "Negative", 1: "Positive", 2: "Neutral"},
        label2id={"Negative": 0, "Positive": 1, "Neutral": 2}
    )

    if config.GRADIENT_CHECKPOINTING:
        model.gradient_checkpointing_enable()

    train_dataset, val_dataset, test_dataset = create_datasets(
        train_df, val_df, test_df, tokenizer, config
    )
    del train_df, val_df, test_df
    gc.collect()

    training_args = TrainingArguments(
        output_dir=config.OUTPUT_DIR,
        num_train_epochs=config.NUM_EPOCHS,
        per_device_train_batch_size=config.TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=config.EVAL_BATCH_SIZE,
        gradient_accumulation_steps=config.GRADIENT_ACCUMULATION_STEPS,
        learning_rate=config.LEARNING_RATE,
        warmup_ratio=config.WARMUP_RATIO,
        weight_decay=config.WEIGHT_DECAY,
        logging_dir=f"{config.OUTPUT_DIR}/logs",
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        fp16=config.FP16,
        dataloader_num_workers=config.NUM_WORKERS,
        save_total_limit=1,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    print("\nStarting Training...")
    trainer.train()

    print("\nEvaluation Results:")
    print(trainer.evaluate())
    print("\nTest Results:")
    print(trainer.evaluate(test_dataset))

    print("\nSaving Model...")
    trainer.save_model(config.OUTPUT_DIR)
    tokenizer.save_pretrained(config.OUTPUT_DIR)
    print("Done.")
    return trainer


if __name__ == "__main__":
    config = Config()
    trainer = train_model(config)

Turkish BERT Sentiment Analysis Training (A100 Optimized)
----------------------------------------------------------------------
Device: cuda
GPU: NVIDIA A100-SXM4-40GB
Data Loading
----------------------------------------------------------------------

Loading: /content/temiz_veri_seti.xlsx
Excel formatƒ± algƒ±landƒ±, pd.read_excel kullanƒ±lƒ±yor...
Total Rows: 488,872

Mapping labels...
Class Distribution:
label
0     56275
1    261681
2    170916
Name: count, dtype: int64

Data Splitting
----------------------------------------------------------------------
Train: 391,096
Val:   48,888
Test:  48,888

Model Loading


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Tokenization
----------------------------------------------------------------------


Map:   0%|          | 0/391096 [00:00<?, ? examples/s]

Map:   0%|          | 0/48888 [00:00<?, ? examples/s]

Map:   0%|          | 0/48888 [00:00<?, ? examples/s]


Starting Training...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
1,0.0814,0.080361,0.97257,0.951801,0.957258,0.946686
2,0.067,0.076141,0.974656,0.955309,0.961116,0.949886
3,0.0456,0.080629,0.974963,0.955591,0.963605,0.948276



Evaluation Results:


{'eval_loss': 0.08062899112701416, 'eval_accuracy': 0.9749631811487481, 'eval_f1_macro': 0.9555906910484887, 'eval_precision_macro': 0.9636053968677939, 'eval_recall_macro': 0.9482764580902473, 'eval_runtime': 9.652, 'eval_samples_per_second': 5065.075, 'eval_steps_per_second': 9.946, 'epoch': 3.0}

Test Results:
{'eval_loss': 0.0804213359951973, 'eval_accuracy': 0.9743086237931599, 'eval_f1_macro': 0.9544951448856073, 'eval_precision_macro': 0.9624020504282188, 'eval_recall_macro': 0.9472715588827336, 'eval_runtime': 9.6564, 'eval_samples_per_second': 5062.77, 'eval_steps_per_second': 9.942, 'epoch': 3.0}

Saving Model...
Done.


In [None]:
# 1. Google Drive'ƒ± Baƒüla (ƒ∞zin isteyecek, onayla)
from google.colab import drive
drive.mount('/content/drive')

# 2. Modeli Drive'a 'tez_model_v1' adƒ±yla kopyala
!cp -r ./turkish_sentiment_model /content/drive/MyDrive/tez_model_v1

print("‚úÖ Model Google Drive'a ba≈üarƒ±yla yedeklendi: /content/drive/MyDrive/tez_model_v1")

Mounted at /content/drive
‚úÖ Model Google Drive'a ba≈üarƒ±yla yedeklendi: /content/drive/MyDrive/tez_model_v1


In [None]:
#Kaggle √ºzerindeki veri setini d√ºzenleyelim. (Dataset s√ºtunu kaldƒ±rƒ±lacak, label sayƒ±salla≈ütƒ±rƒ±lacak.)

!pip install datasets pandas openpyxl -q

from datasets import load_dataset
import pandas as pd

DATASET_NAME = "winvoker/turkish-sentiment-analysis-dataset"

print(f"{DATASET_NAME} indiriliyor...")
dataset = load_dataset(DATASET_NAME)

dfs = []
for split in dataset.keys():
    dfs.append(pd.DataFrame(dataset[split]))
df = pd.concat(dfs, ignore_index=True)

print(f"ƒ∞lk S√ºtunlar: {df.columns.tolist()}")

if 'dataset' in df.columns:
    df = df.drop(columns=['dataset'])
    print("'dataset' s√ºtunu silindi.")

df = df.rename(columns={
    "text": "Yorum",
    "label": "Duygu"
})

label_map = {
    'Negative': 0, 'negative': 0, 'Olumsuz': 0,
    'Positive': 1, 'positive': 1, 'Olumlu': 1,
    'Neutral': 2,  'neutral': 2,  'N√∂tr': 2, 'Notr': 2
}

if df['Duygu'].dtype == 'object':
    df['Duygu'] = df['Duygu'].map(label_map)
    print("Etiketler (positive/negative) sayƒ±sal formata (0/1/2) √ßevrildi.")

df = df.dropna(subset=['Duygu', 'Yorum'])
df['Duygu'] = df['Duygu'].astype(int)

print("-" * 30)
print("Son Durum:")
print(df.head())
print("\nEtiket Daƒüƒ±lƒ±mƒ±:\n", df['Duygu'].value_counts())

dosya_adi = "temiz_veri_seti.xlsx"
df.to_excel(dosya_adi, index=False)

print(f"\nTAMAMLANDI! '{dosya_adi}' dosyasƒ±nƒ± sol men√ºden indirip eƒüitimde kullanabilirsin.")

winvoker/turkish-sentiment-analysis-dataset indiriliyor...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.csv:   0%|          | 0.00/76.1M [00:00<?, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/440679 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/48965 [00:00<?, ? examples/s]

ƒ∞lk S√ºtunlar: ['text', 'label', 'dataset']
'dataset' s√ºtunu silindi.
Etiketler (positive/negative) sayƒ±sal formata (0/1/2) √ßevrildi.
------------------------------
Son Durum:
                                               Yorum  Duygu
0  √ºr√ºn√º hepsiburadadan alalƒ± 3 hafta oldu. orjin...      1
1  √ºr√ºnlerden √ßok memnunum, kesinlikle herkese ta...      1
2      hƒ±zlƒ± kargo, temiz alƒ±≈üveri≈ü.te≈üekk√ºr ederim.      1
3               √á√ºnk√º aranan tapƒ±nak bu b√∂lgededir .      2
4  bu telefonu ba≈ülƒ±ca alma nedenlerim ise elimde...      1

Etiket Daƒüƒ±lƒ±mƒ±:
 Duygu
1    262166
2    170917
0     56561
Name: count, dtype: int64

TAMAMLANDI! 'temiz_veri_seti.xlsx' dosyasƒ±nƒ± sol men√ºden indirip eƒüitimde kullanabilirsin.


In [None]:
import pandas as pd

sutun_isimleri = ["ID", "Category", "Review", "Sentiment", "Split", "Label"]

try:
    df = pd.read_csv('/content/HUMIRSentimentDatasets.csv',
                     header=None,          # ƒ∞lk satƒ±rƒ± ba≈ülƒ±k yapma, veri olarak al
                     names=sutun_isimleri, # S√ºtun isimlerini biz veriyoruz
                     sep=None,             # Ayƒ±rƒ±cƒ±yƒ± (virg√ºl m√º tab mƒ±) otomatik algƒ±la
                     engine='python',      # Python motoru hatalara kar≈üƒ± daha esnektir
                     on_bad_lines='skip')  # Hatalƒ± olan o 29. satƒ±rƒ± atla ve devam et

    print("‚úÖ Veri ba≈üarƒ±yla y√ºklendi!")
    print(df.head())
    print(f"\nToplam Okunan Satƒ±r Sayƒ±sƒ±: {len(df)}")

except Exception as e:
    print(f"‚ùå Hata: {e}")

‚úÖ Veri ba≈üarƒ±yla y√ºklendi!
       ID      Category                                             Review  \
0  Ôªø80101  Hotel Review  asla gidilmeyecek bir otel hasta oldukotel tam...   
1   80102  Hotel Review  berbatbalayƒ± i√ßin tercih etmi≈ü olduƒüumuz bir o...   
2   80103  Hotel Review  rexaletodalar felaket hamamb√∂cekleri toz lobby...   
3   80104  Hotel Review  berbat personeliyim otelin mutlaka okuyunmerha...   
4   80105  Hotel Review  sƒ±kƒ±cƒ± a√ß√∂zel sekt√∂r √ßalƒ±≈üanƒ± olduƒüum ve √∂n√ºm√º...   

  Sentiment  Split  Label  
0  Negative  train      1  
1  Negative  train      1  
2  Negative  train      1  
3  Negative  train      1  
4  Negative  train      1  

Toplam Okunan Satƒ±r Sayƒ±sƒ±: 65000


In [None]:
print("--- Veri Seti B√∂l√ºmleri (Split) ---")
print(df['Split'].value_counts())

print("\n--- Duygu Daƒüƒ±lƒ±mƒ± (Sentiment) ---")
print(df['Sentiment'].value_counts())

print("\n--- Etiket Daƒüƒ±lƒ±mƒ± (Label) ---")
print(df['Label'].value_counts())

# Eksik veri kontrol√º (√∂zellikle Review s√ºtununda)
print(f"\nBo≈ü Yorum Sayƒ±sƒ±: {df['Review'].isnull().sum()}")

--- Veri Seti B√∂l√ºmleri (Split) ---
Split
train    32500
test     32500
Name: count, dtype: int64

--- Duygu Daƒüƒ±lƒ±mƒ± (Sentiment) ---
Sentiment
Negative    32500
Positive    32500
Name: count, dtype: int64

--- Etiket Daƒüƒ±lƒ±mƒ± (Label) ---
Label
1    22280
2    10680
3    10680
4    10680
5    10680
Name: count, dtype: int64

Bo≈ü Yorum Sayƒ±sƒ±: 0


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader

# --- 1. DOSYAYI OKUMA (En ba≈ütaki adƒ±m) ---
print("1. Dosya okunuyor...")
sutun_isimleri = ["ID", "Category", "Review", "Sentiment", "Split", "Label"]
try:
    df = pd.read_csv('/content/HUMIRSentimentDatasets.csv',
                     header=None,
                     names=sutun_isimleri,
                     sep=None,
                     engine='python',
                     on_bad_lines='skip')
    print("Dosya ba≈üarƒ±yla y√ºklendi.")
except Exception as e:
    print(f" Dosya okunurken hata: {e}")

# --- 2. VERƒ∞Yƒ∞ HAZIRLAMA ---
print("2. Veriler d√ºzenleniyor...")
# Etiketleri Sayƒ±ya √áevirme
label_map = {'Negative': 0, 'Positive': 1}
df['target'] = df['Sentiment'].map(label_map)

# Train ve Test olarak ayƒ±rma (Split s√ºtununu kullanarak)
train_raw = df[df['Split'] == 'train'].copy()
test_df = df[df['Split'] == 'test'].copy()

# Validation olu≈üturma (Train i√ßinden %10 ayƒ±rƒ±yoruz)
train_df, val_df = train_test_split(train_raw, test_size=0.1, random_state=42, stratify=train_raw['target'])

# Tokenizer'ƒ± y√ºkleme
MODEL_NAME = "dbmdz/bert-base-turkish-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(" Tokenizer hazƒ±r.")

# --- 3. DATASET CLASS ve LOADER ---
print("3. Veri Y√ºkleyiciler olu≈üturuluyor...")

class ReviewDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        review = str(self.df.iloc[index]['Review'])
        target = self.df.iloc[index]['target']

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

# Ayarlar
MAX_LEN = 160
BATCH_SIZE = 16

# Datasetleri Olu≈ütur
train_dataset = ReviewDataset(train_df, tokenizer, MAX_LEN)
val_dataset = ReviewDataset(val_df, tokenizer, MAX_LEN)
test_dataset = ReviewDataset(test_df, tokenizer, MAX_LEN)

# Loaderlarƒ± Olu≈ütur
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

print("‚úÖ HER ≈ûEY HAZIR! Eƒüitime ge√ßebiliriz.")

1. Dosya okunuyor...
‚úÖ Dosya ba≈üarƒ±yla y√ºklendi.
2. Veriler d√ºzenleniyor...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

‚úÖ Tokenizer hazƒ±r.
3. Veri Y√ºkleyiciler olu≈üturuluyor...
‚úÖ HER ≈ûEY HAZIR! Eƒüitime ge√ßebiliriz.


In [None]:
"""
BERT/RoBERTa Fine-Tuning Script - A100 Optimized
 3 sƒ±nƒ±flƒ± modeli 2 sƒ±nƒ±fa uyarlar
 Yeni model olarak kaydeder (tez_model_v2)
 A100 GPU i√ßin optimize edilmi≈ü
 Hata kontrol√º %100
"""

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from google.colab import drive
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW  #  D√úZELTME: transformers'tan deƒüil torch'tan import
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    recall_score,
    precision_score,
    classification_report,
    confusion_matrix
)
import warnings
import json
import os
from datetime import datetime
from tqdm.auto import tqdm

warnings.filterwarnings('ignore')


CONFIG = {
    'ORIGINAL_MODEL_NAME': 'tez_model_v1',    # Drive'daki mevcut model
    'NEW_MODEL_NAME': 'tez_model_v2',         # Yeni model adƒ±

    'NUM_LABELS': 2,                          # Yeni veri setindeki sƒ±nƒ±f sayƒ±sƒ±
    'LABEL_NAMES': ['Negative', 'Positive'],  # Sƒ±nƒ±f isimleri

    'EPOCHS': 3,
    'BATCH_SIZE': None,  # None = otomatik tespit (val_loader'dan alƒ±r)
    'LEARNING_RATE': 2e-5,
    'WARMUP_RATIO': 0.1,
    'MAX_GRAD_NORM': 1.0,
    'WEIGHT_DECAY': 0.01,

    'AUTO_SAVE': True,                        # True = otomatik kaydet
    'SAVE_BEST_ONLY': False,                  # True = sadece en iyi modeli kaydet
}

print("="*70)
print(" BERT/RoBERTa FINE-TUNING BA≈ûLIYOR")
print("="*70)
print(f" Kaynak model    : {CONFIG['ORIGINAL_MODEL_NAME']}")
print(f" Hedef model     : {CONFIG['NEW_MODEL_NAME']}")
print(f" Sƒ±nƒ±f sayƒ±sƒ±    : 3 ‚Üí {CONFIG['NUM_LABELS']}")
print("="*70)

# =============================================================================
# 0. G√úVENLƒ∞K KONTROLLERI
# =============================================================================

# Gerekli deƒüi≈ükenleri kontrol et
required_vars = {
    'train_loader': 'Eƒüitim veri y√ºkleyici',
    'val_loader': 'Validasyon veri y√ºkleyici',
    'train_df': 'Eƒüitim dataframe'
}

missing = []
for var_name, description in required_vars.items():
    if var_name not in dir():
        missing.append(f" {var_name} ({description})")
    else:
        print(f"  ‚úÖ {var_name} bulundu")

if missing:
    print("\n HATA: A≈üaƒüƒ±daki deƒüi≈ükenler bulunamadƒ±:")
    print("\n".join(missing))
    print("\n√á√ñZ√úM: √ñnce veri hazƒ±rlama kodunu √ßalƒ±≈ütƒ±rƒ±n!")
    raise SystemExit(1)

# Batch size'ƒ± otomatik tespit et
if CONFIG['BATCH_SIZE'] is None:
    try:
        CONFIG['BATCH_SIZE'] = train_loader.batch_size
        print(f"  Batch size otomatik tespit edildi: {CONFIG['BATCH_SIZE']}")
    except:
        CONFIG['BATCH_SIZE'] = 16
        print(f"  Batch size tespit edilemedi, varsayƒ±lan: {CONFIG['BATCH_SIZE']}")

print("\n T√ºm kontroller ba≈üarƒ±lƒ±!")


print("\n Google Drive baƒülanƒ±yor...")
try:
    drive.mount('/content/drive', force_remount=False)
    print("Drive baƒülandƒ±")
except Exception as e:
    print(f" Drive baƒülantƒ± uyarƒ±sƒ±: {e}")
    print("   (Zaten baƒülƒ±ysa sorun yok)")

print("\n GPU kontrol ediliyor...")
if not torch.cuda.is_available():
    print(" HATA: GPU bulunamadƒ±!")
    print(" √á√ñZ√úM: Runtime > Change runtime type > T4/A100 GPU se√ßin")
    raise SystemExit(1)

device = torch.device("cuda")
print(f"GPU aktif: {torch.cuda.get_device_name(0)}")
print(f"   VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# A100 i√ßin √∂zel optimizasyonlar
if "A100" in torch.cuda.get_device_name(0):
    print("A100 tespit edildi - TF32 optimizasyonlarƒ± aktif!")
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.benchmark = True

original_path = f"/content/drive/MyDrive/{CONFIG['ORIGINAL_MODEL_NAME']}"
print(f"\n Model y√ºkleniyor: {original_path}")

# Modelin var olduƒüunu kontrol et
if not os.path.exists(original_path):
    print(f" HATA: Model klas√∂r√º bulunamadƒ±!")
    print(f"   Aranan konum: {original_path}")
    print("\n √á√ñZ√úM: Drive'ƒ±nƒ±zda ≈üu klas√∂r var mƒ± kontrol edin:")
    print(f"   MyDrive/{CONFIG['ORIGINAL_MODEL_NAME']}")
    raise SystemExit(1)

try:
    # Tokenizer'ƒ± y√ºkle
    print("  ‚Üí Tokenizer y√ºkleniyor...")
    tokenizer = AutoTokenizer.from_pretrained(original_path)

    # Modeli y√ºkle ve 2 sƒ±nƒ±fa uyarla
    print("  ‚Üí Model y√ºkleniyor ve 2 sƒ±nƒ±fa uyarlanƒ±yor...")
    print("     (3 sƒ±nƒ±flƒ± classifier katmanƒ± ‚Üí 2 sƒ±nƒ±flƒ± olarak yeniden olu≈üturuluyor)")

    model = AutoModelForSequenceClassification.from_pretrained(
        original_path,
        num_labels=CONFIG['NUM_LABELS'],
        ignore_mismatched_sizes=True  # Bu satƒ±r 3‚Üí2 uyarlamasƒ±nƒ± otomatik yapar
    )

    model = model.to(device)

    print("\n Model ba≈üarƒ±yla y√ºklendi ve uyarlandƒ±!")
    print(f"   Model tipi: {model.config.model_type.upper()}")
    print(f"   Sƒ±nƒ±f sayƒ±sƒ±: {model.num_labels}")
    print(f"   Toplam parametre: {sum(p.numel() for p in model.parameters()):,}")
    print(f"   Eƒüitilebilir parametre: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

except Exception as e:
    print(f"\n Model y√ºkleme hatasƒ±!")
    print(f"   Hata: {e}")
    raise SystemExit(1)


def get_predictions(model, data_loader, device):
    """Model tahminlerini ve ger√ßek deƒüerleri d√∂nd√ºr√ºr"""
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Tahmin yapƒ±lƒ±yor", leave=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["targets"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)

            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    return true_labels, predictions


def calculate_metrics(y_true, y_pred):
    """Metrikleri hesapla"""
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0)
    }


def train_epoch(model, data_loader, loss_fn, optimizer, scheduler, device):
    """Bir epoch eƒüitim"""
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    progress_bar = tqdm(data_loader, desc="Eƒüitim", leave=False)

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["targets"].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=CONFIG['MAX_GRAD_NORM'])
        optimizer.step()
        scheduler.step()

        # Metrikleri hesapla
        _, preds = torch.max(outputs.logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
        total_loss += loss.item()

        # Progress bar g√ºncelle
        progress_bar.set_postfix({
            'loss': f"{loss.item():.4f}",
            'acc': f"{correct/total:.4f}"
        })

    return total_loss / len(data_loader), correct / total


def eval_epoch(model, data_loader, loss_fn, device):
    """Validasyon epoch"""
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validasyon", leave=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["targets"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)

            _, preds = torch.max(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            total_loss += loss.item()

    return total_loss / len(data_loader), correct / total

print("\n" + "="*70)
print(" BASELINE PERFORMANS (Eƒüitim √ñncesi)")
print("="*70)
print(" Model performansƒ± √∂l√ß√ºl√ºyor...")

y_true, y_pred_baseline = get_predictions(model, val_loader, device)
baseline_metrics = calculate_metrics(y_true, y_pred_baseline)

print("\n Sonu√ßlar:")
for metric, value in baseline_metrics.items():
    print(f"  {metric.upper():12s}: {value:.4f}")


print("\n" + "="*70)
print("üéì Eƒûƒ∞Tƒ∞M HAZIRLIƒûI")
print("="*70)

# Optimizer
optimizer = AdamW(
    model.parameters(),
    lr=CONFIG['LEARNING_RATE'],
    eps=1e-8,
    weight_decay=CONFIG['WEIGHT_DECAY']
)

# Scheduler
total_steps = len(train_loader) * CONFIG['EPOCHS']
warmup_steps = int(CONFIG['WARMUP_RATIO'] * total_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

# Loss function
loss_fn = nn.CrossEntropyLoss().to(device)

print(f" Optimizer hazƒ±r (AdamW, lr={CONFIG['LEARNING_RATE']})")
print(f" Scheduler hazƒ±r (warmup={warmup_steps} steps)")
print(f" Loss function hazƒ±r (CrossEntropyLoss)")
print(f"\n Eƒüitim detaylarƒ±:")
print(f"  ‚Ä¢ Epoch sayƒ±sƒ±: {CONFIG['EPOCHS']}")
print(f"  ‚Ä¢ Batch size: {CONFIG['BATCH_SIZE']}")
print(f"  ‚Ä¢ Toplam step: {total_steps}")
print(f"  ‚Ä¢ Warmup step: {warmup_steps}")


print("\n" + "="*70)
print(" Eƒûƒ∞Tƒ∞M BA≈ûLIYOR")
print("="*70)

history = {
    'train_loss': [],
    'train_acc': [],
    'val_loss': [],
    'val_acc': []
}

best_val_acc = 0.0

for epoch in range(CONFIG['EPOCHS']):
    print(f"\nüìÖ EPOCH {epoch + 1}/{CONFIG['EPOCHS']}")
    print("-" * 70)

    # Training
    train_loss, train_acc = train_epoch(
        model, train_loader, loss_fn, optimizer, scheduler, device
    )

    # Validation
    val_loss, val_acc = eval_epoch(model, val_loader, loss_fn, device)

    # Kayƒ±t
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

    # Best model tracking
    is_best = val_acc > best_val_acc
    if is_best:
        best_val_acc = val_acc

    # Sonu√ßlarƒ± yazdƒ±r
    print(f"\n{'':20s} Loss      Accuracy")
    print(f"  Train:         {train_loss:8.4f}  {train_acc:8.4f}")
    print(f"  Validation:    {val_loss:8.4f}  {val_acc:8.4f} {'üåü BEST!' if is_best else ''}")

print("\n‚úÖ Eƒüitim tamamlandƒ±!")



print("\n" + "="*70)
print(" Fƒ∞NAL PERFORMANS (Eƒüitim Sonrasƒ±)")
print("="*70)
print(" Model performansƒ± √∂l√ß√ºl√ºyor...")

y_true, y_pred_final = get_predictions(model, val_loader, device)
final_metrics = calculate_metrics(y_true, y_pred_final)

print("\nüìà Sonu√ßlar:")
for metric, value in final_metrics.items():
    print(f"  {metric.upper():12s}: {value:.4f}")



print("\n" + "="*70)
print(" BASELINE vs Fƒ∞NAL KAR≈ûILA≈ûTIRMA")
print("="*70)

comparison = pd.DataFrame({
    'Metrik': ['Accuracy', 'F1 Score', 'Recall', 'Precision'],
    'Baseline': [baseline_metrics[k] for k in ['accuracy', 'f1', 'recall', 'precision']],
    'Final': [final_metrics[k] for k in ['accuracy', 'f1', 'recall', 'precision']]
})

comparison['Fark'] = comparison['Final'] - comparison['Baseline']
comparison['Deƒüi≈üim (%)'] = ((comparison['Final'] / comparison['Baseline']) - 1) * 100

print(comparison.to_string(index=False, formatters={
    'Baseline': '{:.4f}'.format,
    'Final': '{:.4f}'.format,
    'Fark': '{:+.4f}'.format,
    'Deƒüi≈üim (%)': '{:+.2f}%'.format
}))

# Detaylƒ± rapor
print("\n" + "="*70)
print("DETAYLI SINIFLANDIRMA RAPORU")
print("="*70)
print(classification_report(y_true, y_pred_final,
                          target_names=CONFIG['LABEL_NAMES'],
                          digits=4))

# Confusion Matrix
print(" Confusion Matrix:")
cm = confusion_matrix(y_true, y_pred_final)
print(cm)



save_model = CONFIG['AUTO_SAVE']

if not save_model:
    response = input("\nüíæ Modeli kaydetmek ister misiniz? (e/h): ").lower()
    save_model = (response == 'e')

if save_model:
    print("\n" + "="*70)
    print(" MODEL KAYDEDƒ∞Lƒ∞YOR")
    print("="*70)

    save_path = f"/content/drive/MyDrive/{CONFIG['NEW_MODEL_NAME']}"

    # Klas√∂r olu≈ütur
    os.makedirs(save_path, exist_ok=True)

    # Model ve tokenizer'ƒ± kaydet
    print(f" Kaydediliyor: {save_path}")
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    # Config dosyasƒ± kaydet
    config_save = CONFIG.copy()
    config_save['training_date'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    config_save['baseline_metrics'] = baseline_metrics
    config_save['final_metrics'] = final_metrics
    config_save['training_history'] = history
    config_save['gpu_used'] = torch.cuda.get_device_name(0)

    with open(f"{save_path}/training_info.json", 'w', encoding='utf-8') as f:
        json.dump(config_save, f, ensure_ascii=False, indent=2)

    print(" Model kaydedildi!")
    print(f" Eƒüitim bilgileri kaydedildi: training_info.json")

    print("\n Kaydedilen dosyalar:")
    print(f"  ‚Ä¢ Model: {save_path}/pytorch_model.bin")
    print(f"  ‚Ä¢ Config: {save_path}/config.json")
    print(f"  ‚Ä¢ Tokenizer: {save_path}/tokenizer_config.json")
    print(f"  ‚Ä¢ Training Info: {save_path}/training_info.json")
else:
    print("\n‚è≠ Model kaydedilmedi")


print("\n" + "="*70)
print(" ƒ∞≈ûLEM TAMAMLANDI!")
print("="*70)
print(f" Kaynak model: {CONFIG['ORIGINAL_MODEL_NAME']}")
print(f" Yeni model: {CONFIG['NEW_MODEL_NAME']}")
print(f" Sƒ±nƒ±f sayƒ±sƒ±: 3 ‚Üí {CONFIG['NUM_LABELS']}")
print(f" Final Accuracy: {final_metrics['accuracy']:.4f}")
print(f" Geli≈üme: {comparison[comparison['Metrik']=='Accuracy']['Deƒüi≈üim (%)'].values[0]:+.2f}%")

if save_model:
    print(f"\n Model ≈üurada: MyDrive/{CONFIG['NEW_MODEL_NAME']}")
    print(" Bir sonraki eƒüitim i√ßin: 'tez_model_v2' ‚Üí 'tez_model_v3'")

print("\n Tebrikler! Her ≈üey ba≈üarƒ±lƒ±!")
print("="*70)

üöÄ BERT/RoBERTa FINE-TUNING BA≈ûLIYOR
üì• Kaynak model    : tez_model_v1
üì§ Hedef model     : tez_model_v2
üéØ Sƒ±nƒ±f sayƒ±sƒ±    : 3 ‚Üí 2

üîç Sistem kontrolleri yapƒ±lƒ±yor...
  ‚úÖ train_loader bulundu
  ‚úÖ val_loader bulundu
  ‚úÖ train_df bulundu
  ‚úÖ Batch size otomatik tespit edildi: 16

‚úÖ T√ºm kontroller ba≈üarƒ±lƒ±!

üì° Google Drive baƒülanƒ±yor...
Mounted at /content/drive
‚úÖ Drive baƒülandƒ±

üñ•Ô∏è GPU kontrol ediliyor...
‚úÖ GPU aktif: NVIDIA A100-SXM4-40GB
   VRAM: 42.5 GB
üöÄ A100 tespit edildi - TF32 optimizasyonlarƒ± aktif!

üì• Model y√ºkleniyor: /content/drive/MyDrive/tez_model_v1
  ‚Üí Tokenizer y√ºkleniyor...
  ‚Üí Model y√ºkleniyor ve 2 sƒ±nƒ±fa uyarlanƒ±yor...
     (3 sƒ±nƒ±flƒ± classifier katmanƒ± ‚Üí 2 sƒ±nƒ±flƒ± olarak yeniden olu≈üturuluyor)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/tez_model_v1 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



‚úÖ Model ba≈üarƒ±yla y√ºklendi ve uyarlandƒ±!
   Model tipi: BERT
   Sƒ±nƒ±f sayƒ±sƒ±: 2
   Toplam parametre: 110,618,882
   Eƒüitilebilir parametre: 110,618,882

üìä BASELINE PERFORMANS (Eƒüitim √ñncesi)
‚è≥ Model performansƒ± √∂l√ß√ºl√ºyor...


Tahmin yapƒ±lƒ±yor:   0%|          | 0/204 [00:00<?, ?it/s]


üìà Sonu√ßlar:
  ACCURACY    : 0.4868
  F1          : 0.4121
  RECALL      : 0.4868
  PRECISION   : 0.4731

üéì Eƒûƒ∞Tƒ∞M HAZIRLIƒûI
‚úÖ Optimizer hazƒ±r (AdamW, lr=2e-05)
‚úÖ Scheduler hazƒ±r (warmup=548 steps)
‚úÖ Loss function hazƒ±r (CrossEntropyLoss)

üìä Eƒüitim detaylarƒ±:
  ‚Ä¢ Epoch sayƒ±sƒ±: 3
  ‚Ä¢ Batch size: 16
  ‚Ä¢ Toplam step: 5487
  ‚Ä¢ Warmup step: 548

üöÄ Eƒûƒ∞Tƒ∞M BA≈ûLIYOR

üìÖ EPOCH 1/3
----------------------------------------------------------------------


Eƒüitim:   0%|          | 0/1829 [00:00<?, ?it/s]

Validasyon:   0%|          | 0/204 [00:00<?, ?it/s]


                     Loss      Accuracy
  Train:           0.4107    0.8013
  Validation:      0.3861    0.8637 üåü BEST!

üìÖ EPOCH 2/3
----------------------------------------------------------------------


Eƒüitim:   0%|          | 0/1829 [00:00<?, ?it/s]

Validasyon:   0%|          | 0/204 [00:00<?, ?it/s]


                     Loss      Accuracy
  Train:           0.2281    0.9150
  Validation:      0.3035    0.8960 üåü BEST!

üìÖ EPOCH 3/3
----------------------------------------------------------------------


Eƒüitim:   0%|          | 0/1829 [00:00<?, ?it/s]

Validasyon:   0%|          | 0/204 [00:00<?, ?it/s]


                     Loss      Accuracy
  Train:           0.1570    0.9524
  Validation:      0.4005    0.8960 

‚úÖ Eƒüitim tamamlandƒ±!

üìä Fƒ∞NAL PERFORMANS (Eƒüitim Sonrasƒ±)
‚è≥ Model performansƒ± √∂l√ß√ºl√ºyor...


Tahmin yapƒ±lƒ±yor:   0%|          | 0/204 [00:00<?, ?it/s]


üìà Sonu√ßlar:
  ACCURACY    : 0.8960
  F1          : 0.8960
  RECALL      : 0.8960
  PRECISION   : 0.8962

üìä BASELINE vs Fƒ∞NAL KAR≈ûILA≈ûTIRMA
   Metrik Baseline  Final    Fark Deƒüi≈üim (%)
 Accuracy   0.4868 0.8960 +0.4092     +84.07%
 F1 Score   0.4121 0.8960 +0.4839    +117.40%
   Recall   0.4868 0.8960 +0.4092     +84.07%
Precision   0.4731 0.8962 +0.4230     +89.41%

üìã DETAYLI SINIFLANDIRMA RAPORU
              precision    recall  f1-score   support

    Negative     0.9040    0.8862    0.8950      1625
    Positive     0.8884    0.9058    0.8970      1625

    accuracy                         0.8960      3250
   macro avg     0.8962    0.8960    0.8960      3250
weighted avg     0.8962    0.8960    0.8960      3250

üî¢ Confusion Matrix:
[[1440  185]
 [ 153 1472]]

üíæ MODEL KAYDEDƒ∞Lƒ∞YOR
üìÅ Kaydediliyor: /content/drive/MyDrive/tez_model_v2
‚úÖ Model kaydedildi!
‚úÖ Eƒüitim bilgileri kaydedildi: training_info.json

üìÅ Kaydedilen dosyalar:
  ‚Ä¢ Model: /content/

In [None]:
import pandas as pd
import torch
import os

# 1. ADIM: VERƒ∞LERƒ∞ DENGELƒ∞ Bƒ∞R ≈ûEKƒ∞LDE HAZIRLAMA (HATA D√úZELTƒ∞LMƒ∞≈û)
print("Veriler okunuyor...")

# Ana veri setini oku
df_main = pd.read_csv('/content/HUMIRSentimentDatasets.csv',
                      header=None, names=["ID", "Category", "Review", "Sentiment", "Split", "Label"],
                      engine='python', on_bad_lines='skip')

# --- HATA √ñNLEME: Etiketleri temizle ve sayƒ±ya √ßevir ---
# Bazƒ± CSV dosyalarƒ±nda label kƒ±smƒ±nda bo≈üluk kalabiliyor, onlarƒ± temizleyelim
df_main['Label'] = df_main['Label'].astype(str).str.strip()
df_main['Label'] = pd.to_numeric(df_main['Label'], errors='coerce') # Sayƒ±ya √ßevir, hata varsa NaN yap
df_main = df_main.dropna(subset=['Label']) # Hatalƒ± satƒ±rlarƒ± sil
df_main['Label'] = df_main['Label'].astype(int)

# Pozitifi 2 yap (Senin 3 sƒ±nƒ±flƒ± sisteme uyum)
df_main['Label'] = df_main['Label'].replace({1: 2})

print("Ana veri seti daƒüƒ±lƒ±mƒ±:", df_main['Label'].value_counts().to_dict())

# --- K√ú√á√úLTME (DOWNSAMPLING) G√úVENLƒ∞ HALE GETƒ∞Rƒ∞LDƒ∞ ---
# Eƒüer sƒ±nƒ±fta 4000'den az veri varsa hata vermemesi i√ßin min() kullanƒ±yoruz
n_sample = 4000
neg_count = min(len(df_main[df_main['Label'] == 0]), n_sample)
pos_count = min(len(df_main[df_main['Label'] == 2]), n_sample)

df_neg_small = df_main[df_main['Label'] == 0].sample(n=neg_count, random_state=42)
df_pos_small = df_main[df_main['Label'] == 2].sample(n=pos_count, random_state=42)

# Senin 1000 verini oku
df_gold = pd.read_excel('/content/ironi_veri_seti.xlsx')
df_gold = df_gold[['Review', 'target']]
df_gold.columns = ['Review', 'Label']

# Senin verilerini 5 katƒ±na √ßƒ±kararak n√∂trleri g√º√ßlendir
df_gold_upsampled = pd.concat([df_gold] * 5, ignore_index=True)

# Birle≈ütirme
df_final = pd.concat([df_neg_small, df_pos_small, df_gold_upsampled], ignore_index=True)
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"üìä Final Dengeli Daƒüƒ±lƒ±m:\n{df_final['Label'].value_counts()}")

Veriler okunuyor...
Ana veri seti daƒüƒ±lƒ±mƒ±: {}
üìä Final Dengeli Daƒüƒ±lƒ±m:
Label
0    1860
1    1805
2    1335
Name: count, dtype: int64


In [None]:
# --- MODELƒ∞ GER√áEK SINAVINA SOKUYORUZ ---
zor_test_listesi = [
    "√úr√ºn√ºn kutusu gri renkte ve √ºzerinde marka logosu var.", # Saf N√∂tr (Buna pozitif derse hala sƒ±kƒ±ntƒ± var demektir)
    "Kargom bug√ºn teslim edildi.", # Saf N√∂tr
    "Ma≈üallah kurye o kadar yava≈ü ki kaplumbaƒüalar yanƒ±ndan vƒ±z vƒ±z ge√ßiyor.", # Sert ƒ∞roni
    "ƒ∞nanƒ±lmaz bir hizmet, paramƒ±zla rezil olduk.", # √áok Net ƒ∞roni
    "Te≈üekk√ºrler, sayenizde b√ºt√ºn g√ºn bekledim.", # ƒ∞nce ƒ∞roni
    "Cihazƒ±n bataryasƒ± 5000 mAh kapasiteye sahip." # Saf Teknik Bilgi (N√∂tr)
]

print(" DERƒ∞N ANALƒ∞Z BA≈ûLIYOR...\n")
for t in zor_test_listesi:
    print(f"Metin: {t}\nTahmin: {tahmin_et(t)}\n{'-'*30}")

üîç DERƒ∞N ANALƒ∞Z BA≈ûLIYOR...

Metin: √úr√ºn√ºn kutusu gri renkte ve √ºzerinde marka logosu var.
Tahmin: N√ñTR üòê
------------------------------
Metin: Kargom bug√ºn teslim edildi.
Tahmin: N√ñTR üòê
------------------------------
Metin: Ma≈üallah kurye o kadar yava≈ü ki kaplumbaƒüalar yanƒ±ndan vƒ±z vƒ±z ge√ßiyor.
Tahmin: ƒ∞RONƒ∞K / NEGATƒ∞F üò°
------------------------------
Metin: ƒ∞nanƒ±lmaz bir hizmet, paramƒ±zla rezil olduk.
Tahmin: ƒ∞RONƒ∞K / NEGATƒ∞F üò°
------------------------------
Metin: Te≈üekk√ºrler, sayenizde b√ºt√ºn g√ºn bekledim.
Tahmin: ƒ∞RONƒ∞K / NEGATƒ∞F üò°
------------------------------
Metin: Cihazƒ±n bataryasƒ± 5000 mAh kapasiteye sahip.
Tahmin: N√ñTR üòê
------------------------------


In [None]:
# --- MODELƒ∞ TERLETEN EKSTRA ZOR TEST ---
akademik_test_listesi = [
    "Sayƒ±n yetkili, talebiniz ilgili birime iletilmi≈ü olup inceleme s√ºreci devam etmektedir.", # Resmi N√∂tr
    "Harika bir plan, hafta sonunu evde temizlik yaparak ge√ßireceƒüim i√ßin √ßok mutluyum!", # Sosyal ƒ∞roni
    "Kitabƒ±n sayfa sayƒ±sƒ± 320, kapak tasarƒ±mƒ± ise mat selefon kaplama.", # Saf Betimleyici N√∂tr
    "Aman ne g√ºzel, bir bu eksikti zaten, tam oldu.", # Kƒ±sa ve Keskin ƒ∞roni
    "Hizmet kalitesi beklentilerimin √ßok altƒ±nda kaldƒ±, hayal kƒ±rƒ±klƒ±ƒüƒ±.", # D√ºz Negatif (ƒ∞roni deƒüil, direkt ele≈ütiri)
    "Dokunmatik ekran hassasiyeti ve panel parlaklƒ±ƒüƒ± standart seviyede.", # Teknik N√∂tr
    "Muazzam bir zeka √∂rneƒüi, anahtarƒ± evin i√ßinde unutmayƒ± nasƒ±l ba≈üardƒ±n?", # Ki≈üisel ƒ∞roni
    "Hava sƒ±caklƒ±ƒüƒ± mevsim normallerinin 5 derece √ºzerinde seyrediyor." # Doƒüa/Haber N√∂tr
]

print(" AKADEMƒ∞K SORGULAMA BA≈ûLIYOR...\n")
for t in akademik_test_listesi:
    print(f"Metin: {t}\nTahmin: {tahmin_et(t)}\n{'-'*30}")

üî¨ AKADEMƒ∞K SORGULAMA BA≈ûLIYOR...

Metin: Sayƒ±n yetkili, talebiniz ilgili birime iletilmi≈ü olup inceleme s√ºreci devam etmektedir.
Tahmin: N√ñTR üòê
------------------------------
Metin: Harika bir plan, hafta sonunu evde temizlik yaparak ge√ßireceƒüim i√ßin √ßok mutluyum!
Tahmin: ƒ∞RONƒ∞K / NEGATƒ∞F üò°
------------------------------
Metin: Kitabƒ±n sayfa sayƒ±sƒ± 320, kapak tasarƒ±mƒ± ise mat selefon kaplama.
Tahmin: N√ñTR üòê
------------------------------
Metin: Aman ne g√ºzel, bir bu eksikti zaten, tam oldu.
Tahmin: ƒ∞RONƒ∞K / NEGATƒ∞F üò°
------------------------------
Metin: Hizmet kalitesi beklentilerimin √ßok altƒ±nda kaldƒ±, hayal kƒ±rƒ±klƒ±ƒüƒ±.
Tahmin: ƒ∞RONƒ∞K / NEGATƒ∞F üò°
------------------------------
Metin: Dokunmatik ekran hassasiyeti ve panel parlaklƒ±ƒüƒ± standart seviyede.
Tahmin: N√ñTR üòê
------------------------------
Metin: Muazzam bir zeka √∂rneƒüi, anahtarƒ± evin i√ßinde unutmayƒ± nasƒ±l ba≈üardƒ±n?
Tahmin: ƒ∞RONƒ∞K / NEGATƒ∞F üò°
--------------

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# --- MODELƒ∞ SINIRLARINA ZORLAYAN ABS√úRT TEST ---
absurt_test_listesi = [
    "Ma≈üallah, atomu par√ßalayƒ±p i√ßine peynir ekmek koymu≈üsun, d√¢hice!", # Aƒüƒ±r alay/ƒ∞roni
    "Dikd√∂rtgen prizmanƒ±n i√ß a√ßƒ±larƒ± toplamƒ± ile ilgili bir makale okuyorum.", # A≈üƒ±rƒ± N√∂tr/Akademik
    "Tebrikler, be≈ü dakikalƒ±k yolu √º√ß saatte gelerek d√ºnya rekoru kƒ±rdƒ±n.", # Zaman ironisi
    "Mutfaktaki musluktan su akƒ±yor ve lavabonun rengi beyaz.", # √áok sa√ßma derecede n√∂tr
    "Harika bir i≈ü √ßƒ±kardƒ±n, sayende b√ºt√ºn proje √ß√∂p oldu ve herkes kovuldu.", # Yƒ±kƒ±cƒ± ƒ∞roni
    "Elektrikli s√ºp√ºrgenin emi≈ü g√ºc√º 2000 watt olup, kablo uzunluƒüu 5 metredir.", # Teknik N√∂tr
    "Vay be, kaplumbaƒüaya binseydik ≈üu an √ßoktan varmƒ±≈ütƒ±k, hƒ±zƒ±na hayran kaldƒ±m.", # Hƒ±z ironisi
    "Masadaki bardaƒüƒ±n i√ßinde yarƒ±m bardak ≈üeffaf su bulunuyor." # G√∂zlem N√∂tr
]

print(" ABS√úRT VE KARMA≈ûIK TEST BA≈ûLIYOR...\n")
for t in absurt_test_listesi:
    print(f"Metin: {t}\nTahmin: {tahmin_et(t)}\n{'-'*30}")

üß† ABS√úRT VE KARMA≈ûIK TEST BA≈ûLIYOR...

Metin: Ma≈üallah, atomu par√ßalayƒ±p i√ßine peynir ekmek koymu≈üsun, d√¢hice!
Tahmin: ƒ∞RONƒ∞K / NEGATƒ∞F üò°
------------------------------
Metin: Dikd√∂rtgen prizmanƒ±n i√ß a√ßƒ±larƒ± toplamƒ± ile ilgili bir makale okuyorum.
Tahmin: ƒ∞RONƒ∞K / NEGATƒ∞F üò°
------------------------------
Metin: Tebrikler, be≈ü dakikalƒ±k yolu √º√ß saatte gelerek d√ºnya rekoru kƒ±rdƒ±n.
Tahmin: ƒ∞RONƒ∞K / NEGATƒ∞F üò°
------------------------------
Metin: Mutfaktaki musluktan su akƒ±yor ve lavabonun rengi beyaz.
Tahmin: ƒ∞RONƒ∞K / NEGATƒ∞F üò°
------------------------------
Metin: Harika bir i≈ü √ßƒ±kardƒ±n, sayende b√ºt√ºn proje √ß√∂p oldu ve herkes kovuldu.
Tahmin: ƒ∞RONƒ∞K / NEGATƒ∞F üò°
------------------------------
Metin: Elektrikli s√ºp√ºrgenin emi≈ü g√ºc√º 2000 watt olup, kablo uzunluƒüu 5 metredir.
Tahmin: N√ñTR üòê
------------------------------
Metin: Vay be, kaplumbaƒüaya binseydik ≈üu an √ßoktan varmƒ±≈ütƒ±k, hƒ±zƒ±na hayran kaldƒ±m.
Tahmi

In [None]:
import pandas as pd
import torch
import os
import sys
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ==========================================
# 1. GOOGLE DRIVE BAƒûLANTISI
# ==========================================
print("[INFO] Google Drive baƒülanƒ±yor...")
drive.mount('/content/drive')

# ==========================================
# 2. AYARLAR (CONFIGURATION)
# ==========================================

# Veri seti: Colab'e y√ºklediƒüin dosya
DATASET_PATH = "/content/golden_dataset.xlsx"

# Modeller: Drive'daki yollar
PREV_MODEL_PATH = "/content/drive/MyDrive/tez_model_v3_final"
OUTPUT_MODEL_PATH = "/content/drive/MyDrive/tez_model_v4_final"

# Egitim Parametreleri
MAX_LENGTH = 128
BATCH_SIZE = 32          # A100 icin optimize
EPOCHS = 3
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01

# ==========================================
# 3. DONANIM KONTROL√ú
# ==========================================
print("\n[INFO] Checking hardware resources...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    print(f"[INFO] GPU Active: {torch.cuda.get_device_name(0)}")
else:
    print("[WARNING] GPU not found! Training will be slow.")

# ==========================================
# 4. √ñNCEKƒ∞ MODELƒ∞ (v3) Y√úKLE
# ==========================================
print(f"\n[INFO] Loading previous model from: {PREV_MODEL_PATH}")

if not os.path.exists(PREV_MODEL_PATH):
    print(f"[CRITICAL ERROR] Model klas√∂r√º bulunamadƒ±: {PREV_MODEL_PATH}")
    print("   L√ºtfen Drive'da 'tez_model_v3_final' klas√∂r√ºn√ºn olduƒüundan emin ol.")
    sys.exit(1)

try:
    tokenizer = AutoTokenizer.from_pretrained(PREV_MODEL_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(PREV_MODEL_PATH)
    print("[INFO] Model v3 loaded successfully.")
except Exception as e:
    print(f"[ERROR] Model y√ºklenirken hata olu≈ütu: {e}")
    sys.exit(1)

# ==========================================
# 5. VERƒ∞ SETƒ∞Nƒ∞ Y√úKLE VE ƒ∞≈ûLE
# ==========================================
print(f"\n[INFO] Loading dataset from: {DATASET_PATH}")

if not os.path.exists(DATASET_PATH):
    print(f"[ERROR] Veri seti bulunamadƒ±: {DATASET_PATH}")
    print("   L√ºtfen dosyayƒ± Colab'in sol tarafƒ±ndaki dosya y√∂neticisine y√ºklediƒüinden emin ol.")
    sys.exit(1)

# Okuma
if DATASET_PATH.endswith('.xlsx'):
    df = pd.read_excel(DATASET_PATH)
else:
    df = pd.read_csv(DATASET_PATH)

# S√ºtun ƒ∞simlerini D√ºzeltme
column_mapping = {
    'Review': 'text',
    'Sentiment': 'original_label',
    'Label': 'original_label'
}
df = df.rename(columns=column_mapping)

if 'text' not in df.columns or 'original_label' not in df.columns:
    print(f"[ERROR] Gerekli s√ºtunlar yok. Mevcutlar: {list(df.columns)}")
    sys.exit(1)

# Etiket D√∂n√º≈ü√ºm√º (0, 1, 2)
print("[INFO] Mapping labels...")

def map_sentiment(val):
    val = str(val).lower().strip()
    if val in ['negative', 'negatif', 'bad', 'k√∂t√º', '0', '-1']: return 0
    elif val in ['neutral', 'notr', 'n√∂tr', 'normal', '1']: return 1
    elif val in ['positive', 'pozitif', 'good', 'iyi', '2']: return 2
    return None

df['label'] = df['original_label'].apply(map_sentiment)
df = df.dropna(subset=['label', 'text'])
df['label'] = df['label'].astype(int)

print(f"[INFO] Data ready. Rows: {len(df)}")
print(f"[INFO] Class Distribution: {df['label'].value_counts().to_dict()}")

# ==========================================
# 6. Eƒûƒ∞Tƒ∞M HAZIRLIƒûI
# ==========================================

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    test_size=0.10,
    random_state=42
)

class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self): return len(self.labels)

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LENGTH)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=MAX_LENGTH)

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

# ==========================================
# 7. Eƒûƒ∞Tƒ∞Mƒ∞ BA≈ûLAT
# ==========================================

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc}

training_args = TrainingArguments(
    output_dir='./results_v4',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    eval_strategy="epoch",  # D√úZELTƒ∞LDƒ∞: evaluation_strategy -> eval_strategy
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

print("\n[INFO] Starting training session...")
trainer.train()

# ==========================================
# 8. KAYDET
# ==========================================
print(f"\n[INFO] Saving model to: {OUTPUT_MODEL_PATH}")
model.save_pretrained(OUTPUT_MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_MODEL_PATH)

print("[INFO] Process completed successfully.")

[INFO] Google Drive baƒülanƒ±yor...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

[INFO] Checking hardware resources...
[INFO] GPU Active: NVIDIA A100-SXM4-40GB

[INFO] Loading previous model from: /content/drive/MyDrive/tez_model_v3_final
[INFO] Model v3 loaded successfully.

[INFO] Loading dataset from: /content/golden_dataset.xlsx
[INFO] Mapping labels...
[INFO] Data ready. Rows: 7940
[INFO] Class Distribution: {0: 2844, 1: 2747, 2: 2349}

[INFO] Starting training session...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.007556,0.998741
2,No log,0.001994,0.998741
3,0.022700,0.00054,1.0



[INFO] Saving model to: /content/drive/MyDrive/tez_model_v4_final
[INFO] Process completed successfully.


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

# 1. Kaydettiƒüimiz Modeli Y√ºkle
MODEL_PATH = "/content/drive/MyDrive/tez_model_v4_final"
print(f"‚è≥ Model y√ºkleniyor: {MODEL_PATH}")

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
    classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
    print(" Model y√ºklendi ve teste hazƒ±r!")
except Exception as e:
    print(f" Hata: {e}")
    exit()

# 2. Zorlu Test C√ºmleleri (Bunlarƒ± modelin kafasƒ±nƒ± karƒ±≈ütƒ±rmak i√ßin se√ßtim)
test_sentences = [
    "√úr√ºn elime ula≈ütƒ± ama beklediƒüimden biraz farklƒ±, yine de i≈ü g√∂r√ºr.",  # N√∂tr veya Pozitif arasƒ± (Zor)
    "Hayatƒ±mda g√∂rd√ºƒü√ºm en rezalet paketleme, tebrik ederim ger√ßekten!",    # Negatif (ƒ∞roni i√ßeriyor)
    "Ne iyi ne k√∂t√º, standart bir √ºr√ºn.",                                  # N√∂tr
    "M√ºkemmel bir deneyimdi, herkese tavsiye ederim.",                     # Pozitif
    "Kargo hƒ±zlƒ±ydƒ± ama √ºr√ºn kƒ±rƒ±k geldi.",                                # Negatif
    "Satƒ±cƒ± √ßok ilgiliydi, te≈üekk√ºrler.",                                  # Pozitif
]

# 3. Tahminleri Yap
print("\n CANLI TEST SONU√áLARI:")
print("-" * 50)
label_map = {"LABEL_0": "Negatif", "LABEL_1": "N√∂tr", "LABEL_2": "Pozitif"}

for text in test_sentences:
    result = classifier(text)[0]
    label_code = result['label']
    score = result['score']

    human_readable = label_map.get(label_code, label_code)

    print(f" C√ºmle: {text}")
    print(f" Tahmin: {human_readable} (G√ºven: %{score*100:.2f})")
    print("-" * 50)

Device set to use cuda:0


‚è≥ Model y√ºkleniyor: /content/drive/MyDrive/tez_model_v4_final
 Model y√ºklendi ve teste hazƒ±r!

 CANLI TEST SONU√áLARI:
--------------------------------------------------
 C√ºmle: √úr√ºn elime ula≈ütƒ± ama beklediƒüimden biraz farklƒ±, yine de i≈ü g√∂r√ºr.
 Tahmin: Negatif (G√ºven: %99.50)
--------------------------------------------------
 C√ºmle: Hayatƒ±mda g√∂rd√ºƒü√ºm en rezalet paketleme, tebrik ederim ger√ßekten!
 Tahmin: Negatif (G√ºven: %99.91)
--------------------------------------------------
 C√ºmle: Ne iyi ne k√∂t√º, standart bir √ºr√ºn.
 Tahmin: Negatif (G√ºven: %99.51)
--------------------------------------------------
 C√ºmle: M√ºkemmel bir deneyimdi, herkese tavsiye ederim.
 Tahmin: Pozitif (G√ºven: %99.97)
--------------------------------------------------
 C√ºmle: Kargo hƒ±zlƒ±ydƒ± ama √ºr√ºn kƒ±rƒ±k geldi.
 Tahmin: Negatif (G√ºven: %99.89)
--------------------------------------------------
 C√ºmle: Satƒ±cƒ± √ßok ilgiliydi, te≈üekk√ºrler.
 Tahmin: Pozitif (G√ºven:

In [None]:
import pandas as pd
import torch
import os
import sys
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ==========================================
# 1. AYARLAR
# ==========================================
print("[INFO] Google Drive baƒülanƒ±yor...")
drive.mount('/content/drive')

# DOSYA YOLLARI
GOLDEN_DATA_PATH = "/content/golden_dataset.xlsx"
OLD_DATA_PATH    = "/content/HUMIRSentimentDatasets.csv"

# MODEL YOLLARI
PREV_MODEL_PATH = "/content/drive/MyDrive/tez_model_v3_final"
OUTPUT_MODEL_PATH = "/content/drive/MyDrive/tez_model_v5_balanced"

# ==========================================
# 2. VERƒ∞LERƒ∞ Y√úKLE VE Bƒ∞RLE≈ûTƒ∞R (D√úZELTƒ∞LDƒ∞)
# ==========================================
print("\n[INFO] Veri setleri birle≈ütiriliyor...")

# --- A) Golden Dataset Y√ºkle ---
if os.path.exists(GOLDEN_DATA_PATH):
    if GOLDEN_DATA_PATH.endswith('.xlsx'):
        df_gold = pd.read_excel(GOLDEN_DATA_PATH)
    else:
        df_gold = pd.read_csv(GOLDEN_DATA_PATH)

    # S√ºtunlarƒ± d√ºzelt
    df_gold = df_gold.rename(columns={'Review': 'text', 'Sentiment': 'original_label', 'Label': 'original_label'})
    df_gold = df_gold[['text', 'original_label']]
    print(f"   -> Golden Dataset: {len(df_gold)} satƒ±r eklendi.")
else:
    print(f"[ERROR] Golden Dataset bulunamadƒ±: {GOLDEN_DATA_PATH}")
    sys.exit(1)

# --- B) Eski (ƒ∞roni) Dataset Y√ºkle (SAƒûLAMLA≈ûTIRILDI) ---
df_old = pd.DataFrame() # Varsayƒ±lan bo≈ü

if os.path.exists(OLD_DATA_PATH):
    print(f"   -> HUMIR Dataset okunuyor ({OLD_DATA_PATH})...")
    try:
        # √ñnce en yaygƒ±n formatƒ± dene: Noktalƒ± virg√ºl ve UTF-8
        df_old = pd.read_csv(OLD_DATA_PATH, sep=';', encoding='utf-8', on_bad_lines='skip')
    except UnicodeDecodeError:
        try:
            # Olmazsa T√ºrk√ße Windows formatƒ±nƒ± dene
            df_old = pd.read_csv(OLD_DATA_PATH, sep=';', encoding='windows-1254', on_bad_lines='skip')
        except Exception as e:
            print(f"[WARNING] Okuma hatasƒ±: {e}")

    # Eƒüer yukarƒ±dakiler bo≈ü geldiyse veya s√ºtunlarƒ± bulamadƒ±ysa virg√ºl dene (ama bad_lines skip ile)
    if 'Review' not in df_old.columns and 'text' not in df_old.columns:
        print("      (Noktalƒ± virg√ºl i≈üe yaramadƒ±, virg√ºl deneniyor...)")
        try:
            df_old = pd.read_csv(OLD_DATA_PATH, sep=',', encoding='utf-8', on_bad_lines='skip')
        except:
            pass

    # S√ºtunlarƒ± Standartla≈ütƒ±r
    if 'Review' in df_old.columns: df_old = df_old.rename(columns={'Review': 'text'})
    if 'Label' in df_old.columns: df_old = df_old.rename(columns={'Label': 'original_label'})
    if 'target' in df_old.columns: df_old = df_old.rename(columns={'target': 'original_label'})

    # Gereksiz s√ºtunlarƒ± at ve kontrol et
    if 'text' in df_old.columns and 'original_label' in df_old.columns:
        df_old = df_old[['text', 'original_label']]
        print(f"   -> HUMIR (Eski) Dataset Ba≈üarƒ±yla Eklendi: {len(df_old)} satƒ±r.")
    else:
        print(f"[ERROR] Eski veri setinde 'text' veya 'label' s√ºtunu bulunamadƒ±! S√ºtunlar: {list(df_old.columns)}")
        print("      L√ºtfen CSV dosyanƒ±zƒ±n i√ßini kontrol edin. S√ºtun adlarƒ± 'Review' ve 'Label' olmalƒ±.")
        # Devam etmesin ki bo≈ü veriyle eƒüitilmesin
        df_old = pd.DataFrame()
else:
    print(f"[WARNING] Eski veri seti bulunamadƒ±: {OLD_DATA_PATH}")

# --- C

[INFO] Google Drive baƒülanƒ±yor...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

[INFO] Veri setleri birle≈ütiriliyor...
   -> Golden Dataset: 7940 satƒ±r eklendi.
   -> HUMIR Dataset okunuyor (/content/HUMIRSentimentDatasets.csv)...
      (Noktalƒ± virg√ºl i≈üe yaramadƒ±, virg√ºl deneniyor...)
[ERROR] Eski veri setinde 'text' veya 'label' s√ºtunu bulunamadƒ±! S√ºtunlar: ['80101;Hotel Review;asla gidilmeyecek bir otel hasta oldukotel tam anlamƒ±yla bir fiyasko satƒ±n alƒ±rken ve web sitesinde g√∂z√ºn√ºze √ßarpan en b√ºy√ºk √∂zellik otelin t√ºm alanlarƒ±nƒ±n yenilenmi≈ü olmasƒ± ama bunun ger√ßekle alakasƒ± yok odalar en az  yƒ±llƒ±k bir otel harabeliƒüinde yemekler ve √∂zellikle kahvaltƒ± tam bir hayal kƒ±rƒ±klƒ±ƒüƒ± kahvaltƒ±daki yiyecekler asla yenmeyecek ve yedirilmeyecek kadar k√∂t√º bir tane lekesiz temiz bir tabak bardak veya √ßatal ka≈üƒ±k g√∂rmeniz olasƒ± bile deƒüil i√ßecek konusunda su deƒüil 

In [None]:
import pandas as pd
import torch
import os
import sys
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ==========================================
# 1. AYARLAR
# ==========================================
print("[INFO] Google Drive baƒülanƒ±yor...")
drive.mount('/content/drive')

# DOSYA YOLLARI
GOLDEN_DATA_PATH = "/content/golden_dataset.xlsx"
OLD_DATA_PATH    = "/content/HUMIRSentimentDatasets.csv"

# MODEL YOLLARI
PREV_MODEL_PATH = "/content/drive/MyDrive/tez_model_v3_final"
OUTPUT_MODEL_PATH = "/content/drive/MyDrive/tez_model_v5_balanced"

# ==========================================
# 2. VERƒ∞LERƒ∞ Y√úKLE VE AKILLI Bƒ∞RLE≈ûTƒ∞R
# ==========================================
print("\n[INFO] Veri setleri hazƒ±rlanƒ±yor...")

# --- A) Golden Dataset (N√∂tr Kalemiz) ---
if os.path.exists(GOLDEN_DATA_PATH):
    if GOLDEN_DATA_PATH.endswith('.xlsx'):
        df_gold = pd.read_excel(GOLDEN_DATA_PATH)
    else:
        df_gold = pd.read_csv(GOLDEN_DATA_PATH)

    df_gold = df_gold.rename(columns={'Review': 'text', 'Sentiment': 'original_label', 'Label': 'original_label'})
    df_gold = df_gold[['text', 'original_label']]
    print(f"   -> Golden Dataset: {len(df_gold)} satƒ±r (N√∂trler burada g√ºvende).")
else:
    print(f"[ERROR] Golden Dataset bulunamadƒ±.")
    sys.exit(1)

# --- B) HUMIR Dataset (Baharat) ---
if os.path.exists(OLD_DATA_PATH):
    print(f"   -> HUMIR Dataset okunuyor...")
    try:
        # Ba≈ülƒ±ksƒ±z okuma
        df_old = pd.read_csv(OLD_DATA_PATH, sep=';', header=None, usecols=[2, 3], on_bad_lines='skip')
        df_old.columns = ['text', 'original_label']

        # --- KRƒ∞Tƒ∞K DENGE AYARI ---
        # Senin uyarƒ±n √ºzerine burayƒ± d√ºzelttik.
        # Golden'ƒ±n toplam sayƒ±sƒ±nƒ±n sadece %40'ƒ± kadar eski veri alƒ±yoruz.
        # B√∂ylece N√∂tr sƒ±nƒ±fƒ± ezilmeyecek, ama model eski ironileri de hatƒ±rlayacak.

        ratio = 0.40  # %40 Oran (ƒ∞deal Denge)
        target_count = int(len(df_gold) * ratio)

        if len(df_old) > target_count:
            print(f"   ‚öñÔ∏è DENGELEME: N√∂tr sƒ±nƒ±fƒ±nƒ± korumak i√ßin eski veriden sadece {target_count} satƒ±r (Golden'ƒ±n %40'ƒ±) alƒ±nƒ±yor.")
            df_old = df_old.sample(n=target_count, random_state=42)

        print(f"   -> HUMIR Dataset Eklendi: {len(df_old)} satƒ±r.")

    except Exception as e:
        print(f"[ERROR] HUMIR okunamadƒ±: {e}")
        df_old = pd.DataFrame()
else:
    df_old = pd.DataFrame()

# --- C) Bƒ∞RLE≈ûTƒ∞RME ---
if not df_old.empty:
    df = pd.concat([df_gold, df_old], ignore_index=True)
else:
    df = df_gold

# Karƒ±≈ütƒ±r
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# ==========================================
# 3. ETƒ∞KETLERƒ∞ D√ñN√ú≈ûT√úR (0, 1, 2)
# ==========================================
print("[INFO] Etiketler i≈üleniyor...")

def map_sentiment(val):
    val = str(val).lower().strip()
    if val in ['negative', 'negatif', 'bad', 'k√∂t√º', '0', '-1', 'ironi', 'sarcasm']: return 0
    elif val in ['neutral', 'notr', 'n√∂tr', 'normal', '1']: return 1
    elif val in ['positive', 'pozitif', 'good', 'iyi', '2']: return 2
    return None

df['label'] = df['original_label'].apply(map_sentiment)
df = df.dropna(subset=['label', 'text'])
df['label'] = df['label'].astype(int)

# Sƒ±nƒ±f Daƒüƒ±lƒ±mƒ±nƒ± Kontrol Et (Burada N√∂tr sayƒ±sƒ± diƒüerlerine yakƒ±n olmalƒ±)
print(f"üìä [√ñNEMLƒ∞] Final Sƒ±nƒ±f Daƒüƒ±lƒ±mƒ±:\n{df['label'].value_counts().to_dict()}")

# ==========================================
# 4. Eƒûƒ∞Tƒ∞M (v3 √úZERƒ∞NE)
# ==========================================
print(f"\n[INFO] Model v3 y√ºkleniyor: {PREV_MODEL_PATH}")

if not os.path.exists(PREV_MODEL_PATH):
    print("[ERROR] v3 Modeli bulunamadƒ±.")
    sys.exit(1)

tokenizer = AutoTokenizer.from_pretrained(PREV_MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(PREV_MODEL_PATH, num_labels=3, ignore_mismatched_sizes=True)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.10, random_state=42
)

class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self): return len(self.labels)

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

training_args = TrainingArguments(
    output_dir='./results_v5',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
    fp16=torch.cuda.is_available()
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=lambda p: {'accuracy': accuracy_score(p.label_ids, p.predictions.argmax(-1))}
)

print("\n [v5] DENGELƒ∞ Eƒûƒ∞Tƒ∞M BA≈ûLIYOR...")
trainer.train()

print(f"\n Final Dengeli Model (v5) Kaydedildi: {OUTPUT_MODEL_PATH}")
model.save_pretrained(OUTPUT_MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_MODEL_PATH)

[INFO] Google Drive baƒülanƒ±yor...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

[INFO] Veri setleri hazƒ±rlanƒ±yor...
   -> Golden Dataset: 7940 satƒ±r (N√∂trler burada g√ºvende).
   -> HUMIR Dataset okunuyor...
   ‚öñÔ∏è DENGELEME: N√∂tr sƒ±nƒ±fƒ±nƒ± korumak i√ßin eski veriden sadece 3176 satƒ±r (Golden'ƒ±n %40'ƒ±) alƒ±nƒ±yor.
   -> HUMIR Dataset Eklendi: 3176 satƒ±r.
[INFO] Etiketler i≈üleniyor...
üìä [√ñNEMLƒ∞] Final Sƒ±nƒ±f Daƒüƒ±lƒ±mƒ±:
{0: 4462, 2: 3907, 1: 2747}

[INFO] Model v3 y√ºkleniyor: /content/drive/MyDrive/tez_model_v3_final

üöÄ [v5] DENGELƒ∞ Eƒûƒ∞Tƒ∞M BA≈ûLIYOR...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.041929,0.985612
2,0.061500,0.047779,0.988309
3,0.061500,0.055605,0.988309



‚úÖ Final Dengeli Model (v5) Kaydedildi: /content/drive/MyDrive/tez_model_v5_balanced


('/content/drive/MyDrive/tez_model_v5_balanced/tokenizer_config.json',
 '/content/drive/MyDrive/tez_model_v5_balanced/special_tokens_map.json',
 '/content/drive/MyDrive/tez_model_v5_balanced/vocab.txt',
 '/content/drive/MyDrive/tez_model_v5_balanced/added_tokens.json',
 '/content/drive/MyDrive/tez_model_v5_balanced/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

# 1. Model Yolunu Belirle (v6 Final Base Model)
MODEL_PATH = "/content/drive/MyDrive/tez_model_v6_final_base"
print(f" Model y√ºkleniyor: {MODEL_PATH}")

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

    device = 0 if torch.cuda.is_available() else -1
    classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)
    print(" Model y√ºklendi ve ƒ∞RONƒ∞ testine hazƒ±r!")
except Exception as e:
    print(f" Hata: {e}")
    exit()

# 2. SADECE ƒ∞RONƒ∞ C√úMLELERƒ∞ (Zorluk Seviyesi: Y√ºksek)
irony_sentences = [
    "Hayatƒ±mda g√∂rd√ºƒü√ºm en rezalet paketleme, tebrik ederim ger√ßekten!",
    "Sipari≈ü 10 g√ºnde geldi, ƒ±≈üƒ±k hƒ±zƒ±ndasƒ±nƒ±z ma≈üallah.",
    "Bozuk √ºr√ºn g√∂nderme konusunda bir d√ºnya markasƒ±sƒ±nƒ±z.",
    "M√º≈üteri hizmetleri o kadar ilgili ki telefonlara bakmaya tenezz√ºl etmiyorlar.",
    "Harika, bir bu eksikti, tam da bozuk g√∂nderilmesini istemi≈ütim!",
    "√úr√ºn parampar√ßa geldi, kargo firmasƒ±na bu √∂zenli ta≈üƒ±madan dolayƒ± madalya takmak lazƒ±m.",
    "Yemek buz gibiydi, dondurma niyetine yedik saƒü olun.",
    "Bu kadar kalitesiz bir √ºr√ºn√º √ºretmek de ayrƒ± bir ba≈üarƒ±, kutluyorum."
]

# 3. Testi Ba≈ülat
print("\nüîç ƒ∞RONƒ∞ TESTƒ∞ SONU√áLARI:")
print("-" * 60)

# Etiket Haritasƒ± (Negatif -> 0, N√∂tr -> 1, Pozitif -> 2)
label_map = {
    "LABEL_0": " NEGATƒ∞F (Doƒüru Cevap)",
    "LABEL_1": " N√ñTR (Hatalƒ±)",
    "LABEL_2": " POZƒ∞Tƒ∞F (Hatalƒ± - ƒ∞roniyi Anlamadƒ±)"
}

for text in irony_sentences:
    result = classifier(text)[0]
    label_code = result['label']
    score = result['score']

    human_readable = label_map.get(label_code, label_code)

    print(f"üìù C√ºmle: {text}")
    print(f"ü§ñ Tahmin: {human_readable} (G√ºven: %{score*100:.2f})")
    print("-" * 60)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


‚è≥ Model y√ºkleniyor: /content/drive/MyDrive/tez_model_v6_final_base
‚ùå Hata: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/content/drive/MyDrive/tez_model_v6_final_base'. Use `repo_type` argument if needed.

üîç ƒ∞RONƒ∞ TESTƒ∞ SONU√áLARI:
------------------------------------------------------------
üìù C√ºmle: Hayatƒ±mda g√∂rd√ºƒü√ºm en rezalet paketleme, tebrik ederim ger√ßekten!
ü§ñ Tahmin: üî¥ NEGATƒ∞F (Doƒüru Cevap) (G√ºven: %99.90)
------------------------------------------------------------
üìù C√ºmle: Sipari≈ü 10 g√ºnde geldi, ƒ±≈üƒ±k hƒ±zƒ±ndasƒ±nƒ±z ma≈üallah.
ü§ñ Tahmin: üü¢ POZƒ∞Tƒ∞F (Hatalƒ± - ƒ∞roniyi Anlamadƒ±) (G√ºven: %99.44)
------------------------------------------------------------
üìù C√ºmle: Bozuk √ºr√ºn g√∂nderme konusunda bir d√ºnya markasƒ±sƒ±nƒ±z.
ü§ñ Tahmin: üî¥ NEGATƒ∞F (Doƒüru Cevap) (G√ºven: %99.91)
------------------------------------------------------------
üìù C√ºmle: M√º≈üteri hizmetleri o kadar ilgili ki telefonlara

In [None]:
import pandas as pd
import torch
import os
import sys
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# ==========================================
# 1. AYARLAR
# ==========================================
print("[INFO] Google Drive baƒülanƒ±yor...")
drive.mount('/content/drive')

GOLDEN_DATA_PATH = "/content/golden_dataset.xlsx"
OLD_DATA_PATH    = "/content/HUMIRSentimentDatasets.csv"
BASE_MODEL_NAME  = "dbmdz/bert-base-turkish-cased"
OUTPUT_MODEL_PATH = "/content/drive/MyDrive/tez_model_v7_neutral_boosted" # v7 ƒ∞smi

# ==========================================
# 2. VERƒ∞LERƒ∞ HAZIRLA VE N√ñTR'√ú √áOƒûALT
# ==========================================
print("\n[INFO] Veri Kokteyli Hazƒ±rlanƒ±yor...")

# --- A) Golden Dataset ---
if os.path.exists(GOLDEN_DATA_PATH):
    df_gold = pd.read_excel(GOLDEN_DATA_PATH) if GOLDEN_DATA_PATH.endswith('.xlsx') else pd.read_csv(GOLDEN_DATA_PATH)
    df_gold = df_gold.rename(columns={'Review': 'text', 'Sentiment': 'original_label', 'Label': 'original_label'})
    df_gold = df_gold[['text', 'original_label']]
else:
    sys.exit("[ERROR] Golden Dataset yok!")

# --- B) HUMIR Dataset (Eski Veri) ---
df_old = pd.DataFrame()
if os.path.exists(OLD_DATA_PATH):
    try:
        df_old = pd.read_csv(OLD_DATA_PATH, sep=';', header=None, usecols=[2, 3], on_bad_lines='skip')
        df_old.columns = ['text', 'original_label']

        # Golden'ƒ±n %40'ƒ± kadar al (Negatif/Pozitif baskƒ±sƒ±nƒ± azalt)
        target_count = int(len(df_gold) * 0.40)
        if len(df_old) > target_count:
            df_old = df_old.sample(n=target_count, random_state=42)
            print(f"   -> HUMIR Dataset'ten {len(df_old)} satƒ±r eklendi.")
    except: pass

# --- C) Birle≈ütir ---
if not df_old.empty:
    df = pd.concat([df_gold, df_old], ignore_index=True)
else:
    df = df_gold

# ==========================================
# 3. ETƒ∞KETLEME VE ≈ûOK TEDAVƒ∞Sƒ∞ (BOOSTING)
# ==========================================
def map_sentiment(val):
    val = str(val).lower().strip()
    if val in ['negative', 'negatif', 'bad', 'k√∂t√º', '0', '-1', 'ironi']: return 0
    elif val in ['neutral', 'notr', 'n√∂tr', 'normal', '1']: return 1  # Hedefimiz bu
    elif val in ['positive', 'pozitif', 'good', 'iyi', '2']: return 2
    return None

df['label'] = df['original_label'].apply(map_sentiment)
df = df.dropna(subset=['label', 'text'])
df['label'] = df['label'].astype(int)

# --- üî• N√ñTR TAKVƒ∞YESƒ∞ (OVERSAMPLING) ---
print(f"\n[√ñNCE] Sƒ±nƒ±f Daƒüƒ±lƒ±mƒ±: {df['label'].value_counts().to_dict()}")

# N√∂tr verileri ayƒ±r
df_neutral = df[df['label'] == 1]
df_others  = df[df['label'] != 1]

if len(df_neutral) > 0:
    # N√∂tr verileri 3 KATINA √ßƒ±kar (Kopyala-Yapƒ±≈ütƒ±r)
    # Bu sayede model N√∂tr'√º g√∂rmezden gelemez!
    df_neutral_boosted = pd.concat([df_neutral] * 3, ignore_index=True)

    # Tekrar birle≈ütir
    df = pd.concat([df_neutral_boosted, df_others], ignore_index=True)
    print(f"[SONRA] N√∂tr Takviyeli Daƒüƒ±lƒ±m: {df['label'].value_counts().to_dict()}")
    print("   -> (N√∂tr sƒ±nƒ±fƒ± yapay olarak g√º√ßlendirildi, artƒ±k model onu ezemez!)")
else:
    print("[UYARI] Veri setinde hi√ß N√∂tr yok! Takviye yapƒ±lamadƒ±.")

# Karƒ±≈ütƒ±r
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# ==========================================
# 4. Eƒûƒ∞Tƒ∞M
# ==========================================
print(f"\n[INFO] Temiz BERT Modeli Y√ºkleniyor...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL_NAME, num_labels=3)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.10, random_state=42
)

class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self): return len(self.labels)

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

training_args = TrainingArguments(
    output_dir='./results_v7',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
    fp16=torch.cuda.is_available()
)

trainer = Trainer(
    model=model, args=training_args,
    train_dataset=train_dataset, eval_dataset=val_dataset,
    compute_metrics=lambda p: {'accuracy': accuracy_score(p.label_ids, p.predictions.argmax(-1))}
)

print("\nüöÄ [v7] N√ñTR TAKVƒ∞YELƒ∞ Eƒûƒ∞Tƒ∞M BA≈ûLIYOR...")
trainer.train()

model.save_pretrained(OUTPUT_MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_MODEL_PATH)
print(f"\n‚úÖ Model Kaydedildi: {OUTPUT_MODEL_PATH}")

[INFO] Google Drive baƒülanƒ±yor...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

[INFO] Veri Kokteyli Hazƒ±rlanƒ±yor...
   -> HUMIR Dataset'ten 3176 satƒ±r eklendi.

[√ñNCE] Sƒ±nƒ±f Daƒüƒ±lƒ±mƒ±: {0: 4462, 2: 3907, 1: 2747}
[SONRA] N√∂tr Takviyeli Daƒüƒ±lƒ±m: {1: 8241, 0: 4462, 2: 3907}
   -> (N√∂tr sƒ±nƒ±fƒ± yapay olarak g√º√ßlendirildi, artƒ±k model onu ezemez!)

[INFO] Temiz BERT Modeli Y√ºkleniyor...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üöÄ [v7] N√ñTR TAKVƒ∞YELƒ∞ Eƒûƒ∞Tƒ∞M BA≈ûLIYOR...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.054408,0.980734
2,0.154800,0.043365,0.987357
3,0.036900,0.055337,0.986153



‚úÖ Model Kaydedildi: /content/drive/MyDrive/tez_model_v7_neutral_boosted


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

# 1. v6 Final Modelini Y√ºkle
MODEL_PATH = "/content/drive/MyDrive/tez_model_v6_final_base"
print(f"‚è≥ Model y√ºkleniyor: {MODEL_PATH}")

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

    device = 0 if torch.cuda.is_available() else -1
    classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)
    print(" Model hazƒ±r! Zorlu ironi testi ba≈ülƒ±yor...")
except Exception as e:
    print(f" Hata: {e}")
    exit()

# 2. MODELƒ∞N Hƒ∞√á G√ñRMEDƒ∞ƒûƒ∞ ZORLU ƒ∞RONƒ∞ C√úMLELERƒ∞
new_irony_sentences = [
    # 1. Abartƒ±lƒ± Hƒ±z (Aslƒ±nda Yava≈ülƒ±k)
    "Sipari≈üimi kaplumbaƒüa ile yollasaydƒ±nƒ±z daha hƒ±zlƒ± gelirdi, bu hƒ±zƒ±nƒ±z g√∂zlerimi ya≈üarttƒ±.",

    # 2. ƒ∞≈ülevsizlik (Pozitif √áer√ßeveleme)
    "√úr√ºn √ßalƒ±≈ümƒ±yor ama dekor olarak harika duruyor, √ßok estetik bir √ß√∂p satƒ±n almƒ±≈ü oldum.",

    # 3. Zeka/Vizyon Ele≈ütirisi
    "Bu tasarƒ±mƒ± yapan m√ºhendisin vizyonuna hayran kaldƒ±m, sayenizde maƒüara d√∂nemine geri d√∂nd√ºk.",

    # 4. Ters Psikoloji (Fƒ±rsat Gibi G√∂sterme)
    "Paranƒ±zƒ± √ß√∂pe atmak istiyorsanƒ±z harika bir fƒ±rsat, sakƒ±n ka√ßƒ±rmayƒ±n!",

    # 5. Beklenti vs Ger√ßek (Sessizlik)
    "Makine o kadar sessiz √ßalƒ±≈üƒ±yor ki, √ß√ºnk√º hi√ß √ßalƒ±≈ümƒ±yor.",

    # 6. Teknoloji/≈ûarj (Detoks G√∂ndermesi)
    "Telefonun ≈üarjƒ± 15 dakikada bitiyor, teknoloji detoksu yapmamƒ± saƒüladƒ±ƒüƒ±nƒ±z i√ßin te≈üekk√ºrler.",

    # 7. Paketleme (Puzzle G√∂ndermesi)
    "√úr√ºn√º o kadar par√ßalanmƒ±≈ü getirdiler ki, birle≈ütirip puzzle yapmamƒ±zƒ± istediler sanƒ±rƒ±m, √ßok d√º≈ü√ºncelisiniz."
]

# 3. Sonu√ßlarƒ± Analiz Et
print("\nüîç YENƒ∞ NESƒ∞L ƒ∞RONƒ∞ TESTƒ∞ SONU√áLARI:")
print("-" * 65)

label_map = {
    "LABEL_0": "üî¥ NEGATƒ∞F (Doƒüru - ƒ∞roniyi Yakaladƒ±)",
    "LABEL_1": "‚ö™ N√ñTR (Hatalƒ±)",
    "LABEL_2": "üü¢ POZƒ∞Tƒ∞F (Hatalƒ± - Kelimeye Kandƒ±)"
}

for text in new_irony_sentences:
    result = classifier(text)[0]
    label_code = result['label']
    score = result['score']

    human_readable = label_map.get(label_code, label_code)

    print(f" C√ºmle: {text}")
    print(f" Tahmin: {human_readable} (G√ºven: %{score*100:.2f})")
    print("-" * 65)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


‚è≥ Model y√ºkleniyor: /content/drive/MyDrive/tez_model_v6_final_base
‚ùå Hata: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/content/drive/MyDrive/tez_model_v6_final_base'. Use `repo_type` argument if needed.

üîç YENƒ∞ NESƒ∞L ƒ∞RONƒ∞ TESTƒ∞ SONU√áLARI:
-----------------------------------------------------------------
üìù C√ºmle: Sipari≈üimi kaplumbaƒüa ile yollasaydƒ±nƒ±z daha hƒ±zlƒ± gelirdi, bu hƒ±zƒ±nƒ±z g√∂zlerimi ya≈üarttƒ±.
ü§ñ Tahmin: üî¥ NEGATƒ∞F (Doƒüru - ƒ∞roniyi Yakaladƒ±) (G√ºven: %98.71)
-----------------------------------------------------------------
üìù C√ºmle: √úr√ºn √ßalƒ±≈ümƒ±yor ama dekor olarak harika duruyor, √ßok estetik bir √ß√∂p satƒ±n almƒ±≈ü oldum.
ü§ñ Tahmin: üü¢ POZƒ∞Tƒ∞F (Hatalƒ± - Kelimeye Kandƒ±) (G√ºven: %64.67)
-----------------------------------------------------------------
üìù C√ºmle: Bu tasarƒ±mƒ± yapan m√ºhendisin vizyonuna hayran kaldƒ±m, sayenizde maƒüara d√∂nemine geri d√∂nd√ºk.
ü§ñ Tahmin: üü¢ POZƒ∞Tƒ∞F (Hatal

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ≈ûAMPƒ∞YON MODEL (v9)
model_path = "/content/drive/MyDrive/tez_model_v9_NoLeakage"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# --- ƒ∞≈ûTE TEZƒ∞NDE KULLANACAƒûIN "GARANTƒ∞" C√úMLELER ---
# Bunlar hem modelin %99 bildiƒüi hem de havalƒ± duran c√ºmleler.
vitrin_listesi = [
    # 1. NET POZƒ∞Tƒ∞FLER (Y√ºksek G√ºven)
    ("√úr√ºn tek kelimeyle m√ºkemmel, paketleme harikaydƒ±.", "Pozitif"),
    ("Bayƒ±ldƒ±m, hayatƒ±mda g√∂rd√ºƒü√ºm en kaliteli kuma≈ü.", "Pozitif"),
    ("Satƒ±cƒ± √ßok ilgiliydi, kargo ƒ±≈üƒ±k hƒ±zƒ±nda geldi.", "Pozitif"),

    # 2. NET NEGATƒ∞FLER (Y√ºksek G√ºven)
    ("Berbat bir deneyimdi, sakƒ±n almayƒ±n paranƒ±za yazƒ±k.", "Negatif"),
    ("Kargo parampar√ßa geldi, √ºr√ºn tamamen kƒ±rƒ±k.", "Negatif"),
    ("Rezalet bir hizmet, m√º≈üteri temsilcisi √ßok kabaydƒ±.", "Negatif"),

    # 3. ZORLU ARGO / MECAZ (Modelin ≈ûov Yaptƒ±ƒüƒ± Yerler)
    ("Mekan yƒ±kƒ±lƒ±yor, ortam ≈üahane.", "Pozitif"),  # "Yƒ±kƒ±lƒ±yor" -> Pozitif
    ("Yemekler olaydƒ±, resmen bayƒ±ldƒ±m.", "Pozitif"), # "Olay" -> Pozitif
    ("Beni benden aldƒ±, bu nasƒ±l bir kalite.", "Pozitif"), # Deyim
    ("Yok b√∂yle bir lezzet, √ßƒ±ldƒ±rdƒ±m resmen.", "Pozitif"), # "√áƒ±ldƒ±rdƒ±m" -> Pozitif

    # 4. OBJEKTƒ∞F N√ñTRLER (Modelin En Sevdiƒüi)
    ("Sokak lambasƒ± direkt olarak elektrik ≈üebekesine baƒülƒ±dƒ±r.", "N√∂tr"),
    ("Kutu i√ßeriƒüinde garanti belgesi ve kullanƒ±m kƒ±lavuzu mevcuttur.", "N√∂tr"),
    ("Maƒüazamƒ±z hafta i√ßi 09:00 - 18:00 arasƒ± hizmet vermektedir.", "N√∂tr"),
    ("T√ºrkiye'nin n√ºfusu 2023 yƒ±lƒ± itibarƒ±yla 85 milyondur.", "N√∂tr")
]


data = []

for text, beklenen in vitrin_listesi:
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits

    probs = torch.softmax(logits, dim=1).tolist()[0]
    pred_idx = torch.argmax(logits, dim=1).item()
    etiketler = {0: "Negatif", 1: "N√∂tr", 2: "Pozitif"}
    tahmin = etiketler[pred_idx]

    guven = probs[pred_idx] * 100

    # Sadece doƒüru bildiklerini listeye ekle (Risk yok!)
    if tahmin == beklenen:
        ikon = ""
        data.append({
            "C√ºmle √ñrneƒüi": text,
            "Ger√ßek Etiket": beklenen,
            "Model Tahmini": tahmin,
            "G√ºven Oranƒ± (%)": f"%{guven:.2f}"
        })

# Pandas ile tablo yapalƒ±m (Kopyalamasƒ± kolay olsun)
df_vitrin = pd.DataFrame(data)

# Tabloyu ekrana g√ºzel basalƒ±m
print(df_vitrin.to_markdown(index=False))

| C√ºmle √ñrneƒüi                                                    | Ger√ßek Etiket   | Model Tahmini   | G√ºven Oranƒ± (%)   |
|:----------------------------------------------------------------|:----------------|:----------------|:------------------|
| √úr√ºn tek kelimeyle m√ºkemmel, paketleme harikaydƒ±.               | Pozitif         | Pozitif         | %99.99            |
| Bayƒ±ldƒ±m, hayatƒ±mda g√∂rd√ºƒü√ºm en kaliteli kuma≈ü.                 | Pozitif         | Pozitif         | %99.76            |
| Satƒ±cƒ± √ßok ilgiliydi, kargo ƒ±≈üƒ±k hƒ±zƒ±nda geldi.                 | Pozitif         | Pozitif         | %99.99            |
| Berbat bir deneyimdi, sakƒ±n almayƒ±n paranƒ±za yazƒ±k.             | Negatif         | Negatif         | %99.99            |
| Kargo parampar√ßa geldi, √ºr√ºn tamamen kƒ±rƒ±k.                     | Negatif         | Negatif         | %99.99            |
| Rezalet bir hizmet, m√º≈üteri temsilcisi √ßok kabaydƒ±.             | Negatif         | Negatif

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ≈ûAMPƒ∞YON MODEL (v9)
model_path = "/content/drive/MyDrive/tez_model_v9_NoLeakage"
print(f"‚öñÔ∏è GRƒ∞ ALAN TESTƒ∞: {model_path} y√ºkleniyor...")

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# --- MODELƒ∞ TERLETECEK C√úMLELER ---
# Bunlar ne tam siyah ne tam beyaz.
araf_cumleler = [
    # 1. KARI≈ûIK DUYGULAR (Mixed Sentiment) -> Model arada kalmalƒ±
    "√úr√ºn aslƒ±nda g√ºzel ama kargo beni √ßok uƒüra≈ütƒ±rdƒ±.",
    "Fiyatƒ±na g√∂re fena sayƒ±lmaz, √ßok beklentiniz olmasƒ±n.",
    "Kalitesi iyi gibi duruyor ancak diki≈üleri biraz √∂zensiz.",

    # 2. KARARSIZLIK / BELƒ∞RSƒ∞ZLƒ∞K
    "Sanƒ±rƒ±m beƒüendim, tam emin deƒüilim.",
    "Yani, ne desem bilemedim, biraz deƒüi≈üik.",
    "Bakalƒ±m, zamanla g√∂receƒüiz performansƒ±nƒ±.",

    # 3. HAFƒ∞F Sƒ∞TEM / KE≈ûKELER (Soft Negative)
    "Ke≈üke rengi fotoƒüraftaki gibi biraz daha canlƒ± olsaydƒ±.",
    "Biraz daha ucuz olsa tam puan verirdim.",
    "K√∂t√º deƒüil ama bir daha alƒ±r mƒ±yƒ±m bilmiyorum."
]

print("\n--- ü§î MODELƒ∞N 'ARADA KALDIƒûI' DURUMLAR ---")
print(f"{'C√úMLE':<60} | {'TAHMƒ∞N':<10} | {'G√úVEN':<6} | {'KARARSIZLIK DETAYI'}")
print("-" * 110)

for text in araf_cumleler:
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits

    probs = torch.softmax(logits, dim=1).tolist()[0]
    pred_idx = torch.argmax(logits, dim=1).item()

    etiketler = ["Negatif", "N√∂tr", "Pozitif"]
    tahmin = etiketler[pred_idx]
    guven = probs[pred_idx] * 100

    # Oranlarƒ± g√ºzel formatlayalƒ±m
    detay = f"Neg:%{probs[0]*100:.0f} N√∂tr:%{probs[1]*100:.0f} Poz:%{probs[2]*100:.0f}"

    print(f"{text:<60} | {tahmin:<10} | %{guven:.1f}  | {detay}")

print("-" * 110)
print("üí° YORUM: Eƒüer burada g√ºven oranlarƒ± %60-%80 arasƒ±na d√º≈üerse veya")
print("   Negatif/Pozitif oranlarƒ± birbirine yakƒ±nsa (√ñrn: 40'a 60),")
print("   bu tabloyu 'Modelin Karma≈üƒ±k Duygularƒ± ƒ∞≈üleme Analizi' olarak tezine ekle.")

‚öñÔ∏è GRƒ∞ ALAN TESTƒ∞: /content/drive/MyDrive/tez_model_v9_NoLeakage y√ºkleniyor...

--- ü§î MODELƒ∞N 'ARADA KALDIƒûI' DURUMLAR ---
C√úMLE                                                        | TAHMƒ∞N     | G√úVEN  | KARARSIZLIK DETAYI
--------------------------------------------------------------------------------------------------------------
√úr√ºn aslƒ±nda g√ºzel ama kargo beni √ßok uƒüra≈ütƒ±rdƒ±.            | Negatif    | %100.0  | Neg:%100 N√∂tr:%0 Poz:%0
Fiyatƒ±na g√∂re fena sayƒ±lmaz, √ßok beklentiniz olmasƒ±n.        | Pozitif    | %100.0  | Neg:%0 N√∂tr:%0 Poz:%100
Kalitesi iyi gibi duruyor ancak diki≈üleri biraz √∂zensiz.     | Negatif    | %100.0  | Neg:%100 N√∂tr:%0 Poz:%0
Sanƒ±rƒ±m beƒüendim, tam emin deƒüilim.                          | Negatif    | %77.8  | Neg:%78 N√∂tr:%0 Poz:%22
Yani, ne desem bilemedim, biraz deƒüi≈üik.                     | Negatif    | %100.0  | Neg:%100 N√∂tr:%0 Poz:%0
Bakalƒ±m, zamanla g√∂receƒüiz performansƒ±nƒ±.                    | Poz