In [None]:
from transformers import pipeline
import torch
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Integer device index handed to the pipeline below: 0 when CUDA is available, otherwise -1.
device = 0 if torch.cuda.is_available() else -1

# Fill-mask pipeline for the cased Turkish BERT checkpoint; augment_text_with_berturk() uses it
# to predict a replacement for each masked word.
mask_filler = pipeline("fill-mask", model="dbmdz/bert-base-turkish-cased", device=device)


def augment_text_with_berturk(text, num_variations=3, max_attempts=None):
    """Create variations of ``text`` by masking random words and letting the
    fill-mask model pick replacements.

    For each variation the text is split on whitespace and, once per word, a
    random position is masked and replaced by the model's top prediction.
    Replacements accumulate within one variation.

    Args:
        text: Source text.
        num_variations: Number of distinct variations requested.
        max_attempts: Upper bound on generation rounds. Defaults to
            ``num_variations * 10``. Without a bound the loop never ends when
            the model keeps producing texts that are already in the result
            (for example a one-word input).

    Returns:
        A list of distinct augmented texts in arbitrary order. It is empty for
        whitespace-only input or ``num_variations <= 0`` and may be shorter
        than ``num_variations`` when ``max_attempts`` is exhausted.
    """
    base_words = text.split()
    # random.randint(0, -1) raises ValueError, so bail out on empty input.
    if not base_words or num_variations <= 0:
        return []

    if max_attempts is None:
        max_attempts = num_variations * 10

    # Ask the tokenizer for its mask token instead of assuming "[MASK]".
    mask_token = mask_filler.tokenizer.mask_token
    augmented_texts = set()

    for _ in range(max_attempts):
        if len(augmented_texts) >= num_variations:
            break

        words = base_words.copy()
        for _ in range(len(words)):
            masked_text = words.copy()
            mask_index = random.randint(0, len(masked_text) - 1)
            masked_text[mask_index] = mask_token
            masked_sentence = " ".join(masked_text)

            predictions = mask_filler(masked_sentence)

            if isinstance(predictions, list) and len(predictions) > 0:
                # Only the first prediction is used.
                pred = predictions[0]
                if isinstance(pred, dict):
                    token_str = pred.get('token_str', '').strip()
                    if token_str:
                        words[mask_index] = token_str
            else:
                print("⚠️ Beklenmeyen çıktı formatı:", predictions)
                break

        augmented_texts.add(" ".join(words))

    if len(augmented_texts) < num_variations:
        print(
            f"⚠️ Only {len(augmented_texts)} of {num_variations} distinct variations "
            f"were produced after {max_attempts} attempts."
        )

    return list(augmented_texts)


# Sample Turkish news snippet used to try out the augmentation function.
original_text = """
Ayaküstü üye kazandılar Saadet Partisi il Başkan Yardımcısı Salih Kocatepe, 
iznik te esnaf ziyareti sırasında iznik esnaflarından partiye üye olmak isteyen 
Osman Yıldız adlı vatandaşa kendi rozetini çıkartarak taktı. Üye olan Osman Yıldız 
Saadet Partisini yakından takip ediyorum. Geriye dönük Rahmetli Erbakan Hocaya duyduğum 
sevgi ve saygımdan dolayı ülkemiz menfaatlerini düşünerek işçi memur haklarını tam hakkı 
ile savunarak mili görüş çatısı altında buluşturdu. Bende milli görüş davasında çalışacağım...
"""

# Generate three masked-LM variations of the sample text.
augmented_versions = augment_text_with_berturk(original_text, num_variations=3)


def calculate_similarity(original_text, augmented_texts):
    """Score every augmented text against the original with cosine similarity
    over bag-of-words count vectors.

    Args:
        original_text: Reference text.
        augmented_texts: List of texts to compare with the reference.

    Returns:
        A list with one similarity value per augmented text, in input order.
    """
    corpus = [original_text] + augmented_texts
    count_vectors = CountVectorizer().fit_transform(corpus).toarray()
    pairwise = cosine_similarity(count_vectors)

    # Row 0 belongs to the original text; column 0 is its self-similarity and is skipped.
    return [pairwise[0][col] for col in range(1, len(pairwise))]

# Compare each variation with the original text.
similarity_scores = calculate_similarity(original_text, augmented_versions)

# Print every variation next to its similarity score (numbering starts at 1).
print("🎯 **Oluşturulan Varyasyonlar ve Benzerlik Skorları:**")
for i, (text, score) in enumerate(zip(augmented_versions, similarity_scores)):
    print(f"\n📝 **Varyasyon {i+1}:** {text}")
    print(f"🔗 **Benzerlik Skoru:** {score:.4f}")

In [None]:
###llama3-8b-8192

import os

from groq import Groq

# Read the key from the environment instead of hard-coding it in the notebook.
# The previous value was the literal placeholder string "GROQ_API_KEY", which was
# sent to the API as if it were the key. When the variable is unset this is None.
# NOTE(review): the Groq client is expected to handle a None key itself - confirm.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

client = Groq(
    api_key=GROQ_API_KEY,
)
original_text = """
Ayaküstü üye kazandılar Saadet Partisi il Başkan Yardımcısı Salih Kocatepe, 
iznik te esnaf ziyareti sırasında iznik esnaflarından partiye üye olmak isteyen 
Osman Yıldız adlı vatandaşa kendi rozetini çıkartarak taktı. Üye olan Osman Yıldız 
Saadet Partisini yakından takip ediyorum. Geriye dönük Rahmetli Erbakan Hocaya duyduğum 
sevgi ve saygımdan dolayı ülkemiz menfaatlerini düşünerek işçi memur haklarını tam hakkı 
ile savunarak mili görüş çatısı altında buluşturdu. Bende milli görüş davasında çalışacağım...
"""

# Ask the model to rewrite the text in Turkish without changing its meaning.
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": f"Bu metni anlamını kaybetmeden yeniden Türkçe yaz: {original_text}",
        }
    ],
    model="llama3-8b-8192",
)

print(chat_completion.choices[0].message.content)


In [None]:
###Mixtral-8x7b-32768

import os

from groq import Groq

# Read the key from the environment instead of hard-coding it in the notebook.
# The previous value was the literal placeholder string "GROQ_API_KEY", which was
# sent to the API as if it were the key. When the variable is unset this is None.
# NOTE(review): the Groq client is expected to handle a None key itself - confirm.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

client = Groq(
    api_key=GROQ_API_KEY,
)
original_text = """
Ayaküstü üye kazandılar Saadet Partisi il Başkan Yardımcısı Salih Kocatepe, 
iznik te esnaf ziyareti sırasında iznik esnaflarından partiye üye olmak isteyen 
Osman Yıldız adlı vatandaşa kendi rozetini çıkartarak taktı. Üye olan Osman Yıldız 
Saadet Partisini yakından takip ediyorum. Geriye dönük Rahmetli Erbakan Hocaya duyduğum 
sevgi ve saygımdan dolayı ülkemiz menfaatlerini düşünerek işçi memur haklarını tam hakkı 
ile savunarak mili görüş çatısı altında buluşturdu. Bende milli görüş davasında çalışacağım...
"""

# Same paraphrasing prompt as the llama cell, sent to the Mixtral model.
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": f"Bu metni anlamını kaybetmeden yeniden Türkçe yaz: {original_text}",
        }
    ],
    model="mixtral-8x7b-32768",
)

print(chat_completion.choices[0].message.content)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "ytu-ce-cosmos/turkish-gpt2-large"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

input_text = """
Ayaküstü üye kazandılar Saadet Partisi il Başkan Yardımcısı Salih Kocatepe, 
iznik te esnaf ziyareti sırasında iznik esnaflarından partiye üye olmak isteyen 
Osman Yıldız adlı vatandaşa kendi rozetini çıkartarak taktı. Üye olan Osman Yıldız 
Saadet Partisini yakından takip ediyorum. Geriye dönük Rahmetli Erbakan Hocaya duyduğum 
sevgi ve saygımdan dolayı ülkemiz menfaatlerini düşünerek işçi memur haklarını tam hakkı 
ile savunarak mili görüş çatısı altında buluşturdu. Bende milli görüş davasında çalışacağım...
"""

# The prompt ends with a marker; everything generated after it is treated as the paraphrase.
prompt = f"Aşağıdaki cümleyi anlamını koruyarak yeniden ifade et:\n{input_text}\nYeniden ifade edilmiş hali:"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

output = model.generate(
    **inputs,
    # Budget for the continuation only. max_length also counts the prompt tokens,
    # so a long prompt like this one left little or no room for the paraphrase.
    max_new_tokens=256,
    num_return_sequences=1,
    temperature=0.6,  # Sampling temperature (output diversity)
    top_p=0.9,        # Nucleus sampling: focus on high-probability tokens
    do_sample=True,   # Enable random sampling
    pad_token_id=tokenizer.eos_token_id
)

# Keep only the text after the last occurrence of the marker.
augmented_text = tokenizer.decode(output[0], skip_special_tokens=True).split("Yeniden ifade edilmiş hali:")[-1].strip()

print("Orijinal Metin:", input_text)
print("Augmente Edilmiş Metin:", augmented_text)


-----

In [1]:
import kagglehub
# Fetch the dataset through kagglehub; the returned value is kept in news_dataset
# and passed to os.path.join() when the dataset path is built.
news_dataset = kagglehub.dataset_download('busragural/news-dataset')

In [2]:
import pandas as pd
import torch
import numpy as np
import os
from transformers import AutoTokenizer, AutoModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from collections import Counter
import time
import random
from transformers import pipeline
from xgboost import XGBClassifier

In [None]:
# Notebook-wide torch device: CUDA when available, otherwise CPU. get_embeddings() moves the
# model and its inputs to this device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

In [None]:
'''
!pip install GPUtil

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()
'''

In [None]:
# os.path.join() discards every earlier component once a later one is absolute, so the
# previous '/kaggle/input/...' argument silently ignored news_dataset. Join only the file
# name so the path follows wherever kagglehub placed the dataset.
dataset_path = os.path.join(news_dataset, 'interpress_news_category_tr_270k_train.tsv')
df = pd.read_csv(dataset_path, sep='\t')
df.head()

In [None]:
# Number of rows per category code.
df['CategoryCode'].value_counts()


In [7]:
# Drop rows whose 'Content' is missing. This rebinds df, so re-running the cell is harmless.
df = df.dropna(subset=['Content'])

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the cased Turkish BERT checkpoint; used below only to truncate article texts.
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

def truncate_texts_in_dataframe(df, text_column, tokenizer, max_length=512):
    """Return a copy of ``df`` whose ``text_column`` is truncated by the tokenizer.

    Each text is tokenized with ``truncation=True`` and ``max_length`` and then
    decoded back to a string without special tokens, so the stored text is the
    tokenizer's decoded form rather than a character-level prefix.

    Args:
        df: Frame containing the text column.
        text_column: Name of the column to truncate.
        tokenizer: Callable tokenizer returning a mapping with ``'input_ids'``
            and providing ``decode``.
        max_length: Maximum number of tokens passed to the tokenizer.

    Returns:
        A new DataFrame. The input frame is left untouched; the previous
        implementation overwrote the column on the caller's object.
    """
    def _truncate(text):
        token_ids = tokenizer(text, truncation=True, max_length=max_length)['input_ids']
        return tokenizer.decode(token_ids, skip_special_tokens=True)

    truncated_df = df.copy()
    truncated_df[text_column] = truncated_df[text_column].apply(_truncate)
    return truncated_df

# Draw 20 articles per category for training (fixed seed).
train_df = df.groupby('CategoryCode').apply(lambda x: x.sample(n=20, random_state=1)).reset_index(drop=True)
# Stack df and train_df and drop every row that occurs more than once (keep=False). This removes
# the training rows from the pool, along with any rows that were already duplicated inside df.
remaining_df = pd.concat([df, train_df]).drop_duplicates(keep=False)
# Draw 20 articles per category for testing from what is left.
test_df = remaining_df.groupby('CategoryCode').apply(lambda x: x.sample(n=20, random_state=1)).reset_index(drop=True)

# Truncate the article text with the tokenizer (default max_length of 512).
train_df = truncate_texts_in_dataframe(train_df, 'Content', tokenizer)
test_df = truncate_texts_in_dataframe(test_df, 'Content', tokenizer)

print(f"✅ Train set: {len(train_df)} sample")
print(f"✅ Test set: {len(test_df)} sample")

# Persist both splits as CSV files in the working directory.
train_df.to_csv("trainSet.csv", index=False)
test_df.to_csv("testSet.csv", index=False)


In [9]:
# Embedding checkpoints keyed by a short name; the key is also used as the folder name
# under embeddings/. Commented-out entries are currently disabled.
embedding_models = {
    "minilm": "sentence-transformers/all-MiniLM-L12-v2",
    #"jina": "jinaai/jina-embeddings-v3",
    #"bge": "BAAI/bge-m3"
    }
# Classifiers trained on the embeddings; only XGBoost is currently enabled.
classifiers = {
    #"SVM": SVC(),
    #"RandomForest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(
        n_estimators=100,   # Number of trees
        max_depth=6,        # Maximum tree depth
        learning_rate=0.1,  # Learning rate
        subsample=0.8,      # Row subsampling ratio
        colsample_bytree=0.8,  # Fraction of features sampled for each tree
        use_label_encoder=False,
        eval_metric="mlogloss"
    )
    #"MLP": MLPClassifier(max_iter=500)
}

In [10]:
#rm -rf /kaggle/working

In [11]:
# Loaded (tokenizer, model) pairs keyed by checkpoint name, so repeated calls
# (one per augmented article) do not reload the weights every time.
_EMBEDDING_MODEL_CACHE = {}


def _load_embedding_model(model_name):
    """Return a cached ``(tokenizer, model)`` pair, loading it on first use."""
    if model_name not in _EMBEDDING_MODEL_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name).to(device)
        model.eval()
        _EMBEDDING_MODEL_CACHE[model_name] = (tokenizer, model)
    return _EMBEDDING_MODEL_CACHE[model_name]


def get_embeddings(texts, model_name, batch_size=32, max_length=512):
    """Embed ``texts`` with mean pooling over the model's last hidden state.

    Padding positions are excluded from the mean by weighting with the
    attention mask. The previous plain ``mean(dim=1)`` averaged padding tokens
    too, so the vector of a text changed with the length of the other texts in
    its batch.

    NOTE(review): embeddings already saved to disk were computed with the old
    pooling and should be regenerated so they are comparable with new ones.

    Args:
        texts: Sliceable sequence of strings (list or pandas Series).
        model_name: Checkpoint name passed to ``from_pretrained``.
        batch_size: Number of texts per forward pass.
        max_length: Token limit passed to the tokenizer (with truncation).

    Returns:
        A float32 array of shape ``(len(texts), hidden_size)``; zero rows for
        empty input.
    """
    tokenizer, model = _load_embedding_model(model_name)

    if len(texts) == 0:
        # np.vstack([]) raises, so return an explicitly empty matrix instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(
            list(batch_texts),
            padding=True,
            truncation=True,
            max_length=max_length,
            add_special_tokens=True,
            return_tensors="pt"
        ).to(model.device)  # follow the model so inputs and weights stay on one device

        with torch.no_grad():
            outputs = model(**inputs)

        hidden = outputs.last_hidden_state
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        # Sum only real tokens and divide by their count; clamp guards against division by zero.
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        embeddings.append(pooled.cpu().type(torch.float32).numpy())

    return np.vstack(embeddings)

def get_embeddings_and_save(texts, model_name, output_file, batch_size=32):
    """Embed ``texts`` with ``get_embeddings`` and store the matrix via ``np.save``.

    Args:
        texts: Texts to embed.
        model_name: Checkpoint name forwarded to ``get_embeddings``.
        output_file: Destination path handed to ``np.save``.
        batch_size: Batch size forwarded to ``get_embeddings``.

    Returns:
        The embedding matrix that was written.
    """
    vectors = get_embeddings(texts, model_name, batch_size=batch_size)
    np.save(output_file, vectors)
    print(f"Embeddings are saved to {output_file}.")
    return vectors

In [None]:
# For every configured embedding model, write one .npy file per category and split under
# embeddings/<model_key>/{train,test}/. Files that already exist are reused, not recomputed.
for model_key, model_name in embedding_models.items():
    print(f"\nCreating embeddings for Model: {model_name}..")
    
    model_folder = f"embeddings/{model_key}"
    os.makedirs(f"{model_folder}/train", exist_ok=True)
    os.makedirs(f"{model_folder}/test", exist_ok=True)
    
    # Train Embeddings
    for topic in train_df['CategoryCode'].unique():
        train_embeddings_file = os.path.join(model_folder, "train", f"embeddings_{topic}.npy")
        if not os.path.exists(train_embeddings_file):
            # Rows keep their order within the category, so row i of the file matches the
            # i-th article of that category in train_df.
            topic_texts = train_df[train_df['CategoryCode'] == topic]['Content']
            get_embeddings_and_save(topic_texts, model_name, train_embeddings_file)
        else:
            print(f"ℹ️ {train_embeddings_file} already existed.")
    
    # Test Embeddings
    for topic in test_df['CategoryCode'].unique():
        test_embeddings_file = os.path.join(model_folder, "test", f"embeddings_{topic}.npy")
        if not os.path.exists(test_embeddings_file):
            topic_texts = test_df[test_df['CategoryCode'] == topic]['Content']
            get_embeddings_and_save(topic_texts, model_name, test_embeddings_file)
        else:
            print(f"ℹ️ {test_embeddings_file} already existed.")

In [None]:
#data_sizes = [850,680,510,340,170,85]
# Total number of training samples per experiment, split evenly across categories below.
data_sizes = [340, 204, 136, 68]
test_embeddings = []
test_labels = []

for topic in test_df['CategoryCode'].unique():
    # Load from the same relative location the embedding cell writes to. The previous
    # absolute '/kaggle/working/...' path only worked when the working directory was
    # /kaggle/working and disagreed with every other cell.
    embeddings = np.load(f"embeddings/minilm/test/embeddings_{topic}.npy")
    test_embeddings.append(embeddings)
    test_labels += [topic] * len(embeddings)

test_embeddings = np.vstack(test_embeddings)
test_labels = np.array(test_labels)

# Training
results = {size: [] for size in data_sizes}

for size in data_sizes:
    print(f"\nData size: {size}")
    train_embeddings = []
    train_labels = []

    for topic in train_df['CategoryCode'].unique():
        # Take the first size / number-of-categories rows of each category file.
        embeddings = np.load(f"embeddings/minilm/train/embeddings_{topic}.npy")[:size // len(train_df['CategoryCode'].unique())]
        train_embeddings.append(embeddings)
        train_labels += [topic] * len(embeddings)

    train_embeddings = np.vstack(train_embeddings)
    train_labels = np.array(train_labels)

    for clf_name, clf in classifiers.items():
        # The same classifier object is refitted for every data size.
        clf.fit(train_embeddings, train_labels)
        predictions = clf.predict(test_embeddings)
        accuracy = accuracy_score(test_labels, predictions)
        f1 = f1_score(test_labels, predictions, average='weighted')
        print(f"🔄 {clf_name}: Accuracy={accuracy:.4f}, F1={f1:.4f}")

        results[size].append({
            "classifier": clf_name,
            "accuracy": accuracy,
            "f1_score": f1
        })


In [None]:
def plot_accuracy_vs_data_size(results):
    """Plot test accuracy against training-set size, one line per classifier.

    Reads the notebook-level ``classifiers`` and ``data_sizes``.

    Args:
        results: Mapping from data size to a list of result dicts, each with
            the keys ``'classifier'`` and ``'accuracy'``.
    """
    plt.figure(figsize=(12, 8))

    for clf_name in classifiers.keys():
        accuracies = []
        for size in data_sizes:
            # First result recorded for this classifier at this size, if any.
            matching = None
            for res in results[size]:
                if res['classifier'] == clf_name:
                    matching = res
                    break
            accuracies.append(matching['accuracy'] if matching else None)

        plt.plot(data_sizes, accuracies, marker='o', linestyle='-', label=f"{clf_name}")

        # Annotate every available point with its accuracy.
        for x_pos, acc in zip(data_sizes, accuracies):
            if acc is None:
                continue
            plt.text(x_pos, acc, f"{acc:.2f}",
                     fontsize=8, color='black', ha='center', va='bottom')

    plt.title('Effect of Training Dataset Sizes on Test Accuracy')
    plt.xlabel('Dataset Size')
    plt.ylabel('Accuracy')
    plt.xticks(data_sizes)
    plt.legend(title='Classifiers')
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()


plot_accuracy_vs_data_size(results)

In [None]:
'''
# DBMDZ BERT Pipeline
mask_filler = pipeline("fill-mask", model="dbmdz/bert-base-turkish-cased", device=device)

device = 0 if torch.cuda.is_available() else -1

def augment_text_with_berturk(text, num_variations=3, max_length=512):
    augmented_texts = set()
    tokenizer = mask_filler.tokenizer  # Mask filler'dan tokenizer alın
    text = truncate_text(text, tokenizer, max_length=max_length)  # Metni kısalt

    while len(augmented_texts) < num_variations:
        words = text.split() 
        x = max(1, len(words))

        for i in range(x):
            masked_text = words.copy()
            mask_index = random.randint(0, len(masked_text) - 1)
            masked_text[mask_index] = "[MASK]"
            masked_sentence = " ".join(masked_text)
            
            # Uzunluk kontrolü
            if len(tokenizer(masked_sentence)['input_ids']) > 512:
                print("⚠️ Masked sentence too long, skipping:", masked_sentence)
                continue

            
            predictions = mask_filler(masked_sentence)
            if isinstance(predictions, list) and len(predictions) > 0:
                pred = predictions[0]  # İlk tahmini al
                if isinstance(pred, dict):
                    token_str = pred.get('token_str', '').strip()
                    if token_str:
                        words[mask_index] = token_str
            else:
                print("⚠️ Beklenmeyen çıktı formatı:", predictions)
                break
        
        new_text = " ".join(words)
        augmented_texts.add(new_text)
    
    return list(augmented_texts)
def truncate_text(text, tokenizer, max_length=512):
    tokens = tokenizer.tokenize(text)
    if len(tokens) > max_length:
        truncated_tokens = tokens[:max_length]
        truncated_text = tokenizer.convert_tokens_to_string(truncated_tokens)
        return truncated_text
    return text
'''

In [16]:
def majority_voting(original_pred, aug_preds):
    """Pick the most frequent label among the original and augmented predictions.

    Ties are resolved deterministically: the original prediction wins when it
    is among the most frequent labels, otherwise the tied label that appears
    first in ``aug_preds`` wins. The previous ``max(set(votes), ...)`` depended
    on set iteration order.

    Args:
        original_pred: Prediction for the unmodified sample.
        aug_preds: Iterable of predictions for the augmented samples (list or
            array; it is converted to a list first, so an array is not
            broadcast-added).

    Returns:
        The winning label.
    """
    votes = list(aug_preds) + [original_pred]
    counts = Counter(votes)
    top_count = max(counts.values())

    if counts[original_pred] == top_count:
        return original_pred
    return next(vote for vote in votes if counts[vote] == top_count)

In [None]:
from transformers import pipeline
import random

def lowercase_turkish(text):
    """Lowercase ``text`` using the Turkish mapping for both capital I letters.

    Dotless capital "I" becomes "ı" and dotted capital "İ" becomes "i". Both
    are replaced before ``str.lower()`` runs, because the default lowering maps
    "I" to "i" and "İ" to "i" followed by a combining dot above (U+0307).
    """
    return text.replace("İ", "i").replace("I", "ı").lower()

# Keep the integer device index for the pipeline under its own name. Rebinding the
# notebook-wide `device` here replaced the torch.device created earlier, which
# get_embeddings() still passes to .to(); without CUDA that became .to(-1).
pipeline_device = 0 if torch.cuda.is_available() else -1

# Fill-mask pipeline for the uncased Turkish BERT checkpoint used by augment_text_with_berturk().
mask_filler = pipeline("fill-mask", model="ytu-ce-cosmos/turkish-base-bert-uncased", device=pipeline_device)

def augment_text_with_berturk(text, num_variations=3, max_length=512, max_attempts=None):
    """Create lowercase variations of ``text`` by masking random words and
    letting the fill-mask model pick replacements.

    The text is lowercased with Turkish rules and truncated. For each variation
    roughly a third of the word positions (at least one) are masked one at a
    time and replaced by the model's top prediction; replacements accumulate
    within one variation.

    Args:
        text: Source text.
        num_variations: Number of distinct variations requested.
        max_length: Token budget for the text, including the special tokens
            the tokenizer adds.
        max_attempts: Upper bound on generation rounds. Defaults to
            ``num_variations * 10``. Without a bound the loop never ends when
            no new distinct text can be produced.

    Returns:
        A list of distinct augmented texts in arbitrary order. It is empty for
        whitespace-only input or ``num_variations <= 0`` and may be shorter
        than ``num_variations`` when ``max_attempts`` is exhausted.
    """
    tokenizer = mask_filler.tokenizer
    # Ask the tokenizer for its mask token instead of assuming "[MASK]".
    mask_token = tokenizer.mask_token
    text = lowercase_turkish(text)

    # truncate_text() counts tokens without special tokens, while the length check below
    # counts them. Reserve room for them, otherwise a text truncated to exactly the limit
    # fails the check and almost every masking step is skipped.
    content_budget = max(1, min(max_length, 512) - tokenizer.num_special_tokens_to_add())
    text = truncate_text(text, tokenizer, max_length=content_budget)

    base_words = text.split()
    # random.randint(0, -1) raises ValueError, so bail out on empty input.
    if not base_words or num_variations <= 0:
        return []

    if max_attempts is None:
        max_attempts = num_variations * 10

    augmented_texts = set()

    for _ in range(max_attempts):
        if len(augmented_texts) >= num_variations:
            break

        words = base_words.copy()
        for _ in range(max(1, len(words) // 3)):
            masked_text = words.copy()
            mask_index = random.randint(0, len(masked_text) - 1)
            masked_text[mask_index] = mask_token
            masked_sentence = " ".join(masked_text)

            # Safety net kept from the original: skip inputs longer than 512 tokens.
            # NOTE(review): 512 is presumably the model's input limit - confirm.
            if len(tokenizer(masked_sentence)['input_ids']) > 512:
                print("⚠️ Masked sentence too long, skipping:", masked_sentence)
                continue

            predictions = mask_filler(masked_sentence)
            if isinstance(predictions, list) and len(predictions) > 0:
                # Only the first prediction is used.
                pred = predictions[0]
                if isinstance(pred, dict):
                    token_str = pred.get('token_str', '').strip()
                    if token_str:
                        words[mask_index] = token_str
            else:
                print("⚠️ Beklenmeyen çıktı formatı:", predictions)
                break

        augmented_texts.add(" ".join(words))

    if len(augmented_texts) < num_variations:
        print(
            f"⚠️ Only {len(augmented_texts)} of {num_variations} distinct variations "
            f"were produced after {max_attempts} attempts."
        )

    return list(augmented_texts)

def truncate_text(text, tokenizer, max_length=512):
    """Cut ``text`` down to at most ``max_length`` tokenizer tokens.

    Args:
        text: Text to shorten.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_string``.
        max_length: Maximum number of tokens to keep.

    Returns:
        ``text`` unchanged when it fits; otherwise the string rebuilt from the
        first ``max_length`` tokens.
    """
    pieces = tokenizer.tokenize(text)
    if len(pieces) <= max_length:
        return text
    return tokenizer.convert_tokens_to_string(pieces[:max_length])
    

In [None]:
# For every test article: create 3 masked-LM variations, embed them, and save both the texts
# and the embeddings under a file name built from the topic and the article's position in it.
for topic in test_df['CategoryCode'].unique():
    # Loaded only so zip() below pairs each text with a stored embedding row; the embedding
    # itself is not used in this cell.
    embeddings = np.load(f"embeddings/minilm/test/embeddings_{topic}.npy")
    texts = test_df[test_df['CategoryCode'] == topic]['Content'].tolist()
    for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
        aug_texts_3 = augment_text_with_berturk(text, num_variations=3)
        print("3 Augmentation yapıldı.")
        
        aug_texts_3_file = f"augmented_texts/bert/3/{topic}_text_{idx}.npy"
        os.makedirs(os.path.dirname(aug_texts_3_file), exist_ok=True)
        np.save(aug_texts_3_file, aug_texts_3)
        print(f"3 Augmented texts saved to {aug_texts_3_file}.")

        aug_embeddings_3 = get_embeddings(aug_texts_3, embedding_models['minilm'])
        print("Yeni 3 embeddingler.")

        aug_embeddings_3_file = f"augmented_embeddings/bert/3/{topic}_embedding_{idx}.npy"
        os.makedirs(os.path.dirname(aug_embeddings_3_file), exist_ok=True)
        np.save(aug_embeddings_3_file, aug_embeddings_3)
        print(f"3 Augmented embeddings saved to {aug_embeddings_3_file}.")

In [None]:
# Same procedure as the 3-variation cell, but with 5 masked-LM variations per test article,
# saved under the .../bert/5/ folders.
for topic in test_df['CategoryCode'].unique():
    # Loaded only so zip() below pairs each text with a stored embedding row; the embedding
    # itself is not used in this cell.
    embeddings = np.load(f"embeddings/minilm/test/embeddings_{topic}.npy")
    texts = test_df[test_df['CategoryCode'] == topic]['Content'].tolist()
    for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
        aug_texts_5 = augment_text_with_berturk(text, num_variations=5)
        print("5 Augmentation yapıldı.")
        
        aug_texts_5_file = f"augmented_texts/bert/5/{topic}_text_{idx}.npy"
        os.makedirs(os.path.dirname(aug_texts_5_file), exist_ok=True)
        np.save(aug_texts_5_file, aug_texts_5)
        print(f"5 Augmented texts saved to {aug_texts_5_file}.")

        aug_embeddings_5 = get_embeddings(aug_texts_5, embedding_models['minilm'])
        print("Yeni 5 embeddingler.")

        aug_embeddings_5_file = f"augmented_embeddings/bert/5/{topic}_embedding_{idx}.npy"
        os.makedirs(os.path.dirname(aug_embeddings_5_file), exist_ok=True)
        np.save(aug_embeddings_5_file, aug_embeddings_5)
        print(f"5 Augmented embeddings saved to {aug_embeddings_5_file}.")

In [None]:
import random
import os

# Generate 5 masked-LM variations per test article and derive the 3-variation set as a random
# subset of them. This writes to the same file paths as the two preceding cells.
for topic in test_df['CategoryCode'].unique():
    # Loaded only so zip() below pairs each text with a stored embedding row.
    embeddings = np.load(f"embeddings/minilm/test/embeddings_{topic}.npy")
    texts = test_df[test_df['CategoryCode'] == topic]['Content'].tolist()

    for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
        aug_texts_5 = augment_text_with_berturk(text, num_variations=5)
        print("5 Augmentation yapıldı.")

        aug_texts_5_file = f"augmented_texts/bert/5/{topic}_text_{idx}.npy"
        os.makedirs(os.path.dirname(aug_texts_5_file), exist_ok=True)
        np.save(aug_texts_5_file, aug_texts_5)
        print(f"5 Augmented texts saved to {aug_texts_5_file}.")

        aug_embeddings_5 = get_embeddings(aug_texts_5, embedding_models['minilm'])
        print("5 Embedding oluşturuldu.")

        aug_embeddings_5_file = f"augmented_embeddings/bert/5/{topic}_embedding_{idx}.npy"
        os.makedirs(os.path.dirname(aug_embeddings_5_file), exist_ok=True)
        np.save(aug_embeddings_5_file, aug_embeddings_5)
        print(f"5 Augmented embeddings saved to {aug_embeddings_5_file}.")

        # Sample from what was actually returned instead of assuming exactly 5 items, so a
        # shorter result cannot raise IndexError. With 5 items this is the same draw as before.
        selected_indices = random.sample(range(len(aug_texts_5)), min(3, len(aug_texts_5)))
        aug_texts_3 = [aug_texts_5[i] for i in selected_indices]
        aug_embeddings_3 = [aug_embeddings_5[i] for i in selected_indices]

        aug_texts_3_file = f"augmented_texts/bert/3/{topic}_text_{idx}.npy"
        os.makedirs(os.path.dirname(aug_texts_3_file), exist_ok=True)
        np.save(aug_texts_3_file, aug_texts_3)
        print(f"3 Selected augmented texts saved to {aug_texts_3_file}.")

        aug_embeddings_3_file = f"augmented_embeddings/bert/3/{topic}_embedding_{idx}.npy"
        os.makedirs(os.path.dirname(aug_embeddings_3_file), exist_ok=True)
        np.save(aug_embeddings_3_file, aug_embeddings_3)
        print(f"3 Selected augmented embeddings saved to {aug_embeddings_3_file}.")


In [None]:
#data_sizes = [850, 680, 510, 340, 170, 85]

# Compare plain test accuracy with majority voting over the original prediction plus the
# predictions on the 5 or 3 saved augmented embeddings. Reuses data_sizes and test_labels
# from the baseline training cell.
results = {size: [] for size in data_sizes}

for size in data_sizes:
    print(f"\nTraining models with data size: {size}")

    train_embeddings = []
    train_labels = []
    for topic in train_df['CategoryCode'].unique():
        # Take the first size / number-of-categories rows of each category file.
        topic_embeddings = np.load(f"embeddings/minilm/train/embeddings_{topic}.npy")[:size // len(train_df['CategoryCode'].unique())]
        train_embeddings.append(topic_embeddings)
        train_labels += [topic] * len(topic_embeddings)

    train_embeddings = np.vstack(train_embeddings)
    train_labels = np.array(train_labels)

    for clf_name, clf in classifiers.items():
        clf.fit(train_embeddings, train_labels)
        print(f"🔄 {clf_name} trained with data size {size}.")

        original_test_predictions = []
        augmented_test_predictions_5 = []
        augmented_test_predictions_3 = []

        # Iterate topics and articles in the same order used to build test_labels, so the
        # prediction lists line up with it.
        for topic in test_df['CategoryCode'].unique():
            embeddings = np.load(f"embeddings/minilm/test/embeddings_{topic}.npy")
            texts = test_df[test_df['CategoryCode'] == topic]['Content'].tolist()
            for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
                original_pred = clf.predict([embedding])[0]
                original_test_predictions.append(original_pred)

                aug_embeddings_5_file = f"augmented_embeddings/bert/5/{topic}_embedding_{idx}.npy"
                aug_embeddings_5 = np.load(aug_embeddings_5_file)

                aug_embeddings_3_file = f"augmented_embeddings/bert/3/{topic}_embedding_{idx}.npy"
                aug_embeddings_3 = np.load(aug_embeddings_3_file)

                aug_preds_5 = clf.predict(aug_embeddings_5)

                aug_preds_3 = clf.predict(aug_embeddings_3)

                # The vote includes the original prediction alongside the augmented ones.
                final_pred_5 = majority_voting(original_pred, aug_preds_5.tolist())
                augmented_test_predictions_5.append(final_pred_5)

                final_pred_3 = majority_voting(original_pred, aug_preds_3.tolist())
                augmented_test_predictions_3.append(final_pred_3)

        original_accuracy = accuracy_score(test_labels, original_test_predictions)
        augmented_accuracy_5 = accuracy_score(test_labels, augmented_test_predictions_5)
        augmented_accuracy_3 = accuracy_score(test_labels, augmented_test_predictions_3)

        results[size].append({
            "classifier": clf_name,
            "original_accuracy": original_accuracy,
            "augmented_accuracy_5": augmented_accuracy_5,
            "augmented_accuracy_3": augmented_accuracy_3
        })
        print(f"✅ {clf_name} - Original Accuracy: {original_accuracy:.4f}, Augmented Accuracy (5): {augmented_accuracy_5:.4f}, Augmented Accuracy (3): {augmented_accuracy_3:.4f}")

# One line per classifier and variant; None is used when a classifier has no result for a size.
plt.figure(figsize=(12, 8))
for clf_name in classifiers.keys():
    original_accuracies = [next((res['original_accuracy'] for res in results[size] if res['classifier'] == clf_name), None) for size in data_sizes]
    augmented_accuracies_5 = [next((res['augmented_accuracy_5'] for res in results[size] if res['classifier'] == clf_name), None) for size in data_sizes]
    augmented_accuracies_3 = [next((res['augmented_accuracy_3'] for res in results[size] if res['classifier'] == clf_name), None) for size in data_sizes]

    plt.plot(data_sizes, original_accuracies, marker='o', linestyle='-', label=f"{clf_name} - Original")
    plt.plot(data_sizes, augmented_accuracies_5, marker='x', linestyle='--', label=f"{clf_name} - Augmented (5)")
    plt.plot(data_sizes, augmented_accuracies_3, marker='s', linestyle='-.', label=f"{clf_name} - Augmented (3)")

plt.title('Performance Comparison: Original vs Augmented Data')
plt.xlabel('Training Data Size')
plt.ylabel('Accuracy')
plt.xticks(data_sizes)
plt.legend(title='Classifiers')
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

----

In [None]:
%pip install groq

In [21]:
###llama3-8b-8192
import os
from groq import Groq

# Read the key from the environment so it never has to be typed into the notebook.
# Falls back to the previous empty string when the variable is not set.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

def augment_text_with_llama(text, num_variations=3, max_attempts=None):
    """Ask the Groq-hosted llama3-8b-8192 model for distinct paraphrases of ``text``.

    Args:
        text: Source text, embedded in the Turkish paraphrasing prompt.
        num_variations: Number of distinct paraphrases requested.
        max_attempts: Upper bound on API calls. Defaults to
            ``num_variations * 5``. The previous implementation retried
            forever, so a persistent failure (for example an invalid key)
            never returned.

    Returns:
        A list of distinct paraphrases in arbitrary order. It may be shorter
        than ``num_variations`` when ``max_attempts`` is exhausted.

    Raises:
        RuntimeError: If every attempt failed and nothing was produced; the
            last API error is attached as the cause.
    """
    client = Groq(api_key=GROQ_API_KEY)
    augmented_texts = set()

    if max_attempts is None:
        max_attempts = max(num_variations, 0) * 5

    last_error = None
    attempts = 0
    while len(augmented_texts) < num_variations and attempts < max_attempts:
        attempts += 1
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": f"Bu metni anlamını kaybetmeden yeniden Türkçe yaz ve yalnızca yeni metni yaz: {text}",
                    }
                ],
                model="llama3-8b-8192",
            )
            generated_text = (chat_completion.choices[0].message.content or "").strip()
            # An empty completion is not a usable variation; spend another attempt instead.
            if generated_text:
                augmented_texts.add(generated_text)
        except Exception as e:
            last_error = e
            print(f"⚠️ Error during augmentation: {e}")

    if len(augmented_texts) < num_variations:
        if not augmented_texts and last_error is not None:
            raise RuntimeError(
                f"Augmentation failed: no variation produced in {attempts} attempts."
            ) from last_error
        print(
            f"⚠️ Only {len(augmented_texts)} of {num_variations} distinct variations "
            f"were produced after {attempts} attempts."
        )

    return list(augmented_texts)



In [None]:
# Quick check: paraphrase the first test article three times and display the result.
aug_texts_3 = augment_text_with_llama(test_df.iloc[0]['Content'], num_variations=3)
aug_texts_3

In [None]:
# For every test article: request 3 LLM paraphrases, embed them, and save both the texts and
# the embeddings under a file name built from the topic and the article's position in it.
for topic in test_df['CategoryCode'].unique():
    # Loaded only so zip() below pairs each text with a stored embedding row; the embedding
    # itself is not used in this cell.
    embeddings = np.load(f"embeddings/minilm/test/embeddings_{topic}.npy")
    texts = test_df[test_df['CategoryCode'] == topic]['Content'].tolist()
    for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
        aug_texts_3 = augment_text_with_llama(text, num_variations=3)
        print("3 Augmentation yapıldı.")
        
        aug_texts_3_file = f"augmented_texts/llama/3/{topic}_text_{idx}.npy"
        os.makedirs(os.path.dirname(aug_texts_3_file), exist_ok=True)
        np.save(aug_texts_3_file, aug_texts_3)
        print(f"3 Augmented texts saved to {aug_texts_3_file}.")

        aug_embeddings_3 = get_embeddings(aug_texts_3, embedding_models['minilm'])
        print("Yeni 3 embeddingler.")

        aug_embeddings_3_file = f"augmented_embeddings/llama/3/{topic}_embedding_{idx}.npy"
        os.makedirs(os.path.dirname(aug_embeddings_3_file), exist_ok=True)
        np.save(aug_embeddings_3_file, aug_embeddings_3)
        print(f"3 Augmented embeddings saved to {aug_embeddings_3_file}.")

In [None]:
# Same procedure as the 3-paraphrase cell, but with 5 LLM paraphrases per test article,
# saved under the .../llama/5/ folders.
for topic in test_df['CategoryCode'].unique():
    # Loaded only so zip() below pairs each text with a stored embedding row; the embedding
    # itself is not used in this cell.
    embeddings = np.load(f"embeddings/minilm/test/embeddings_{topic}.npy")
    texts = test_df[test_df['CategoryCode'] == topic]['Content'].tolist()
    for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
        aug_texts_5 = augment_text_with_llama(text, num_variations=5)
        print("5 Augmentation yapıldı.")
        
        aug_texts_5_file = f"augmented_texts/llama/5/{topic}_text_{idx}.npy"
        os.makedirs(os.path.dirname(aug_texts_5_file), exist_ok=True)
        np.save(aug_texts_5_file, aug_texts_5)
        print(f"5 Augmented texts saved to {aug_texts_5_file}.")

        aug_embeddings_5 = get_embeddings(aug_texts_5, embedding_models['minilm'])
        print("Yeni 5 embeddingler.")

        aug_embeddings_5_file = f"augmented_embeddings/llama/5/{topic}_embedding_{idx}.npy"
        os.makedirs(os.path.dirname(aug_embeddings_5_file), exist_ok=True)
        np.save(aug_embeddings_5_file, aug_embeddings_5)
        print(f"5 Augmented embeddings saved to {aug_embeddings_5_file}.")

In [None]:
import random
import os

# Request 5 LLM paraphrases per test article and derive the 3-paraphrase set as a random
# subset of them. This writes to the same file paths as the two preceding cells.
for topic in test_df['CategoryCode'].unique():
    # Loaded only so zip() below pairs each text with a stored embedding row.
    embeddings = np.load(f"embeddings/minilm/test/embeddings_{topic}.npy")
    texts = test_df[test_df['CategoryCode'] == topic]['Content'].tolist()

    for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
        aug_texts_5 = augment_text_with_llama(text, num_variations=5)
        print("5 Augmentation yapıldı.")

        aug_texts_5_file = f"augmented_texts/llama/5/{topic}_text_{idx}.npy"
        os.makedirs(os.path.dirname(aug_texts_5_file), exist_ok=True)
        np.save(aug_texts_5_file, aug_texts_5)
        print(f"5 Augmented texts saved to {aug_texts_5_file}.")

        aug_embeddings_5 = get_embeddings(aug_texts_5, embedding_models['minilm'])
        print("5 Embedding oluşturuldu.")

        aug_embeddings_5_file = f"augmented_embeddings/llama/5/{topic}_embedding_{idx}.npy"
        os.makedirs(os.path.dirname(aug_embeddings_5_file), exist_ok=True)
        np.save(aug_embeddings_5_file, aug_embeddings_5)
        print(f"5 Augmented embeddings saved to {aug_embeddings_5_file}.")

        # Sample from what was actually returned instead of assuming exactly 5 items, so a
        # shorter result cannot raise IndexError. With 5 items this is the same draw as before.
        selected_indices = random.sample(range(len(aug_texts_5)), min(3, len(aug_texts_5)))
        aug_texts_3 = [aug_texts_5[i] for i in selected_indices]
        aug_embeddings_3 = [aug_embeddings_5[i] for i in selected_indices]

        aug_texts_3_file = f"augmented_texts/llama/3/{topic}_text_{idx}.npy"
        os.makedirs(os.path.dirname(aug_texts_3_file), exist_ok=True)
        np.save(aug_texts_3_file, aug_texts_3)
        print(f"3 Selected augmented texts saved to {aug_texts_3_file}.")

        aug_embeddings_3_file = f"augmented_embeddings/llama/3/{topic}_embedding_{idx}.npy"
        os.makedirs(os.path.dirname(aug_embeddings_3_file), exist_ok=True)
        np.save(aug_embeddings_3_file, aug_embeddings_3)
        print(f"3 Selected augmented embeddings saved to {aug_embeddings_3_file}.")


In [None]:
# Evaluate every classifier at every training-set size, comparing the plain
# test prediction with majority votes over the LLaMA augmentations (5 and 3).
# NOTE(review): `data_sizes` appears here only as a commented-out example, so
# this cell depends on it (and on `np`, `plt`, `classifiers`, `train_df`,
# `test_df`, `test_labels`, `majority_voting`, `accuracy_score`) being defined
# by earlier cells.
#data_sizes = [850, 680, 510, 340, 170, 85]

# results[size] collects one accuracy record per classifier.
results = {size: [] for size in data_sizes}

for size in data_sizes:
    print(f"\nTraining models with data size: {size}")

    # Build the training subset: each category contributes at most its first
    # `size // number_of_categories` stored embeddings.
    train_embeddings = []
    train_labels = []
    for topic in train_df['CategoryCode'].unique():
        topic_embeddings = np.load(f"embeddings/minilm/train/embeddings_{topic}.npy")[:size // len(train_df['CategoryCode'].unique())]
        train_embeddings.append(topic_embeddings)
        train_labels += [topic] * len(topic_embeddings)

    train_embeddings = np.vstack(train_embeddings)
    train_labels = np.array(train_labels)

    # Refit each classifier on this subset, then score it on the test set.
    for clf_name, clf in classifiers.items():
        clf.fit(train_embeddings, train_labels)
        print(f"🔄 {clf_name} trained with data size {size}.")

        original_test_predictions = []
        augmented_test_predictions_5 = []
        augmented_test_predictions_3 = []

        for topic in test_df['CategoryCode'].unique():
            embeddings = np.load(f"embeddings/minilm/test/embeddings_{topic}.npy")
            texts = test_df[test_df['CategoryCode'] == topic]['Content'].tolist()
            for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
                # Prediction for the unmodified test sample.
                original_pred = clf.predict([embedding])[0]
                original_test_predictions.append(original_pred)

                # Pre-computed embeddings of this sample's augmentations,
                # addressed by category and per-category index.
                aug_embeddings_5_file = f"augmented_embeddings/llama/5/{topic}_embedding_{idx}.npy"
                aug_embeddings_5 = np.load(aug_embeddings_5_file)

                aug_embeddings_3_file = f"augmented_embeddings/llama/3/{topic}_embedding_{idx}.npy"
                aug_embeddings_3 = np.load(aug_embeddings_3_file)

                aug_preds_5 = clf.predict(aug_embeddings_5)

                aug_preds_3 = clf.predict(aug_embeddings_3)

                # Combine the original prediction with the augmented ones via
                # `majority_voting` (defined outside this cell).
                final_pred_5 = majority_voting(original_pred, aug_preds_5.tolist())
                augmented_test_predictions_5.append(final_pred_5)

                final_pred_3 = majority_voting(original_pred, aug_preds_3.tolist())
                augmented_test_predictions_3.append(final_pred_3)

        # NOTE(review): predictions are accumulated category by category, in the
        # order of test_df['CategoryCode'].unique(); the scores are only
        # meaningful if `test_labels` uses that same ordering — confirm.
        original_accuracy = accuracy_score(test_labels, original_test_predictions)
        augmented_accuracy_5 = accuracy_score(test_labels, augmented_test_predictions_5)
        augmented_accuracy_3 = accuracy_score(test_labels, augmented_test_predictions_3)

        results[size].append({
            "classifier": clf_name,
            "original_accuracy": original_accuracy,
            "augmented_accuracy_5": augmented_accuracy_5,
            "augmented_accuracy_3": augmented_accuracy_3
        })
        print(f"✅ {clf_name} - Original Accuracy: {original_accuracy:.4f}, Augmented Accuracy (5): {augmented_accuracy_5:.4f}, Augmented Accuracy (3): {augmented_accuracy_3:.4f}")

# One figure with three lines per classifier (original, 5-vote, 3-vote) over
# the training sizes; a missing record yields None for that point.
plt.figure(figsize=(12, 8))
for clf_name in classifiers.keys():
    original_accuracies = [next((res['original_accuracy'] for res in results[size] if res['classifier'] == clf_name), None) for size in data_sizes]
    augmented_accuracies_5 = [next((res['augmented_accuracy_5'] for res in results[size] if res['classifier'] == clf_name), None) for size in data_sizes]
    augmented_accuracies_3 = [next((res['augmented_accuracy_3'] for res in results[size] if res['classifier'] == clf_name), None) for size in data_sizes]

    plt.plot(data_sizes, original_accuracies, marker='o', linestyle='-', label=f"{clf_name} - Original")
    plt.plot(data_sizes, augmented_accuracies_5, marker='x', linestyle='--', label=f"{clf_name} - Augmented (5)")
    plt.plot(data_sizes, augmented_accuracies_3, marker='s', linestyle='-.', label=f"{clf_name} - Augmented (3)")

plt.title('Performance Comparison: Original vs Augmented Data')
plt.xlabel('Training Data Size')
plt.ylabel('Accuracy')
plt.xticks(data_sizes)
plt.legend(title='Classifiers')
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

-----

In [None]:
# Disabled experiment: the entire cell body is one triple-quoted string, so
# none of the code inside it runs — this cell neither loads a model nor defines
# `augment_text_with_gpt2`.
'''
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name="ytu-ce-cosmos/turkish-gpt2-large"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
 # GPU desteği varsa modele taşı
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def augment_text_with_gpt2(input_text, num_variations=3 ):
    augmented_texts = set()

    while len(augmented_texts) < num_variations:
        # Augmentation için prompt hazırla
        prompt = f"Aşağıdaki cümleyi anlamını koruyarak yeniden ifade et:\n{input_text}\nYeniden ifade edilmiş hali:"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Modelden çıktı üret
        output = model.generate(
            **inputs,
            max_new_tokens=512,
            num_return_sequences=1,
            temperature=0.6,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

        # Çıktıyı çözümle ve ekle
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True).split("Yeniden ifade edilmiş hali:")[-1].strip()
        augmented_texts.add(generated_text)

    return list(augmented_texts)
        

# Örnek kullanım
original_text = """
Ayaküstü üye kazandılar Saadet Partisi il Başkan Yardımcısı Salih Kocatepe, 
iznik te esnaf ziyareti sırasında iznik esnaflarından partiye üye olmak isteyen 
Osman Yıldız adlı vatandaşa kendi rozetini çıkartarak taktı. Üye olan Osman Yıldız 
Saadet Partisini yakından takip ediyorum. Geriye dönük Rahmetli Erbakan Hocaya duyduğum 
sevgi ve saygımdan dolayı ülkemiz menfaatlerini düşünerek işçi memur haklarını tam hakkı 
ile savunarak mili görüş çatısı altında buluşturdu. Bende milli görüş davasında çalışacağım...
"""

augmented_texts = augment_text_with_gpt2(original_text, num_variations=3)
for idx, text in enumerate(augmented_texts, start=1):
    print(f"Varyasyon {idx}: {text}\n")

'''

In [None]:
# Smoke test: request three paraphrases of the first test document and display them.
# NOTE(review): among the visible cells, `augment_text_with_gpt2` is only
# defined in a later cell (the earlier definition sits inside a disabled
# string), so on a fresh top-to-bottom run this call may raise NameError —
# confirm the intended cell order.
aug_texts_3 = augment_text_with_gpt2(test_df.iloc[0]['Content'], num_variations=3)
aug_texts_3

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Checkpoint used for the GPT-2 based paraphrasing below.
model_name = "ytu-ce-cosmos/turkish-gpt2-medium-350m-instruct-v0.1"

# Load the tokenizer and the causal language model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Use CUDA when available, otherwise the CPU, and move the model there.
# NOTE(review): this rebinds the global `device`, which the first cell of the
# notebook set to an int (0 / -1) for the fill-mask pipeline.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def augment_text_with_gpt2(input_text, num_variations=3):
    """Generate distinct paraphrases of ``input_text`` with the global GPT-2 model.

    The text is wrapped in a Turkish "rephrase while keeping the meaning"
    prompt and sampled repeatedly until ``num_variations`` distinct outputs
    have been collected. Relies on the module-level ``tokenizer``, ``model``
    and ``device``.

    Args:
        input_text: Text to paraphrase.
        num_variations: Number of distinct paraphrases to return.

    Returns:
        A list of ``num_variations`` distinct strings (order is arbitrary
        because they are collected in a set). Empty when ``num_variations``
        is not positive.

    Raises:
        ValueError: If the tokenised prompt alone already fills the model's
            context window, leaving no room to generate.
    """
    if num_variations <= 0:
        return []

    marker = "Yeniden ifade edilmiş hali:"

    # The prompt and its encoding do not change between attempts, so build
    # them once instead of on every loop iteration.
    prompt = f"Aşağıdaki cümleyi anlamını koruyarak yeniden ifade et:\n{input_text}\n{marker}"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Keep prompt + generated tokens inside the model's position limit;
    # asking for more would index past the position embeddings.
    max_length = model.config.max_position_embeddings
    prompt_length = inputs["input_ids"].shape[1]
    new_token_budget = min(512, max_length - prompt_length)
    if new_token_budget <= 0:
        raise ValueError(
            f"Prompt is {prompt_length} tokens long, which leaves no room to generate "
            f"within the model's {max_length}-token context window."
        )

    augmented_texts = set()
    while len(augmented_texts) < num_variations:
        output = model.generate(
            **inputs,
            max_new_tokens=new_token_budget,
            num_return_sequences=1,
            temperature=0.6,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

        # The decoded sequence still contains the prompt; keep only what
        # follows the last occurrence of the marker.
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True).split(marker)[-1].strip()
        augmented_texts.add(generated_text)

    return list(augmented_texts)

# Smoke test: request three paraphrases of the third test document (row index 2)
# and display the resulting list as the cell output.
aug_texts_3 = augment_text_with_gpt2(test_df.iloc[2]['Content'], num_variations=3)
aug_texts_3


In [None]:
# For every test text, generate 3 GPT-2 paraphrases, embed them with MiniLM and
# save both under the ".../gpt2/3/" directories.
# NOTE(review): a later cell in this notebook writes to these same
# ".../gpt2/3/" paths (a 3-of-5 subset) and would overwrite these files.
for topic in test_df['CategoryCode'].unique():
    # Loaded only to be zipped with `texts`; `embedding` is not read below.
    embeddings = np.load(f"embeddings/minilm/test/embeddings_{topic}.npy")
    texts = test_df[test_df['CategoryCode'] == topic]['Content'].tolist()
    for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
        aug_texts_3 = augment_text_with_gpt2(text, num_variations=3)
        print("3 Augmentation yapıldı.")
        
        # File name encodes the category and the per-category index.
        aug_texts_3_file = f"augmented_texts/gpt2/3/{topic}_text_{idx}.npy"
        os.makedirs(os.path.dirname(aug_texts_3_file), exist_ok=True)
        np.save(aug_texts_3_file, aug_texts_3)
        print(f"3 Augmented texts saved to {aug_texts_3_file}.")

        # Embed the paraphrases with the MiniLM entry of `embedding_models`.
        aug_embeddings_3 = get_embeddings(aug_texts_3, embedding_models['minilm'])
        print("Yeni 3 embeddingler.")

        aug_embeddings_3_file = f"augmented_embeddings/gpt2/3/{topic}_embedding_{idx}.npy"
        os.makedirs(os.path.dirname(aug_embeddings_3_file), exist_ok=True)
        np.save(aug_embeddings_3_file, aug_embeddings_3)
        print(f"3 Augmented embeddings saved to {aug_embeddings_3_file}.")

In [None]:
# For every test text, generate 5 GPT-2 paraphrases, embed them with MiniLM and
# save both under the ".../gpt2/5/" directories.
# NOTE(review): the following cell regenerates and rewrites the same
# ".../gpt2/5/" files, so running both repeats the generation work.
for topic in test_df['CategoryCode'].unique():
    # Loaded only to be zipped with `texts`; `embedding` is not read below.
    embeddings = np.load(f"embeddings/minilm/test/embeddings_{topic}.npy")
    texts = test_df[test_df['CategoryCode'] == topic]['Content'].tolist()
    for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
        aug_texts_5 = augment_text_with_gpt2(text, num_variations=5)
        print("5 Augmentation yapıldı.")
        
        # File name encodes the category and the per-category index.
        aug_texts_5_file = f"augmented_texts/gpt2/5/{topic}_text_{idx}.npy"
        os.makedirs(os.path.dirname(aug_texts_5_file), exist_ok=True)
        np.save(aug_texts_5_file, aug_texts_5)
        print(f"5 Augmented texts saved to {aug_texts_5_file}.")

        # Embed the paraphrases with the MiniLM entry of `embedding_models`.
        aug_embeddings_5 = get_embeddings(aug_texts_5, embedding_models['minilm'])
        print("Yeni 5 embeddingler.")

        aug_embeddings_5_file = f"augmented_embeddings/gpt2/5/{topic}_embedding_{idx}.npy"
        os.makedirs(os.path.dirname(aug_embeddings_5_file), exist_ok=True)
        np.save(aug_embeddings_5_file, aug_embeddings_5)
        print(f"5 Augmented embeddings saved to {aug_embeddings_5_file}.")

In [None]:
import random
import os

# For every test text: generate 5 GPT-2 paraphrases, embed and save them, then
# save a randomly chosen 3-of-5 subset under the ".../gpt2/3/" directories.
# NOTE(review): this writes to the same ".../gpt2/5/" and ".../gpt2/3/" paths
# as the two preceding cells, so it overwrites whatever they produced.
for topic in test_df['CategoryCode'].unique():
    # Loaded only to be zipped with `texts`; `embedding` is not read below.
    embeddings = np.load(f"embeddings/minilm/test/embeddings_{topic}.npy")
    texts = test_df[test_df['CategoryCode'] == topic]['Content'].tolist()
    
    for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
        aug_texts_5 = augment_text_with_gpt2(text, num_variations=5)
        print("5 Augmentation yapıldı.")
        
        aug_texts_5_file = f"augmented_texts/gpt2/5/{topic}_text_{idx}.npy"
        os.makedirs(os.path.dirname(aug_texts_5_file), exist_ok=True)
        np.save(aug_texts_5_file, aug_texts_5)
        print(f"5 Augmented texts saved to {aug_texts_5_file}.")
        
        aug_embeddings_5 = get_embeddings(aug_texts_5, embedding_models['minilm'])
        print("5 Embedding oluşturuldu.")
        
        aug_embeddings_5_file = f"augmented_embeddings/gpt2/5/{topic}_embedding_{idx}.npy"
        os.makedirs(os.path.dirname(aug_embeddings_5_file), exist_ok=True)
        np.save(aug_embeddings_5_file, aug_embeddings_5)
        print(f"5 Augmented embeddings saved to {aug_embeddings_5_file}.")
        
        # Pick 3 distinct positions out of 5 and apply them to both texts and
        # embeddings so the subsets stay aligned.
        # NOTE(review): no seed is set in this cell, so the subset may differ
        # between runs.
        selected_indices = random.sample(range(5), 3)
        aug_texts_3 = [aug_texts_5[i] for i in selected_indices]
        aug_embeddings_3 = [aug_embeddings_5[i] for i in selected_indices]
        
        aug_texts_3_file = f"augmented_texts/gpt2/3/{topic}_text_{idx}.npy"
        os.makedirs(os.path.dirname(aug_texts_3_file), exist_ok=True)  # Create the directory
        np.save(aug_texts_3_file, aug_texts_3)
        print(f"3 Selected augmented texts saved to {aug_texts_3_file}.")
        
        aug_embeddings_3_file = f"augmented_embeddings/gpt2/3/{topic}_embedding_{idx}.npy"
        os.makedirs(os.path.dirname(aug_embeddings_3_file), exist_ok=True)  # Create the directory
        np.save(aug_embeddings_3_file, aug_embeddings_3)
        print(f"3 Selected augmented embeddings saved to {aug_embeddings_3_file}.")


In [None]:
# Evaluate every classifier at every training-set size, comparing the plain
# test prediction with majority votes over the GPT-2 augmentations (5 and 3).
# NOTE(review): `data_sizes` appears here only as a commented-out example, so
# this cell depends on it (and on `np`, `plt`, `classifiers`, `train_df`,
# `test_df`, `test_labels`, `majority_voting`, `accuracy_score`) being defined
# by earlier cells.
#data_sizes = [850, 680, 510, 340, 170, 85]

# results[size] collects one accuracy record per classifier.
results = {size: [] for size in data_sizes}

for size in data_sizes:
    print(f"\nTraining models with data size: {size}")

    # Prepare training data: each category contributes at most its first
    # `size // number_of_categories` stored embeddings.
    train_embeddings = []
    train_labels = []
    for topic in train_df['CategoryCode'].unique():
        topic_embeddings = np.load(f"embeddings/minilm/train/embeddings_{topic}.npy")[:size // len(train_df['CategoryCode'].unique())]
        train_embeddings.append(topic_embeddings)
        train_labels += [topic] * len(topic_embeddings)

    train_embeddings = np.vstack(train_embeddings)
    train_labels = np.array(train_labels)

    # Train classifiers on this subset, then score each on the test set.
    for clf_name, clf in classifiers.items():
        clf.fit(train_embeddings, train_labels)
        print(f"🔄 {clf_name} trained with data size {size}.")

        original_test_predictions = []
        augmented_test_predictions_5 = []
        augmented_test_predictions_3 = []

        for topic in test_df['CategoryCode'].unique():
            embeddings = np.load(f"embeddings/minilm/test/embeddings_{topic}.npy")
            texts = test_df[test_df['CategoryCode'] == topic]['Content'].tolist()
            for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
                # Prediction for the unmodified test sample.
                original_pred = clf.predict([embedding])[0]
                original_test_predictions.append(original_pred)

                # Pre-computed embeddings of this sample's GPT-2 augmentations,
                # addressed by category and per-category index.
                aug_embeddings_5_file = f"augmented_embeddings/gpt2/5/{topic}_embedding_{idx}.npy"
                aug_embeddings_5 = np.load(aug_embeddings_5_file)

                aug_embeddings_3_file = f"augmented_embeddings/gpt2/3/{topic}_embedding_{idx}.npy"
                aug_embeddings_3 = np.load(aug_embeddings_3_file)

                aug_preds_5 = clf.predict(aug_embeddings_5)

                aug_preds_3 = clf.predict(aug_embeddings_3)

                # Combine the original prediction with the augmented ones via
                # `majority_voting` (defined outside this cell).
                final_pred_5 = majority_voting(original_pred, aug_preds_5.tolist())
                augmented_test_predictions_5.append(final_pred_5)

                final_pred_3 = majority_voting(original_pred, aug_preds_3.tolist())
                augmented_test_predictions_3.append(final_pred_3)

        # NOTE(review): predictions are accumulated category by category, in the
        # order of test_df['CategoryCode'].unique(); the scores are only
        # meaningful if `test_labels` uses that same ordering — confirm.
        original_accuracy = accuracy_score(test_labels, original_test_predictions)
        augmented_accuracy_5 = accuracy_score(test_labels, augmented_test_predictions_5)
        augmented_accuracy_3 = accuracy_score(test_labels, augmented_test_predictions_3)

        results[size].append({
            "classifier": clf_name,
            "original_accuracy": original_accuracy,
            "augmented_accuracy_5": augmented_accuracy_5,
            "augmented_accuracy_3": augmented_accuracy_3
        })
        print(f"✅ {clf_name} - Original Accuracy: {original_accuracy:.4f}, Augmented Accuracy (5): {augmented_accuracy_5:.4f}, Augmented Accuracy (3): {augmented_accuracy_3:.4f}")

# One figure with three lines per classifier (original, 5-vote, 3-vote) over
# the training sizes; a missing record yields None for that point.
plt.figure(figsize=(12, 8))
for clf_name in classifiers.keys():
    original_accuracies = [next((res['original_accuracy'] for res in results[size] if res['classifier'] == clf_name), None) for size in data_sizes]
    augmented_accuracies_5 = [next((res['augmented_accuracy_5'] for res in results[size] if res['classifier'] == clf_name), None) for size in data_sizes]
    augmented_accuracies_3 = [next((res['augmented_accuracy_3'] for res in results[size] if res['classifier'] == clf_name), None) for size in data_sizes]

    plt.plot(data_sizes, original_accuracies, marker='o', linestyle='-', label=f"{clf_name} - Original")
    plt.plot(data_sizes, augmented_accuracies_5, marker='x', linestyle='--', label=f"{clf_name} - Augmented (5)")
    plt.plot(data_sizes, augmented_accuracies_3, marker='s', linestyle='-.', label=f"{clf_name} - Augmented (3)")

plt.title('Performance Comparison: Original vs Augmented Data')
plt.xlabel('Training Data Size')
plt.ylabel('Accuracy')
plt.xticks(data_sizes)
plt.legend(title='Classifiers')
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Side-by-side evaluation of all three augmentation sources (BERT, LLaMA,
# GPT-2) with 5 and 3 variants each, for every classifier and training size.
# NOTE(review): `data_sizes` appears here only as a commented-out example, so
# this cell depends on it (and on `np`, `plt`, `classifiers`, `train_df`,
# `test_df`, `test_labels`, `majority_voting`, `accuracy_score`) being defined
# by earlier cells.
#data_sizes = [850, 680, 510, 340, 170, 85]

# results[size] collects one accuracy record per classifier.
results = {size: [] for size in data_sizes}

for size in data_sizes:
    print(f"\nTraining models with data size: {size}")

    # Prepare training data: each category contributes at most its first
    # `size // number_of_categories` stored embeddings.
    train_embeddings = []
    train_labels = []
    for topic in train_df['CategoryCode'].unique():
        topic_embeddings = np.load(f"embeddings/minilm/train/embeddings_{topic}.npy")[:size // len(train_df['CategoryCode'].unique())]
        train_embeddings.append(topic_embeddings)
        train_labels += [topic] * len(topic_embeddings)

    train_embeddings = np.vstack(train_embeddings)
    train_labels = np.array(train_labels)

    # Refit each classifier on this subset, then score it on the test set.
    for clf_name, clf in classifiers.items():
        clf.fit(train_embeddings, train_labels)
        print(f"🔄 {clf_name} trained with data size {size}.")

        # One prediction list per (augmentation source, variant count).
        original_test_predictions = []
        augmented_test_predictions_bert_5 = []
        augmented_test_predictions_bert_3 = []
        augmented_test_predictions_llama_5 = []
        augmented_test_predictions_llama_3 = []
        augmented_test_predictions_gpt2_5 = []
        augmented_test_predictions_gpt2_3 = []

        for topic in test_df['CategoryCode'].unique():
            embeddings = np.load(f"embeddings/minilm/test/embeddings_{topic}.npy")
            texts = test_df[test_df['CategoryCode'] == topic]['Content'].tolist()
            for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
                # Prediction for the unmodified test sample.
                original_pred = clf.predict([embedding])[0]
                original_test_predictions.append(original_pred)

                # Pre-computed augmentation embeddings for this sample, addressed
                # by source, variant count, category and per-category index.
                aug_embeddings_bert_5 = np.load(f"augmented_embeddings/bert/5/{topic}_embedding_{idx}.npy")
                aug_embeddings_llama_5 = np.load(f"augmented_embeddings/llama/5/{topic}_embedding_{idx}.npy")
                aug_embeddings_gpt2_5 = np.load(f"augmented_embeddings/gpt2/5/{topic}_embedding_{idx}.npy")

                aug_embeddings_bert_3 = np.load(f"augmented_embeddings/bert/3/{topic}_embedding_{idx}.npy")
                aug_embeddings_llama_3 = np.load(f"augmented_embeddings/llama/3/{topic}_embedding_{idx}.npy")
                aug_embeddings_gpt2_3 = np.load(f"augmented_embeddings/gpt2/3/{topic}_embedding_{idx}.npy")

                aug_preds_bert_5 = clf.predict(aug_embeddings_bert_5)
                aug_preds_bert_3 = clf.predict(aug_embeddings_bert_3)

                aug_preds_llama_5 = clf.predict(aug_embeddings_llama_5)
                aug_preds_llama_3 = clf.predict(aug_embeddings_llama_3)

                aug_preds_gpt2_5 = clf.predict(aug_embeddings_gpt2_5)
                aug_preds_gpt2_3 = clf.predict(aug_embeddings_gpt2_3)

                # Combine the original prediction with each augmented set via
                # `majority_voting` (defined outside this cell).
                final_pred_bert_5 = majority_voting(original_pred, aug_preds_bert_5.tolist())
                final_pred_bert_3 = majority_voting(original_pred, aug_preds_bert_3.tolist())
                final_pred_llama_5 = majority_voting(original_pred, aug_preds_llama_5.tolist())
                final_pred_llama_3 = majority_voting(original_pred, aug_preds_llama_3.tolist())
                final_pred_gpt2_5 = majority_voting(original_pred, aug_preds_gpt2_5.tolist())
                final_pred_gpt2_3 = majority_voting(original_pred, aug_preds_gpt2_3.tolist())

                augmented_test_predictions_bert_5.append(final_pred_bert_5)
                augmented_test_predictions_bert_3.append(final_pred_bert_3)
                augmented_test_predictions_llama_5.append(final_pred_llama_5)
                augmented_test_predictions_llama_3.append(final_pred_llama_3)
                augmented_test_predictions_gpt2_5.append(final_pred_gpt2_5)
                augmented_test_predictions_gpt2_3.append(final_pred_gpt2_3)

        # NOTE(review): predictions are accumulated category by category, in the
        # order of test_df['CategoryCode'].unique(); the scores are only
        # meaningful if `test_labels` uses that same ordering — confirm.
        original_accuracy = accuracy_score(test_labels, original_test_predictions)
        augmented_accuracy_bert_5 = accuracy_score(test_labels, augmented_test_predictions_bert_5)
        augmented_accuracy_bert_3 = accuracy_score(test_labels, augmented_test_predictions_bert_3)
        augmented_accuracy_llama_5 = accuracy_score(test_labels, augmented_test_predictions_llama_5)
        augmented_accuracy_llama_3 = accuracy_score(test_labels, augmented_test_predictions_llama_3)
        augmented_accuracy_gpt2_5 = accuracy_score(test_labels, augmented_test_predictions_gpt2_5)
        augmented_accuracy_gpt2_3 = accuracy_score(test_labels, augmented_test_predictions_gpt2_3)

        results[size].append({
            "classifier": clf_name,
            "original_accuracy": original_accuracy,
            "augmented_accuracy_bert_5": augmented_accuracy_bert_5,
            "augmented_accuracy_bert_3": augmented_accuracy_bert_3,
            "augmented_accuracy_llama_5": augmented_accuracy_llama_5,
            "augmented_accuracy_llama_3": augmented_accuracy_llama_3,
            "augmented_accuracy_gpt2_5": augmented_accuracy_gpt2_5,
            "augmented_accuracy_gpt2_3": augmented_accuracy_gpt2_3
        })
        print(f"✅ {clf_name} - Original Accuracy: {original_accuracy:.4f}, BERT-5: {augmented_accuracy_bert_5:.4f}, BERT-3: {augmented_accuracy_bert_3:.4f}, LLama-5: {augmented_accuracy_llama_5:.4f}, LLama-3: {augmented_accuracy_llama_3:.4f}, GPT2-5: {augmented_accuracy_gpt2_5:.4f}, GPT2-3: {augmented_accuracy_gpt2_3:.4f}")

# Plot comparison: seven lines per classifier (original plus six augmented
# settings) over the training sizes; a missing record yields None for that point.
plt.figure(figsize=(14, 10))
for clf_name in classifiers.keys():
    original_accuracies = [next((res['original_accuracy'] for res in results[size] if res['classifier'] == clf_name), None) for size in data_sizes]
    augmented_accuracies_bert_5 = [next((res['augmented_accuracy_bert_5'] for res in results[size] if res['classifier'] == clf_name), None) for size in data_sizes]
    augmented_accuracies_bert_3 = [next((res['augmented_accuracy_bert_3'] for res in results[size] if res['classifier'] == clf_name), None) for size in data_sizes]
    augmented_accuracies_llama_5 = [next((res['augmented_accuracy_llama_5'] for res in results[size] if res['classifier'] == clf_name), None) for size in data_sizes]
    augmented_accuracies_llama_3 = [next((res['augmented_accuracy_llama_3'] for res in results[size] if res['classifier'] == clf_name), None) for size in data_sizes]
    augmented_accuracies_gpt2_5 = [next((res['augmented_accuracy_gpt2_5'] for res in results[size] if res['classifier'] == clf_name), None) for size in data_sizes]
    augmented_accuracies_gpt2_3 = [next((res['augmented_accuracy_gpt2_3'] for res in results[size] if res['classifier'] == clf_name), None) for size in data_sizes]

    plt.plot(data_sizes, original_accuracies, marker='o', linestyle='-', label=f"{clf_name} - Original")
    plt.plot(data_sizes, augmented_accuracies_bert_5, marker='x', linestyle='--', label=f"{clf_name} - BERT (5)")
    plt.plot(data_sizes, augmented_accuracies_bert_3, marker='s', linestyle='--', label=f"{clf_name} - BERT (3)")
    plt.plot(data_sizes, augmented_accuracies_llama_5, marker='^', linestyle='-.', label=f"{clf_name} - LLama (5)")
    plt.plot(data_sizes, augmented_accuracies_llama_3, marker='v', linestyle='-.', label=f"{clf_name} - LLama (3)")
    plt.plot(data_sizes, augmented_accuracies_gpt2_5, marker='d', linestyle=':', label=f"{clf_name} - GPT-2 (5)")
    plt.plot(data_sizes, augmented_accuracies_gpt2_3, marker='p', linestyle=':', label=f"{clf_name} - GPT-2 (3)")

plt.title('Performance Comparison: Original vs Augmented Data')
plt.xlabel('Training Data Size')
plt.ylabel('Accuracy')
plt.xticks(data_sizes)
# The legend is anchored outside the axes on the right.
plt.legend(title='Classifiers', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()


-----

In [None]:
# Walk-through on a single test document: augment it with BERTurk, then for each
# training size and classifier print the original prediction, the prediction
# for every augmented variant, and the majority-vote result.
test_example = test_df.iloc[0]['Content']
print(f"Original Test Example: {test_example}\n")

# The augmentations are generated once and reused for every size/classifier.
augmented_texts = augment_text_with_berturk(test_example, num_variations=3)
print("Augmented Examples:")
for idx, text in enumerate(augmented_texts, start=1):
    print(f"  Augmented {idx}: {text}")
print("\n")

for size in data_sizes:
    print(f"\n=== Training with data size: {size} ===")

    # Each category contributes at most its first `size // number_of_categories`
    # stored training embeddings.
    train_embeddings = []
    train_labels = []
    for topic in train_df['CategoryCode'].unique():
        topic_embeddings = np.load(f"embeddings/minilm/train/embeddings_{topic}.npy")[:size // len(train_df['CategoryCode'].unique())]
        train_embeddings.append(topic_embeddings)
        train_labels += [topic] * len(topic_embeddings)

    train_embeddings = np.vstack(train_embeddings)
    train_labels = np.array(train_labels)

    for clf_name, clf in classifiers.items():
        clf.fit(train_embeddings, train_labels)
        print(f"  🔄 {clf_name} trained.")

        # Takes row 0 of the stored embeddings for the example's category.
        # NOTE(review): this assumes that row corresponds to test_df.iloc[0] — confirm.
        original_embedding = np.load(f"embeddings/minilm/test/embeddings_{test_df.iloc[0]['CategoryCode']}.npy")[0].reshape(1, -1)
        original_prediction = clf.predict(original_embedding)[0]
        print(f"  {clf_name} - Original Prediction: {original_prediction}")

        # NOTE(review): the augmented texts are re-embedded for every
        # size/classifier pair although they do not change inside these loops.
        augmented_embeddings = get_embeddings(augmented_texts, embedding_models['minilm'])
        augmented_predictions = clf.predict(augmented_embeddings)
        for idx, pred in enumerate(augmented_predictions, start=1):
            print(f"  {clf_name} - Augmented {idx} Prediction: {pred}")

        # Inline majority vote: the most frequent label wins; on a tie the
        # winner depends on set iteration order.
        all_predictions = [original_prediction] + augmented_predictions.tolist()
        final_prediction = max(set(all_predictions), key=all_predictions.count)
        print(f"  {clf_name} - Final Prediction (Majority Voting): {final_prediction}")
    print("\n")


-----

In [None]:
import os
import numpy as np
from tqdm import tqdm

def augment_training_data(train_texts, num_variations=4):
    """Return a flat list with the LLaMA augmentations of every training text.

    Each text is passed to ``augment_text_with_llama`` and the variations it
    returns are concatenated in input order; a tqdm progress bar tracks the
    texts being processed.

    Args:
        train_texts: Iterable of texts to augment.
        num_variations: Number of variations requested per text.

    Returns:
        A single list containing all returned variations.
    """
    progress = tqdm(train_texts, desc="Augmenting training data")
    return [
        variation
        for text in progress
        for variation in augment_text_with_llama(text, num_variations=num_variations)
    ]

def process_and_save_augmented_data(train_df, output_dir, model_name, num_variations=4):
    """Create (or reuse) per-category augmented texts and their embeddings.

    For each distinct ``CategoryCode`` in ``train_df`` two files are kept in
    ``output_dir``: ``augmented_texts_<topic>.npy`` and ``embeddings_<topic>.npy``.
    A file that already exists is left untouched; a missing one is computed
    and saved.

    Args:
        train_df: DataFrame with ``CategoryCode`` and ``Content`` columns.
        output_dir: Directory holding the cached files (created if missing).
        model_name: Passed unchanged as the second argument of ``get_embeddings``.
        num_variations: Number of augmentations requested per text.
    """
    os.makedirs(output_dir, exist_ok=True)

    for topic in train_df['CategoryCode'].unique():
        texts_path = os.path.join(output_dir, f"augmented_texts_{topic}.npy")
        vectors_path = os.path.join(output_dir, f"embeddings_{topic}.npy")
        category_texts = train_df[train_df['CategoryCode'] == topic]['Content']

        # Step 1: obtain the augmented texts, from the cache when available.
        if not os.path.exists(texts_path):
            augmented_texts = augment_training_data(category_texts, num_variations=num_variations)
            np.save(texts_path, np.array(augmented_texts, dtype=object))
            print(f"✅ Augmented texts saved: {texts_path}")
        else:
            print(f"ℹ️ Augmented texts already exist: {texts_path}")
            # Object arrays are stored pickled, hence allow_pickle=True.
            augmented_texts = np.load(texts_path, allow_pickle=True).tolist()

        # Step 2: embed them unless the embeddings are already on disk.
        if os.path.exists(vectors_path):
            print(f"ℹ️ Embeddings already exist: {vectors_path}")
            continue

        vectors = get_embeddings(augmented_texts, model_name)
        np.save(vectors_path, vectors)
        print(f"✅ Embeddings saved: {vectors_path}")

# NOTE(review): `train_texts` is assigned here but not used by the call below.
train_texts = train_df['Content']
# Directory in which the per-category augmented texts and embeddings are cached.
augment_output_dir = "augmented_data/llama"
# Request 4 variations per training text and embed them with the MiniLM entry
# of `embedding_models`; existing cache files are reused.
process_and_save_augmented_data(train_df, augment_output_dir, embedding_models['minilm'], num_variations=4)


In [None]:
def create_augmented_train_set(original_embeddings, augmented_embeddings, num_augmentations):
    """Combine original embeddings with a capped number of augmented ones per topic.

    For every topic in ``original_embeddings`` all original rows are kept and
    up to ``num_augmentations`` rows, taken from the start of
    ``augmented_embeddings[topic]``, are appended. Topics missing from
    ``augmented_embeddings`` receive no extra rows.

    Args:
        original_embeddings: Mapping of topic -> 2-D array of original embeddings.
        augmented_embeddings: Mapping of topic -> sequence of augmented embeddings.
        num_augmentations: Maximum number of augmented rows to add per topic.

    Returns:
        Tuple ``(embeddings, labels)``: the vertically stacked embeddings and a
        NumPy array with one topic label per row.
    """
    new_train_embeddings = []
    new_train_labels = []

    for topic, orig_embed in original_embeddings.items():
        new_train_embeddings.append(orig_embed)
        new_train_labels += [topic] * len(orig_embed)

        if topic not in augmented_embeddings:
            continue

        topic_augmented = augmented_embeddings[topic]
        available_augmentations = len(topic_augmented)

        # Clamp with a per-topic local. Overwriting `num_augmentations` here
        # would silently shrink the quota for every topic processed afterwards.
        take = num_augmentations
        if available_augmentations < num_augmentations:
            print(f"⚠️ Warning: Not enough augmented embeddings for topic {topic}. Available: {available_augmentations}, Requested: {num_augmentations}")
            take = available_augmentations

        for i in range(take):
            new_train_embeddings.append(topic_augmented[i])
            new_train_labels.append(topic)

    new_train_embeddings = np.vstack(new_train_embeddings)
    new_train_labels = np.array(new_train_labels)

    return new_train_embeddings, new_train_labels


# Load, per category, the original training embeddings and the embeddings of
# their augmentations.
original_embeddings = {}
augmented_embeddings = {}

for topic in train_df['CategoryCode'].unique():
    orig_file = os.path.join("embeddings/minilm/train", f"embeddings_{topic}.npy")
    original_embeddings[topic] = np.load(orig_file)

    # NOTE(review): this reads from "augmented_data/minilm", while the
    # preceding cell writes its embeddings to "augmented_data/llama" — confirm
    # which directory is intended.
    aug_file = os.path.join("augmented_data/minilm", f"embeddings_{topic}.npy")
    augmented_embeddings[topic] = np.load(aug_file)

# Training process
# results_augmented[size] collects one record per (augmentation count, classifier).
results_augmented = {size: [] for size in data_sizes}

for size in data_sizes:
    # Each category contributes at most its first `size // number_of_categories`
    # original embeddings.
    subset_embeddings = {}
    for topic, embeddings in original_embeddings.items():
        subset_embeddings[topic] = embeddings[:size // len(original_embeddings)]

    for num_aug in [0, 1, 2, 4]:
        # NOTE(review): the message says "per example", but
        # create_augmented_train_set adds at most `num_aug` augmented rows per
        # topic, not per original example — confirm the intent.
        print(f"\nData size: {size}, Augmentations per example: {num_aug}")

        train_embeddings, train_labels = create_augmented_train_set(
            subset_embeddings, augmented_embeddings, num_aug
        )

        # `test_embeddings`, `test_labels`, `accuracy_score` and `f1_score`
        # come from earlier cells.
        for clf_name, clf in classifiers.items():
            clf.fit(train_embeddings, train_labels)
            predictions = clf.predict(test_embeddings)
            accuracy = accuracy_score(test_labels, predictions)
            f1 = f1_score(test_labels, predictions, average='weighted')

            print(f"🔄 {clf_name} - Accuracy={accuracy:.4f}, F1={f1:.4f}")
            results_augmented[size].append({
                "augmentations": num_aug,
                "classifier": clf_name,
                "accuracy": accuracy,
                "f1_score": f1
            })


In [None]:
import matplotlib.pyplot as plt

def plot_results(results, metric="accuracy"):
    """Plot one metric against training-set size for each classifier/augmentation pair.

    Draws one line per combination of classifier (from the global
    ``classifiers``) and augmentation count in ``[0, 1, 2, 4]``, using the
    global ``data_sizes`` as the x axis. A combination with no record for a
    given size contributes ``None`` at that position.

    Args:
        results: Mapping of data size -> list of records carrying
            ``'classifier'``, ``'augmentations'`` and the metric key.
        metric: Key of the value to plot from each record.
    """
    plt.figure(figsize=(12, 8))

    for clf_name in classifiers.keys():
        for num_aug in [0, 1, 2, 4]:
            series = []
            for size in data_sizes:
                # First record for this classifier and augmentation count, if any.
                first_match = next(
                    (
                        res for res in results[size]
                        if res['classifier'] == clf_name and res['augmentations'] == num_aug
                    ),
                    None,
                )
                series.append(None if first_match is None else first_match[metric])

            plt.plot(data_sizes, series, marker='o', label=f"{clf_name} (Aug={num_aug})")

    metric_label = metric.capitalize()
    plt.title(f"Effect of Data Size and Augmentations on {metric_label}")
    plt.xlabel("Data Size")
    plt.ylabel(metric_label)
    plt.legend(title="Classifier and Augmentations")
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

# Plot the accuracy recorded in `results_augmented` for every classifier and
# augmentation count across the training sizes.
plot_results(results_augmented, metric="accuracy")