<a href="https://colab.research.google.com/github/cemredogan-ceng/HABER-X-AI-Based-News-Classification/blob/main/habers%C4%B1n%C4%B1fland%C4%B1rma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Gerekli kütüphaneleri yükleyin
!pip install transformers simpletransformers pandas scikit-learn imblearn gradio

import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
import gradio as gr

# Load the dataset and rename its columns to the `text` / `labels` names used
# by the train and test frames below.
data_path = "metin_verileri.xlsx"
data = pd.read_excel(data_path)
data = data.rename(columns={"kategori": "labels", "metin": "text"})

# Rows without a text or a category cannot be used for supervised training.
data = data.dropna(subset=["text", "labels"]).reset_index(drop=True)
data["text"] = data["text"].astype(str)

# Encode the category names as integer ids; `label_map` maps id -> name so
# predictions can be translated back to a readable category.
data["labels"] = data["labels"].astype("category")
label_map = dict(enumerate(data["labels"].cat.categories))
data["labels"] = data["labels"].cat.codes

# Split the RAW text first so the test set contains only real, unseen
# articles. Stratifying keeps the class proportions equal in both splits.
train_df, test_df = train_test_split(
    data[["text", "labels"]],
    test_size=0.2,
    random_state=42,
    stratify=data["labels"],
)

# Balance the classes in the training split only, by re-drawing existing
# minority-class articles with replacement.
# NOTE: SMOTE on TF-IDF vectors followed by `inverse_transform` is deliberately
# not used: it yields unordered bags of vocabulary terms rather than sentences,
# which does not match the natural text the model receives at prediction time,
# and resampling before the split leaks near-duplicates into the test set.
class_counts = train_df["labels"].value_counts()
target_count = int(class_counts.max())
extra_rows = [
    train_df[train_df["labels"] == label].sample(
        n=target_count - int(count), replace=True, random_state=42
    )
    for label, count in class_counts.items()
    if count < target_count
]
train_df = pd.concat([train_df, *extra_rows], ignore_index=True)

# Shuffle so the appended duplicates are not grouped at the end of the frame.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Build the ALBERT classifier.
# Hyper-parameters forwarded to simpletransformers through `args`.
albert_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "num_train_epochs": 5,  # more epochs
    "learning_rate": 2e-5,  # better learning rate
    "train_batch_size": 16,
    "eval_batch_size": 16,
    "max_seq_length": 128,
    "output_dir": "outputs/",
    "save_steps": -1,
}

# One output unit per distinct encoded label in the dataset.
num_classes = len(data["labels"].unique())

model = ClassificationModel(
    "albert",
    "albert-base-v2",
    num_labels=num_classes,
    args=albert_args,
    use_cuda=False,  # can be set to True when CUDA is available
)

# Train the model on the training frame.
model.train_model(train_df)

# Evaluate on the test frame; accuracy_score is passed as the extra metric
# "acc" and read back from `result` below.
evaluation = model.eval_model(test_df, acc=accuracy_score)
result, model_outputs, wrong_predictions = evaluation

# Performance report: overall accuracy, then per-class metrics.
print("Accuracy:", result["acc"])
# The predicted class of each sample is the index of its largest output score.
y_pred = [scores.argmax() for scores in model_outputs]
y_true = test_df["labels"].tolist()
report = classification_report(y_true, y_pred)
print(report)

# Gradio arayüzü

def classify_text(input_text: str) -> str:
    """Return the predicted category name for ``input_text``.

    Args:
        input_text: Text entered by the user in the interface.

    Returns:
        The category name looked up in ``label_map`` for the model's
        prediction, or a short prompt asking for input when the text is
        empty or contains only whitespace.
    """
    # Guard clause: do not run the model on an empty submission, which would
    # otherwise still produce some (meaningless) category.
    if not input_text or not input_text.strip():
        return "Lütfen bir metin girin."

    predictions, _ = model.predict([input_text])
    # Cast to a plain int so the lookup uses the same key type as `label_map`.
    return label_map[int(predictions[0])]

# Multi-line text box in which the user types the text to classify.
news_input = gr.Textbox(lines=5, placeholder="Metni buraya yazın...")

# Wire the classifier function to a simple text-in / text-out interface.
interface = gr.Interface(
    fn=classify_text,
    inputs=news_input,
    outputs="text",
    title="Haber Sınıflandırıcı",
    description="Bir metin girin ve sınıflandırmasını öğrenin.",
)

# Start the interface.
interface.launch()