In [None]:
# Gerekli kütüphaneleri kur
!pip install -q tensorflow
!pip install -q gradio
!pip install -q nltk

In [None]:
import pandas as pd
import numpy as np
import re
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Conv1D, GlobalMaxPooling1D, Concatenate, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Download the NLTK stop-word corpus (the Turkish list is used during text cleaning)
nltk.download('stopwords')

# Reproducibility: seeds Python's `random`, NumPy and TensorFlow in one call,
# so weight initialisation and batch shuffling repeat across runs.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Model / training hyperparameters
MAX_WORDS = 15000     # vocabulary cap for the Tokenizer and the Embedding input size
MAX_LEN = 250         # every sequence is padded/truncated to this many tokens
EMBEDDING_DIM = 256   # width of the learned word vectors
BATCH_SIZE = 32       # mini-batch size passed to model.fit
EPOCHS = 25           # upper bound on epochs; early stopping may end training sooner

print("Kutuphaneler yuklendi.")

In [None]:
# Load the CSV file through the Colab upload widget
import io
from google.colab import files

print("Lutfen haberler_top5.csv dosyasini yukleyin:")
uploaded = files.upload()

# Read the data.
# Parse the bytes returned by the upload widget instead of re-opening a
# hard-coded path: the copy on disk can be stale or stored under a different
# name (e.g. after a repeated upload), and the user may have picked a file
# with another name altogether.
csv_name = 'haberler_top5.csv'
if uploaded:
    upload_key = csv_name if csv_name in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[upload_key]))
else:
    # Upload was cancelled: fall back to a copy that is already on disk.
    df = pd.read_csv(csv_name)

# Rows without text or label cannot be cleaned or encoded, so drop them.
df = df.dropna(subset=['text', 'category'])

# Remove the 'genel' category from the data set.
# .copy() detaches the result from the original frame so that adding columns
# later does not raise SettingWithCopyWarning.
df = df[df['category'] != 'genel'].copy()

print("Kategori dagilimi:")
print(df['category'].value_counts())

# Turkish stop words used by the text-cleaning function
stop_words = set(stopwords.words('turkish'))

def preprocess_text(text, stopword_set=None):
    """Normalize a raw news text before tokenization.

    Applies, in order: Turkish-aware lowercasing, digit removal, punctuation
    removal and stop-word removal.

    Args:
        text: Raw text. Non-string values (``None``, or the ``NaN`` pandas
            produces for empty CSV cells) are treated as empty text.
        stopword_set: Optional collection of lowercase words to drop.
            Defaults to the module-level ``stop_words`` set.

    Returns:
        The cleaned text with tokens joined by single spaces, or ``''`` when
        nothing is left.
    """
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    # str.lower() is locale-independent: it turns 'I' into 'i' (Turkish needs
    # the dotless 'ı') and 'İ' into 'i' + a combining dot. Map both by hand
    # first so that e.g. "IŞIK" and "ışık" end up as the same token.
    text = text.replace('I', 'ı').replace('İ', 'i').lower()
    text = re.sub(r'\d+', '', text)      # strip digits
    text = re.sub(r'[^\w\s]', '', text)  # strip punctuation
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Add the cleaned text as a new column. `assign` builds a new frame instead of
# writing into `df`, which is a filtered slice and would otherwise trigger
# pandas' SettingWithCopyWarning.
df = df.assign(text_clean=df['text'].apply(preprocess_text))
print("Veri on isleme tamamlandi.")

In [None]:
# Sanity checks on the preprocessed data.
# NOTE: this cell used to be a verbatim copy of the previous cell. It asked for
# the CSV upload a second time and redefined preprocess_text, so it has been
# replaced by checks on the frame the previous cell already produced.
assert 'text_clean' in df.columns, "text_clean column is missing - run the preprocessing cell first"
assert (df['category'] != 'genel').all(), "'genel' rows should have been filtered out"
assert len(df) > 0, "no rows left after filtering"

# Preview raw vs. cleaned text
df[['category', 'text', 'text_clean']].head()

In [None]:
# Label encoding: category names -> integer ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['category'])

# Train/test split on row positions, done BEFORE fitting the tokenizer.
# stratify=y keeps the category proportions the same in both parts.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42, stratify=y
)

# Tokenizer: the vocabulary is learned from the training texts only, so no
# information from the test texts leaks into it. Test-only words map to <OOV>.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(df['text_clean'].iloc[train_idx])
sequences = tokenizer.texts_to_sequences(df['text_clean'])
X = pad_sequences(sequences, maxlen=MAX_LEN)

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

print("Veri setleri hazirlandi.")
print(f"Egitim boyutu: {X_train.shape}")
print(f"Test boyutu: {X_test.shape}")

In [None]:
def build_hybrid_model(num_classes, max_len=None, vocab_size=None, embedding_dim=None,
                       conv_filters=128, kernel_size=5, lstm_units=128,
                       dense_units=128, dropout_rate=0.5):
    """Build the (uncompiled) hybrid CNN + BiLSTM text classifier.

    A shared embedding feeds two parallel branches - a Conv1D with global max
    pooling and a bidirectional LSTM - whose outputs are concatenated and
    passed through a dense layer with dropout to a softmax output.

    Args:
        num_classes: Number of output classes (must be at least 2).
        max_len: Input sequence length. Defaults to ``MAX_LEN``.
        vocab_size: Embedding input dimension. Defaults to ``MAX_WORDS``.
        embedding_dim: Embedding output dimension. Defaults to ``EMBEDDING_DIM``.
        conv_filters: Number of filters in the Conv1D branch.
        kernel_size: Kernel width of the Conv1D branch.
        lstm_units: Units per direction in the bidirectional LSTM branch.
        dense_units: Units in the dense layer after concatenation.
        dropout_rate: Dropout rate applied after the dense layer.

    Returns:
        A ``keras.Model`` mapping ``(batch, max_len)`` token ids to
        ``(batch, num_classes)`` class probabilities.

    Raises:
        ValueError: If ``num_classes`` is smaller than 2.
    """
    if num_classes < 2:
        raise ValueError(f"num_classes must be at least 2, got {num_classes}")

    # Resolve the notebook-level defaults at call time, not definition time.
    max_len = MAX_LEN if max_len is None else max_len
    vocab_size = MAX_WORDS if vocab_size is None else vocab_size
    embedding_dim = EMBEDDING_DIM if embedding_dim is None else embedding_dim

    inputs = Input(shape=(max_len,))

    # Shared embedding layer
    x = Embedding(vocab_size, embedding_dim)(inputs)

    # Branch 1: CNN (local n-gram feature extraction)
    x1 = Conv1D(conv_filters, kernel_size, activation='relu')(x)
    x1 = GlobalMaxPooling1D()(x1)

    # Branch 2: bidirectional LSTM (sequence modelling)
    x2 = Bidirectional(LSTM(lstm_units))(x)

    # Merge both branches
    concat = Concatenate()([x1, x2])

    # Classification head
    x = Dense(dense_units, activation='relu')(concat)
    x = Dropout(dropout_rate)(x)
    outputs = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=outputs)
    return model

# One output unit per distinct encoded label.
num_classes = np.unique(y).size
model = build_hybrid_model(num_classes)

# Labels are integer ids, hence the sparse variant of the cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train with early stopping.
# Validation uses a slice held out from the training data (validation_split),
# not the test set: early stopping with restore_best_weights picks the best
# epoch on the validation data, so validating on the test set would make the
# test metrics reported afterwards optimistically biased.
# validation_split takes the last 10% of X_train, which train_test_split has
# already shuffled.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    callbacks=[early_stop]
)
print("Egitim tamamlandi.")

In [None]:
# Predictions: class probabilities -> most likely class id
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# Accuracy
test_accuracy = accuracy_score(y_test, y_pred_classes)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Pin the label order explicitly. Without it, a class that is missing from
# both y_test and the predictions would shrink the report/matrix and no longer
# line up with the class names used as row/column labels.
class_ids = np.arange(len(label_encoder.classes_))

# Classification report
print(classification_report(y_test, y_pred_classes, labels=class_ids, target_names=label_encoder.classes_))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred_classes, labels=class_ids)
fig_cm, ax_cm = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_, ax=ax_cm)
ax_cm.set_title('Confusion Matrix')
ax_cm.set_ylabel('Gercek')
ax_cm.set_xlabel('Tahmin')
fig_cm.savefig('confusion_matrix.png')
plt.show()

# Training curves
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

ax1.plot(history.history['accuracy'], label='Train Accuracy')
ax1.plot(history.history['val_accuracy'], label='Val Accuracy')
ax1.set_title('Model Accuracy')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Accuracy')
ax1.legend()

ax2.plot(history.history['loss'], label='Train Loss')
ax2.plot(history.history['val_loss'], label='Val Loss')
ax2.set_title('Model Loss')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Loss')
ax2.legend()

fig.savefig('training_history.png')
plt.show()

In [None]:
# Persist the trained model together with the fitted preprocessing objects
model.save('news_classifier_hybrid.keras')

# The tokenizer and the label encoder are needed again at inference time
for pkl_path, obj in (('tokenizer.pkl', tokenizer), ('label_encoder.pkl', label_encoder)):
    with open(pkl_path, 'wb') as f:
        pickle.dump(obj, f)

print("Dosyalar kaydedildi, indirme hazirlaniyor...")

# Download the artifacts from the Colab runtime
from google.colab import files
for artifact in (
    'news_classifier_hybrid.keras',
    'tokenizer.pkl',
    'label_encoder.pkl',
    'confusion_matrix.png',
    'training_history.png',
):
    files.download(artifact)

In [None]:
import gradio as gr

def predict_category(text):
    """Classify one news text and report the model's confidence.

    Args:
        text: Raw news text as typed into the UI.

    Returns:
        A ``(category_message, confidence_message)`` tuple of strings. When
        the input is empty, or nothing is left after cleaning, a prompt asking
        for text is returned with an empty confidence string.
    """
    empty_result = ("Lutfen bir haber metni girin.", "")

    # Without this guard an empty input becomes an all-padding sequence and
    # the model would still "predict" a category for it.
    if not isinstance(text, str) or not text.strip():
        return empty_result

    # Same cleaning as applied to the training data
    cleaned_text = preprocess_text(text)
    if not cleaned_text:
        return empty_result

    # Tokenize and pad to the model's input length
    seq = tokenizer.texts_to_sequences([cleaned_text])
    padded = pad_sequences(seq, maxlen=MAX_LEN)

    # Predict; verbose=0 keeps the progress bar out of the output on every request
    probs = model.predict(padded, verbose=0)[0]
    pred_class_index = int(np.argmax(probs))
    confidence = float(probs[pred_class_index])

    result = label_encoder.classes_[pred_class_index]

    return f"Kategori: {result}", f"Guven Skoru: {confidence:.2f}"

# User interface: one free-text input, two read-only text outputs
text_input = gr.Textbox(lines=5, placeholder="Haber metnini buraya girin...")
result_outputs = [gr.Textbox(label="Tahmin"), gr.Textbox(label="Skor")]

demo = gr.Interface(
    fn=predict_category,
    inputs=text_input,
    outputs=result_outputs,
    title="Turkce Haber Siniflandirma (Hybrid Model)",
    description="Spor, Ekonomi, Dunya veya Guncel kategorisindeki haberi yapistirin.",
)

# share=True requests a public link in addition to the local one
demo.launch(share=True)