In [2]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk

# Download the required NLTK data (one-time setup)
# nltk.download('stopwords')
# nltk.download('wordnet')

def to_lowercase(text):
    """Return ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

def remove_numbers(text, replace_with_tag=False):
    """Strip digit runs from ``text`` or replace each run with ``<number>``.

    Args:
        text: Input string.
        replace_with_tag: When truthy, every run of digits becomes the literal
            ``<number>``; otherwise the digits are deleted.

    Returns:
        The transformed string.
    """
    substitute = '<number>' if replace_with_tag else ''
    return re.sub(r'\d+', substitute, text)

# Per-language cache of stop-word sets, so the NLTK corpus is read once per
# language instead of once per call (this function is applied row by row).
_STOPWORD_CACHE = {}


def remove_stopwords(text, lang='english'):
    """Remove NLTK stop words from a whitespace-tokenised string.

    Args:
        text: Input string; it is split on whitespace and each token is
            compared verbatim (case-sensitively) against the stop-word list.
        lang: Name of the NLTK stop-word list to use.

    Returns:
        The surviving tokens joined with single spaces.
    """
    stop_words = _STOPWORD_CACHE.get(lang)
    if stop_words is None:
        stop_words = frozenset(stopwords.words(lang))
        _STOPWORD_CACHE[lang] = stop_words
    return " ".join(word for word in text.split() if word not in stop_words)

def apply_stemming(text):
    """Stem each whitespace-separated token of ``text`` with a Porter stemmer.

    Returns the stemmed tokens joined with single spaces.
    """
    porter = PorterStemmer()
    stemmed_tokens = map(porter.stem, text.split())
    return " ".join(stemmed_tokens)

def apply_lemmatization(text):
    """Lemmatize each whitespace-separated token of ``text`` with WordNet.

    Returns the lemmatized tokens joined with single spaces.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in text.split():
        lemmas.append(wordnet.lemmatize(token))
    return " ".join(lemmas)

def preprocess_text(
    text,
    lowercase=True,
    remove_punct=True,
    remove_nums=True,
    replace_nums_with_tag=False,
    remove_sw=True,
    apply_stem=False,
    apply_lemma=True,
    lang='english'
):
    """Run the configurable text-cleaning pipeline over a single string.

    The enabled steps always run in this fixed order: lowercase, punctuation
    removal, number removal/tagging, stop-word removal, stemming,
    lemmatization.

    Args:
        text: Raw input string.
        lowercase: Convert the text to lowercase.
        remove_punct: Strip punctuation characters.
        remove_nums: Remove digit runs (or tag them, see next flag).
        replace_nums_with_tag: With ``remove_nums`` enabled, replace digit
            runs with ``<number>`` instead of deleting them.
        remove_sw: Drop stop words for ``lang``.
        apply_stem: Apply Porter stemming.
        apply_lemma: Apply WordNet lemmatization.
        lang: Stop-word language passed to ``remove_stopwords``.

    Returns:
        The cleaned string.
    """
    # (enabled?, step) pairs; lambdas defer the helper lookup until a step
    # actually runs, exactly like the guarded calls they replace.
    pipeline = (
        (lowercase, lambda s: to_lowercase(s)),
        (remove_punct, lambda s: remove_punctuation(s)),
        (remove_nums, lambda s: remove_numbers(s, replace_with_tag=replace_nums_with_tag)),
        (remove_sw, lambda s: remove_stopwords(s, lang=lang)),
        (apply_stem, lambda s: apply_stemming(s)),
        (apply_lemma, lambda s: apply_lemmatization(s)),
    )
    for enabled, step in pipeline:
        if enabled:
            text = step(text)
    return text

# Load the data (the file's first row is treated as a header and replaced by `names`)
dataset = pd.read_csv('data.csv', delimiter=',', header=0, names=['review','sentiment'])

# Text preprocessing (preprocess_text is applied with its default flags)
dataset['cleaned_review'] = dataset['review'].apply(preprocess_text)

# Convert the labels to numeric form (values other than 'positive'/'negative' map to NaN)
dataset['label'] = dataset['sentiment'].map({'positive':1, 'negative':0})

# Train/test split
X = dataset['cleaned_review'].values
y = dataset['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenization and sequencing (using the Keras Tokenizer)
max_num_words = 10000  # Upper bound on the vocabulary size
max_sequence_length = 200  # Maximum length of each review
tokenizer = Tokenizer(num_words=max_num_words, oov_token="<UNK>")
# The vocabulary is fitted on the training texts only; the test texts are just encoded with it.
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences (fill short sequences up to max_sequence_length, padding at the end)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_sequence_length, padding='post')



In [3]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

############################
# GRU model with 128 units, without GloVe embeddings
############################

def create_gru_model(vocab_size, embedding_dim, max_sequence_length, embedding_matrix=None, trainable=True, gru_units=128):
    """Build and compile a bidirectional GRU binary classifier.

    Architecture: Embedding -> Bidirectional(GRU) -> Dropout -> Dense(64, relu)
    -> Dropout -> Dense(1, sigmoid). Compiled with binary cross-entropy,
    Adam(1e-3) and the accuracy metric.

    Args:
        vocab_size: Number of rows of the embedding table (``input_dim``).
        embedding_dim: Width of each embedding vector (``output_dim``).
        max_sequence_length: Length of the padded input sequences.
        embedding_matrix: Optional pretrained weights of shape
            ``(vocab_size, embedding_dim)``. When ``None`` the embedding is
            initialised by Keras.
        trainable: Whether pretrained embeddings are fine-tuned. Only applied
            when ``embedding_matrix`` is given.
        gru_units: Number of units in each direction of the GRU layer.

    Returns:
        The compiled Keras ``Sequential`` model.

    Raises:
        ValueError: If ``embedding_matrix`` is given but its shape is not
            ``(vocab_size, embedding_dim)``.
    """
    embedding_kwargs = dict(input_dim=vocab_size,
                            output_dim=embedding_dim,
                            input_length=max_sequence_length)

    if embedding_matrix is not None:
        # Fail early with a readable message instead of a shape error raised
        # deep inside Keras when the weights are assigned.
        expected_shape = (vocab_size, embedding_dim)
        actual_shape = tuple(np.shape(embedding_matrix))
        if actual_shape != expected_shape:
            raise ValueError(
                f"embedding_matrix has shape {actual_shape}, expected {expected_shape}"
            )
        embedding_kwargs.update(weights=[embedding_matrix], trainable=trainable)

    model = Sequential()
    model.add(Embedding(**embedding_kwargs))

    # Bidirectional GRU (return_sequences=False yields a single output vector)
    model.add(Bidirectional(GRU(gru_units, dropout=0.3, recurrent_dropout=0, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    # NOTE(review): the explicit float32 dtype presumably keeps the output in
    # full precision under a mixed-precision policy - confirm.
    model.add(Dense(1, activation='sigmoid', dtype='float32'))

    model.compile(loss='binary_crossentropy', optimizer=Adam(1e-3), metrics=['accuracy'])
    return model

# Vocabulary size (taken from the tokenizer, capped at 10000 entries)
vocab_size = min(len(tokenizer.word_index) + 1, 10000)
max_sequence_length = 200
embedding_dim = 100
batch_size = 128
epochs = 10

model_gru = create_gru_model(vocab_size, embedding_dim, max_sequence_length, embedding_matrix=None, trainable=True)
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the cell output.
model_gru.summary()

# Callbacks: stop early / halve the learning rate when val_loss stalls, and
# keep only the checkpoint with the best val_loss.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-5, verbose=1)
model_checkpoint = ModelCheckpoint('models/gru/gru_model_128.keras', monitor='val_loss', save_best_only=True, verbose=1)

history = model_gru.fit(
    X_train_padded,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr, model_checkpoint]
)

loss_gru, acc_gru = model_gru.evaluate(X_test_padded, y_test, verbose=0)
print(f"GRU Test Accuracy: {acc_gru:.4f}")




None
Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 718ms/step - accuracy: 0.6447 - loss: 0.6012
Epoch 1: val_loss improved from inf to 0.34175, saving model to models/gru/gru_model_128.keras
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m232s[0m 791ms/step - accuracy: 0.6451 - loss: 0.6007 - val_accuracy: 0.8620 - val_loss: 0.3417 - learning_rate: 0.0010
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 765ms/step - accuracy: 0.9203 - loss: 0.2172
Epoch 3: val_loss improved from 0.32974 to 0.29673, saving model to models/gru/gru_model_128.keras
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 851ms/step - accuracy: 0.9203 - loss: 0.2172 - val_accuracy: 0.8845 - val_loss: 0.2967 - learning_rate: 0.0010
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 657ms/step - accuracy: 0.9395 - loss: 0.1719
Epoch 4: val_loss did not improve from 0.29673
[1m250/250[0m [32m━━━━━━━━━

In [5]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

############################
# GRU model with 64 units, without GloVe embeddings
############################

def create_gru_model(vocab_size, embedding_dim, max_sequence_length, embedding_matrix=None, trainable=True, gru_units=64):
    """Build and compile a bidirectional GRU binary classifier.

    Architecture: Embedding -> Bidirectional(GRU) -> Dropout -> Dense(64, relu)
    -> Dropout -> Dense(1, sigmoid). Compiled with binary cross-entropy,
    Adam(1e-3) and the accuracy metric.

    Args:
        vocab_size: Number of rows of the embedding table (``input_dim``).
        embedding_dim: Width of each embedding vector (``output_dim``).
        max_sequence_length: Length of the padded input sequences.
        embedding_matrix: Optional pretrained weights of shape
            ``(vocab_size, embedding_dim)``. When ``None`` the embedding is
            initialised by Keras.
        trainable: Whether pretrained embeddings are fine-tuned. Only applied
            when ``embedding_matrix`` is given.
        gru_units: Number of units in each direction of the GRU layer.

    Returns:
        The compiled Keras ``Sequential`` model.

    Raises:
        ValueError: If ``embedding_matrix`` is given but its shape is not
            ``(vocab_size, embedding_dim)``.
    """
    embedding_kwargs = dict(input_dim=vocab_size,
                            output_dim=embedding_dim,
                            input_length=max_sequence_length)

    if embedding_matrix is not None:
        # Fail early with a readable message instead of a shape error raised
        # deep inside Keras when the weights are assigned.
        expected_shape = (vocab_size, embedding_dim)
        actual_shape = tuple(np.shape(embedding_matrix))
        if actual_shape != expected_shape:
            raise ValueError(
                f"embedding_matrix has shape {actual_shape}, expected {expected_shape}"
            )
        embedding_kwargs.update(weights=[embedding_matrix], trainable=trainable)

    model = Sequential()
    model.add(Embedding(**embedding_kwargs))

    # Bidirectional GRU (return_sequences=False yields a single output vector)
    model.add(Bidirectional(GRU(gru_units, dropout=0.3, recurrent_dropout=0, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    # NOTE(review): the explicit float32 dtype presumably keeps the output in
    # full precision under a mixed-precision policy - confirm.
    model.add(Dense(1, activation='sigmoid', dtype='float32'))

    model.compile(loss='binary_crossentropy', optimizer=Adam(1e-3), metrics=['accuracy'])
    return model

# Vocabulary size (taken from the tokenizer, capped at 10000 entries)
vocab_size = min(len(tokenizer.word_index) + 1, 10000)
max_sequence_length = 200
embedding_dim = 100
batch_size = 128
epochs = 10

model_gru = create_gru_model(vocab_size, embedding_dim, max_sequence_length, embedding_matrix=None, trainable=True)
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the cell output.
model_gru.summary()

# Callbacks: stop early / halve the learning rate when val_loss stalls, and
# keep only the checkpoint with the best val_loss.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-5, verbose=1)
model_checkpoint = ModelCheckpoint('models/gru/gru_model_64.keras', monitor='val_loss', save_best_only=True, verbose=1)

history = model_gru.fit(
    X_train_padded,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr, model_checkpoint]
)

loss_gru, acc_gru = model_gru.evaluate(X_test_padded, y_test, verbose=0)
print(f"GRU Test Accuracy: {acc_gru:.4f}")




None
Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 288ms/step - accuracy: 0.6501 - loss: 0.5891
Epoch 1: val_loss improved from inf to 0.34670, saving model to models/gru/gru_model_64.keras
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 311ms/step - accuracy: 0.6505 - loss: 0.5887 - val_accuracy: 0.8564 - val_loss: 0.3467 - learning_rate: 0.0010
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 308ms/step - accuracy: 0.8930 - loss: 0.2852
Epoch 2: val_loss improved from 0.34670 to 0.31227, saving model to models/gru/gru_model_64.keras
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 324ms/step - accuracy: 0.8930 - loss: 0.2852 - val_accuracy: 0.8735 - val_loss: 0.3123 - learning_rate: 0.0010
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 298ms/step - accuracy: 0.9248 - loss: 0.2109
Epoch 3: val_loss improved from 0.31227 to 0.29990, saving model to models/gru/gr

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.optimizers import Adam
from backend.preprocessing import preprocess_text  # Kendi preprocessing fonksiyonunuzu içeren modül


################################
# GRU model with 64 units, with GloVe embeddings
################################

def load_glove_embeddings(filepath, word_index, embedding_dim=100, vocab_size=None):
    """Build an embedding matrix for ``word_index`` from a GloVe text file.

    Each line of the file is expected to hold a token followed by
    ``embedding_dim`` whitespace-separated floats. The file is streamed once
    and only vectors of words present in ``word_index`` are parsed, so the
    full GloVe vocabulary is never held in memory.

    Args:
        filepath: Path to the UTF-8 encoded GloVe file.
        word_index: Mapping of word -> integer row index.
        embedding_dim: Number of components per vector.
        vocab_size: Number of rows of the returned matrix; defaults to
            ``len(word_index) + 1``. Words whose index is ``>= vocab_size``
            are ignored.

    Returns:
        A ``(vocab_size, embedding_dim)`` NumPy array. Rows of words that are
        absent from the file stay all-zero.
    """
    if vocab_size is None:
        vocab_size = len(word_index) + 1

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Skip blank lines and lines that are too short to contain a token
            # plus a full vector; they would otherwise crash the loader.
            if len(values) <= embedding_dim:
                continue
            # The vector is the last `embedding_dim` fields; anything before
            # it is the token (some GloVe releases contain tokens with spaces).
            word = " ".join(values[:-embedding_dim])
            i = word_index.get(word)
            if i is None or i >= vocab_size:
                continue
            embedding_matrix[i] = np.asarray(values[-embedding_dim:], dtype='float32')
    return embedding_matrix

def create_gru_model(vocab_size, embedding_dim, max_sequence_length, embedding_matrix, trainable=True, gru_units=64):
    """Build and compile a bidirectional GRU classifier on pretrained embeddings.

    Architecture: Embedding (initialised from ``embedding_matrix``) ->
    Bidirectional(GRU) -> Dropout -> Dense(64, relu) -> Dropout ->
    Dense(1, sigmoid). Compiled with Adam(0.001), binary cross-entropy and
    the accuracy metric.

    Args:
        vocab_size: Number of rows of the embedding table (``input_dim``).
        embedding_dim: Width of each embedding vector (``output_dim``).
        max_sequence_length: Length of the padded input sequences.
        embedding_matrix: Pretrained weights of shape
            ``(vocab_size, embedding_dim)``.
        trainable: Whether the embedding weights are updated during training.
        gru_units: Number of units in each direction of the GRU layer.

    Returns:
        The compiled Keras ``Sequential`` model.

    Raises:
        ValueError: If ``embedding_matrix`` does not have shape
            ``(vocab_size, embedding_dim)``.
    """
    # Fail early with a readable message instead of a shape error raised deep
    # inside Keras when the weights are assigned.
    expected_shape = (vocab_size, embedding_dim)
    actual_shape = tuple(np.shape(embedding_matrix))
    if actual_shape != expected_shape:
        raise ValueError(
            f"embedding_matrix has shape {actual_shape}, expected {expected_shape}"
        )

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        input_length=max_sequence_length,
                        trainable=trainable))
    model.add(Bidirectional(GRU(gru_units, dropout=0.3, recurrent_dropout=0, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    # NOTE(review): the explicit float32 dtype presumably keeps the output in
    # full precision under a mixed-precision policy - confirm.
    model.add(Dense(1, activation='sigmoid', dtype='float32'))
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Load and preprocess the data
dataset = pd.read_csv('data.csv', delimiter=',', header=0, names=['review','sentiment'])
dataset['cleaned_review'] = dataset['review'].apply(preprocess_text)
dataset['label'] = dataset['sentiment'].map({'positive':1, 'negative':0})

review_texts = dataset['cleaned_review'].values
labels = dataset['label'].values

# Index the vocabulary with the tokenizer
# NOTE(review): the tokenizer is fitted on all reviews before the train/test
# split, so test-set words enter the vocabulary - confirm this is intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_texts)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Convert the texts to sequences and pad them
max_sequence_length = 100
sequences = tokenizer.texts_to_sequences(review_texts)
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = np.array(labels)

# Train/test split
X_train_padded, X_test_padded, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the GloVe vectors
embedding_dim = 100
glove_path = 'glove.6B.100d.txt'
glove_embedding_matrix = load_glove_embeddings(glove_path, word_index, embedding_dim=embedding_dim, vocab_size=vocab_size)

# Build the model
model_glove_gru = create_gru_model(vocab_size, embedding_dim, max_sequence_length, embedding_matrix=glove_embedding_matrix, trainable=True)
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the cell output.
model_glove_gru.summary()

# Train the model
model_glove_gru.fit(X_train_padded, y_train, batch_size=64, epochs=5, validation_split=0.2)

# Evaluate the model
loss_glove, acc_glove = model_glove_gru.evaluate(X_test_padded, y_test, verbose=0)
print(f"GloVe Embedding GRU (64) Test Accuracy: {acc_glove:.4f}")

# Save the model; the confirmation message reports the path actually written
model_glove_gru.save('models/gru/gru_model_64_glove.h5')
print("Model kaydedildi: models/gru/gru_model_64_glove.h5")


In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.optimizers import Adam
from backend.preprocessing import preprocess_text  # Kendi preprocessing fonksiyonunuzu içeren modül


################################
# GRU model with 128 units, with GloVe embeddings
################################

def load_glove_embeddings(filepath, word_index, embedding_dim=100, vocab_size=None):
    """Build an embedding matrix for ``word_index`` from a GloVe text file.

    Each line of the file is expected to hold a token followed by
    ``embedding_dim`` whitespace-separated floats. The file is streamed once
    and only vectors of words present in ``word_index`` are parsed, so the
    full GloVe vocabulary is never held in memory.

    Args:
        filepath: Path to the UTF-8 encoded GloVe file.
        word_index: Mapping of word -> integer row index.
        embedding_dim: Number of components per vector.
        vocab_size: Number of rows of the returned matrix; defaults to
            ``len(word_index) + 1``. Words whose index is ``>= vocab_size``
            are ignored.

    Returns:
        A ``(vocab_size, embedding_dim)`` NumPy array. Rows of words that are
        absent from the file stay all-zero.
    """
    if vocab_size is None:
        vocab_size = len(word_index) + 1

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Skip blank lines and lines that are too short to contain a token
            # plus a full vector; they would otherwise crash the loader.
            if len(values) <= embedding_dim:
                continue
            # The vector is the last `embedding_dim` fields; anything before
            # it is the token (some GloVe releases contain tokens with spaces).
            word = " ".join(values[:-embedding_dim])
            i = word_index.get(word)
            if i is None or i >= vocab_size:
                continue
            embedding_matrix[i] = np.asarray(values[-embedding_dim:], dtype='float32')
    return embedding_matrix

def create_gru_model(vocab_size, embedding_dim, max_sequence_length, embedding_matrix, trainable=True, gru_units=128):
    """Build and compile a bidirectional GRU classifier on pretrained embeddings.

    Architecture: Embedding (initialised from ``embedding_matrix``) ->
    Bidirectional(GRU) -> Dropout -> Dense(64, relu) -> Dropout ->
    Dense(1, sigmoid). Compiled with Adam(0.001), binary cross-entropy and
    the accuracy metric.

    Args:
        vocab_size: Number of rows of the embedding table (``input_dim``).
        embedding_dim: Width of each embedding vector (``output_dim``).
        max_sequence_length: Length of the padded input sequences.
        embedding_matrix: Pretrained weights of shape
            ``(vocab_size, embedding_dim)``.
        trainable: Whether the embedding weights are updated during training.
        gru_units: Number of units in each direction of the GRU layer.

    Returns:
        The compiled Keras ``Sequential`` model.

    Raises:
        ValueError: If ``embedding_matrix`` does not have shape
            ``(vocab_size, embedding_dim)``.
    """
    # Fail early with a readable message instead of a shape error raised deep
    # inside Keras when the weights are assigned.
    expected_shape = (vocab_size, embedding_dim)
    actual_shape = tuple(np.shape(embedding_matrix))
    if actual_shape != expected_shape:
        raise ValueError(
            f"embedding_matrix has shape {actual_shape}, expected {expected_shape}"
        )

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        input_length=max_sequence_length,
                        trainable=trainable))

    model.add(Bidirectional(GRU(gru_units, dropout=0.3, recurrent_dropout=0, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    # NOTE(review): the explicit float32 dtype presumably keeps the output in
    # full precision under a mixed-precision policy - confirm.
    model.add(Dense(1, activation='sigmoid', dtype='float32'))
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Load and preprocess the data
dataset = pd.read_csv('data.csv', delimiter=',', header=0, names=['review','sentiment'])
dataset['cleaned_review'] = dataset['review'].apply(preprocess_text)
dataset['label'] = dataset['sentiment'].map({'positive':1, 'negative':0})

review_texts = dataset['cleaned_review'].values
labels = dataset['label'].values

# Index the vocabulary with the tokenizer
# NOTE(review): the tokenizer is fitted on all reviews before the train/test
# split, so test-set words enter the vocabulary - confirm this is intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_texts)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Convert the texts to sequences and pad them
max_sequence_length = 100
sequences = tokenizer.texts_to_sequences(review_texts)
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = np.array(labels)

# Train/test split
X_train_padded, X_test_padded, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the GloVe vectors
embedding_dim = 100
glove_path = 'glove.6B.100d.txt'
glove_embedding_matrix = load_glove_embeddings(glove_path, word_index, embedding_dim=embedding_dim, vocab_size=vocab_size)

# Build the model
model_glove_gru = create_gru_model(vocab_size, embedding_dim, max_sequence_length, embedding_matrix=glove_embedding_matrix, trainable=True)
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the cell output.
model_glove_gru.summary()

# Train the model
model_glove_gru.fit(X_train_padded, y_train, batch_size=64, epochs=5, validation_split=0.2)

# Evaluate the model
loss_glove, acc_glove = model_glove_gru.evaluate(X_test_padded, y_test, verbose=0)
print(f"GloVe Embedding GRU Test Accuracy: {acc_glove:.4f}")

# Save the model
model_glove_gru.save('models/gru/gru_model_128_glove.h5')
print("Model kaydedildi: models/gru/gru_model_128_glove.h5")




None
Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 282ms/step - accuracy: 0.6778 - loss: 0.5773 - val_accuracy: 0.8609 - val_loss: 0.3426
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 316ms/step - accuracy: 0.8714 - loss: 0.3107 - val_accuracy: 0.8836 - val_loss: 0.2790
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 332ms/step - accuracy: 0.9245 - loss: 0.1976 - val_accuracy: 0.8814 - val_loss: 0.2937
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 347ms/step - accuracy: 0.9589 - loss: 0.1145 - val_accuracy: 0.8804 - val_loss: 0.3311
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 344ms/step - accuracy: 0.9793 - loss: 0.0605 - val_accuracy: 0.8764 - val_loss: 0.4448




GloVe Embedding GRU Test Accuracy: 0.8804
Model kaydedildi: models/gru/gru_model_128_glove.h5
