In [2]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk

# NLTK gerekli verileri indir (bir kereye mahsus)
# nltk.download('stopwords')
# nltk.download('wordnet')

def to_lowercase(text):
    return text.lower()

def remove_punctuation(text):
    return text.translate(str.maketrans("", "", string.punctuation))

def remove_numbers(text, replace_with_tag=False):
    if replace_with_tag:
        return re.sub(r'\d+', '<number>', text)
    else:
        return re.sub(r'\d+', '', text)

def remove_stopwords(text, lang='english'):
    stop_words = set(stopwords.words(lang))
    return " ".join([word for word in text.split() if word not in stop_words])

def apply_stemming(text):
    stemmer = PorterStemmer()
    return " ".join([stemmer.stem(word) for word in text.split()])

def apply_lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

def preprocess_text(
    text,
    lowercase=True,
    remove_punct=True,
    remove_nums=True,
    replace_nums_with_tag=False,
    remove_sw=True,
    apply_stem=False,
    apply_lemma=True,
    lang='english'
):
    if lowercase:
        text = to_lowercase(text)
    if remove_punct:
        text = remove_punctuation(text)
    if remove_nums:
        text = remove_numbers(text, replace_with_tag=replace_nums_with_tag)
    if remove_sw:
        text = remove_stopwords(text, lang=lang)
    if apply_stem:
        text = apply_stemming(text)
    if apply_lemma:
        text = apply_lemmatization(text)
    return text

# Veri yükleme
dataset = pd.read_csv('data.csv', delimiter=',', header=0, names=['review','sentiment'])

# Metin ön işleme
dataset['cleaned_review'] = dataset['review'].apply(preprocess_text)

# Etiketleri sayısal forma dönüştürme
dataset['label'] = dataset['sentiment'].map({'positive':1, 'negative':0})

# Eğitim/Test ayrımı
X = dataset['cleaned_review'].values
y = dataset['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenization ve dizileştirme (Keras Tokenizer kullanarak)
max_num_words = 10000  # Belirli bir kelime sayısı sınırı
max_sequence_length = 200  # Her yorumun maksimum uzunluğu
tokenizer = Tokenizer(num_words=max_num_words, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences (kısa dizileri max_sequence_length'e doldur)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_sequence_length, padding='post')



In [3]:
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import json

# Tokenizer'ı JSON formatında kaydetme
tokenizer_json = tokenizer.to_json()
with open('models/tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(tokenizer_json)

In [None]:
# Tokenizer yükleme
with open('models/tokenizer.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
    loaded_tokenizer = tokenizer_from_json(data)

In [4]:
dataset

Unnamed: 0,review,sentiment,cleaned_review,label
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode you...,1
1,A wonderful little production. <br /><br />The...,positive,wonderful little production br br filming tech...,1
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,1
3,Basically there's a family where a little boy ...,negative,basically there family little boy jake think t...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...,1
...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,thought movie right good job wasnt creative or...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad acting idiotic direc...,0
49997,I am a Catholic taught in parochial elementary...,negative,catholic taught parochial elementary school nu...,0
49998,I'm going to have to disagree with the previou...,negative,im going disagree previous comment side maltin...,0


In [5]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

################################
# 128'lik GloVe'suz LSTM Modeli 
################################

def create_lstm_model(vocab_size, embedding_dim, max_sequence_length, embedding_matrix=None, trainable=True):
    model = Sequential()
    
    if embedding_matrix is not None:
        model.add(Embedding(input_dim=vocab_size,
                            output_dim=embedding_dim,
                            input_length=max_sequence_length,
                            weights=[embedding_matrix],
                            trainable=trainable))
    else:
        model.add(Embedding(input_dim=vocab_size,
                            output_dim=embedding_dim,
                            input_length=max_sequence_length))

    # Bidirectional LSTM (return_sequences=False yapıyoruz)
    model.add(Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid', dtype='float32'))

    model.compile(loss='binary_crossentropy', optimizer=Adam(1e-3), metrics=['accuracy'])
    return model

# Vocab size (tokenizer'dan elde edilir)
vocab_size = min(len(tokenizer.word_index) + 1, 10000)
max_sequence_length = 200
embedding_dim = 100
batch_size = 128
epochs = 10

model_improved = create_lstm_model(vocab_size, embedding_dim, max_sequence_length, embedding_matrix=None, trainable=True)
print(model_improved.summary())

# Callback'ler
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-5, verbose=1)
model_checkpoint = ModelCheckpoint('models/lstm/lstm_model_128.keras', monitor='val_loss', save_best_only=True, verbose=1)

history = model_improved.fit(
    X_train_padded,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr, model_checkpoint]
)

loss_improved, acc_improved = model_improved.evaluate(X_test_padded, y_test, verbose=0)
print(f"Improved LSTM Test Accuracy: {acc_improved:.4f}")




None
Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 434ms/step - accuracy: 0.7023 - loss: 0.5351
Epoch 1: val_loss improved from inf to 0.33752, saving model to models/lstm/lstm_model_128.keras
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 478ms/step - accuracy: 0.7026 - loss: 0.5346 - val_accuracy: 0.8600 - val_loss: 0.3375 - learning_rate: 0.0010
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 591ms/step - accuracy: 0.9081 - loss: 0.2472
Epoch 2: val_loss improved from 0.33752 to 0.31633, saving model to models/lstm/lstm_model_128.keras
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 677ms/step - accuracy: 0.9081 - loss: 0.2472 - val_accuracy: 0.8723 - val_loss: 0.3163 - learning_rate: 0.0010
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 733ms/step - accuracy: 0.9235 - loss: 0.2094
Epoch 3: val_loss improved from 0.31633 to 0.31450, saving model to model

In [9]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

################################
# 64'lük GloVe'suz LSTM Modeli
################################

def create_lstm_model(vocab_size, embedding_dim, max_sequence_length, embedding_matrix=None, trainable=True):
    model = Sequential()
    
    if embedding_matrix is not None:
        model.add(Embedding(input_dim=vocab_size,
                            output_dim=embedding_dim,
                            input_length=max_sequence_length,
                            weights=[embedding_matrix],
                            trainable=trainable))
    else:
        model.add(Embedding(input_dim=vocab_size,
                            output_dim=embedding_dim,
                            input_length=max_sequence_length))

    # Bidirectional LSTM (64 boyutlu)
    model.add(Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid', dtype='float32'))

    model.compile(loss='binary_crossentropy', optimizer=Adam(1e-3), metrics=['accuracy'])
    return model

# Vocab size (tokenizer'dan elde edilir)
vocab_size = min(len(tokenizer.word_index) + 1, 10000)
max_sequence_length = 200
embedding_dim = 100
batch_size = 128
epochs = 10

model_improved = create_lstm_model(vocab_size, embedding_dim, max_sequence_length, embedding_matrix=None, trainable=True)
print(model_improved.summary())

# Callback'ler
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-5, verbose=1)
model_checkpoint = ModelCheckpoint('models/lstm/lstm_model_64.keras', monitor='val_loss', save_best_only=True, verbose=1)

history = model_improved.fit(
    X_train_padded,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr, model_checkpoint]
)

loss_improved, acc_improved = model_improved.evaluate(X_test_padded, y_test, verbose=0)
print(f"Improved LSTM Test Accuracy: {acc_improved:.4f}")




None
Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 432ms/step - accuracy: 0.6691 - loss: 0.5629
Epoch 1: val_loss improved from inf to 0.29543, saving model to models/lstm/lstm_model_64.keras
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 478ms/step - accuracy: 0.6696 - loss: 0.5625 - val_accuracy: 0.8813 - val_loss: 0.2954 - learning_rate: 0.0010
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 459ms/step - accuracy: 0.9078 - loss: 0.2476
Epoch 2: val_loss improved from 0.29543 to 0.28855, saving model to models/lstm/lstm_model_64.keras
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 484ms/step - accuracy: 0.9078 - loss: 0.2477 - val_accuracy: 0.8819 - val_loss: 0.2885 - learning_rate: 0.0010
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 492ms/step - accuracy: 0.9290 - loss: 0.1988
Epoch 3: val_loss did not improve from 0.28855
[1m250/250[0m [32m━━━━━━━

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from backend.preprocessing import preprocess_text  # Kendi preprocessing fonksiyonunuzu içeren modül

################################
# 64'lük GloVe'lu LSTM Modeli
################################

def load_glove_embeddings(filepath, word_index, embedding_dim=100, vocab_size=None):
    embeddings_index = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    
    if vocab_size is None:
        vocab_size = len(word_index) + 1  # +1: tokenizer indeksleri 1'den başlatır
    
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

def create_lstm_model(vocab_size, embedding_dim, max_sequence_length, embedding_matrix, trainable=True):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        input_length=max_sequence_length,
                        trainable=trainable))
    model.add(Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid', dtype='float32'))
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Veriyi yükleme ve ön işleme
dataset = pd.read_csv('data.csv', delimiter=',', header=0, names=['review','sentiment'])
dataset['cleaned_review'] = dataset['review'].apply(preprocess_text)
dataset['label'] = dataset['sentiment'].map({'positive':1, 'negative':0})

# X ve y ayırma
review_texts = dataset['cleaned_review'].values
labels = dataset['label'].values

# Tokenizer ile kelimeleri indeksle
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_texts)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Metinleri dizilere çevir
sequences = tokenizer.texts_to_sequences(review_texts)
max_sequence_length = 100  # Sabit uzunluk
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = np.array(labels)

# Eğitim/Test ayrımı
X_train_padded, X_test_padded, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# GloVe vektörlerini yükle
embedding_dim = 100
glove_path = 'glove.6B.100d.txt'
glove_embedding_matrix = load_glove_embeddings(glove_path, word_index, embedding_dim=embedding_dim, vocab_size=vocab_size)

# Modeli oluştur
model_glove = create_lstm_model(vocab_size, embedding_dim, max_sequence_length, embedding_matrix=glove_embedding_matrix, trainable=True)
print(model_glove.summary())

# Modeli eğit
model_glove.fit(X_train_padded, y_train, batch_size=64, epochs=5, validation_split=0.2)

# Değerlendir
loss_glove, acc_glove = model_glove.evaluate(X_test_padded, y_test, verbose=0)
print(f"GloVe Embedding Test Accuracy: {acc_glove:.4f}")

# Modeli kaydet
model_glove.save('models/lstm/lstm_model_64_glove.h5')
print("Model kaydedildi: models/lstm/lstm_model_64_glove.h5")




None
Epoch 1/5


In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from backend.preprocessing import preprocess_text  # Kendi preprocessing fonksiyonunuzu içeren modül


################################
# 128'lik GloVe'lu LSTM Modeli
################################

def load_glove_embeddings(filepath, word_index, embedding_dim=100, vocab_size=None):
    embeddings_index = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    
    if vocab_size is None:
        vocab_size = len(word_index) + 1
    
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

def create_lstm_model(vocab_size, embedding_dim, max_sequence_length, embedding_matrix, trainable=True):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        input_length=max_sequence_length,
                        trainable=trainable))
    # LSTM boyutu 128 yapıldı
    model.add(Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid', dtype='float32'))
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Veriyi yükleme ve ön işleme
dataset = pd.read_csv('data.csv', delimiter=',', header=0, names=['review','sentiment'])
dataset['cleaned_review'] = dataset['review'].apply(preprocess_text)
dataset['label'] = dataset['sentiment'].map({'positive':1, 'negative':0})

review_texts = dataset['cleaned_review'].values
labels = dataset['label'].values

# Tokenizer ile indeksleme
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_texts)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Metinleri dizilere çevir ve pad et
max_sequence_length = 100
sequences = tokenizer.texts_to_sequences(review_texts)
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = np.array(labels)

# Eğitim/Test ayrımı
X_train_padded, X_test_padded, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# GloVe vektörlerini yükle
embedding_dim = 100
glove_path = 'glove.6B.100d.txt'
glove_embedding_matrix = load_glove_embeddings(glove_path, word_index, embedding_dim=embedding_dim, vocab_size=vocab_size)

# Modeli oluştur
model_glove = create_lstm_model(vocab_size, embedding_dim, max_sequence_length, embedding_matrix=glove_embedding_matrix, trainable=True)
print(model_glove.summary())

# Modeli eğit
model_glove.fit(X_train_padded, y_train, batch_size=64, epochs=5, validation_split=0.2)

# Modeli değerlendir
loss_glove, acc_glove = model_glove.evaluate(X_test_padded, y_test, verbose=0)
print(f"GloVe Embedding Test Accuracy: {acc_glove:.4f}")

# Modeli kaydet
model_glove.save('models/lstm/lstm_model_128_glove.h5')
print("Model kaydedildi: models/lstm/lstm_model_128_glove.h5")




None
Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 373ms/step - accuracy: 0.6950 - loss: 0.5632 - val_accuracy: 0.8583 - val_loss: 0.3277
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 350ms/step - accuracy: 0.8686 - loss: 0.3242 - val_accuracy: 0.8769 - val_loss: 0.2932
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 330ms/step - accuracy: 0.9190 - loss: 0.2128 - val_accuracy: 0.8755 - val_loss: 0.3137
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 352ms/step - accuracy: 0.9592 - loss: 0.1128 - val_accuracy: 0.8730 - val_loss: 0.3722
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 306ms/step - accuracy: 0.9754 - loss: 0.0692 - val_accuracy: 0.8711 - val_loss: 0.4149




GloVe Embedding Test Accuracy: 0.8727
Model kaydedildi: models/lstm/lstm_model_128_glove.h5
