In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import enum
import re
import nltk 

from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

In [2]:
#Veri setinin yüklenmesi ve örnek veri
dataset = pd.read_csv("data.csv",delimiter=";",header=None,names=["Review","Rating"])
print("Verisetinde {} adet cümle mevcut.".format(len(dataset)))
dataset.head()

Verisetinde 50000 adet cümle mevcut.


Unnamed: 0,Review,Rating
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [3]:
#Veri ön işleme
#Ön işleme öncesi örnek cümle
print(dataset['Review'].values[0],"\n\n")

#stopwords ön hazırlık
WPT = nltk.WordPunctTokenizer()
stop_word_list = nltk.corpus.stopwords.words('english')
stop_word_list.append("'ll")
stop_word_list.append("n't")
stop_word_list.append("br")
stop_word_list.append(".")
stop_word_list.append(",")
stop_word_list.append("<")
stop_word_list.append(">")
stop_word_list.append("/")
#print(stop_word_list)

def token(values):
    words = nltk.tokenize.word_tokenize(values)
    filtered_words = [word for word in words if word not in stop_word_list]
    not_stopword_doc = " ".join(filtered_words)
    return not_stopword_doc

#stemming ön hazırlık
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

#Stemmer nesnesi oluşturulması
porter = PorterStemmer() 
def stemSentence(sentence):
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

#lemmatization ön hazırlık
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

wordnet_lemmatizer = WordNetLemmatizer()
# Lemmatize with POS Tag
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def lemmaSentence(sentence):
    token_words=word_tokenize(sentence)
    lemma_sentence=[]
    for word in token_words:
        lemma_sentence.append(wordnet_lemmatizer.lemmatize(word,pos='v'))
        lemma_sentence.append(" ")
    return "".join(lemma_sentence)

#büyük harflerin küçük harfe çevrilmesi
dataset['Review'] = dataset['Review'].apply(lambda x: x.lower())

#stopwordlerin temizlenmesi
dataset['Review'] = dataset['Review'].apply(lambda x: token(x))

#Lemmatization işlemi
dataset['Review'] = dataset['Review'].apply(lambda x: lemmaSentence(x))

#stemming işlemi
dataset['Review'] = dataset['Review'].apply(lambda x: stemSentence(x))
                              
data = dataset['Review'].values.tolist()
target = dataset['Rating'].values.tolist()
target = np.array(target)

#Ön işleme sonrası aynı cümle
print(data[0])

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fac

In [4]:
#Cümlelerin içinde geçen kelimelerden 10000 kelimelik bir sözlük oluşturuluyor.
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(data)
#tokenizer.word_index

#Cümleler sayılara dönüştürülüyor
data_tokens = tokenizer.texts_to_sequences(data)

#Cümlelerin önceki ve sonraki hallerinin görüntülenmesi
IDX = 0
print("Öncesi: {}".format(data[IDX]))
print("Sonrası: {}".format(np.array(data_tokens[IDX])))

Öncesi: one review mention watch 1 oz episod hook right exactli happen me . first thing strike oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word . call oz nicknam give oswald maximum secur state penitentari focu mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani .. aryan muslim gangsta latino christian italian irish .... scuffl death star dodgi deal shadi agreement never far away . would say main appeal show due fact go show would dare forget pretti pictur paint mainstream audienc forget charm forget romanc ... oz mess around first episod ever saw strike nasti surreal could say readi watch develop tast oz get accustom high level graphic violenc violenc injustic ( crook guard sell nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi ) watch oz may becom comfort uncomfor

In [5]:
#RNN'e girdileri vermeden önce tamamının aynı boyutta olması gerekli. Bu sebeple aşağıdaki matematiksel işlemleri yapıyoruz.

num_tokens = np.array([len(tokens) for tokens in data_tokens])
#print(np.mean(num_tokens))
#print(np.std(num_tokens))
#print(np.max(num_tokens))
#print(np.min(num_tokens))

max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens) # np.std = standart sapma
max_tokens = int(max_tokens)
print(max_tokens)
#Verinin ne kadarını bu kapsama aldığımızın ölçülmesi
print("%", round(np.sum(num_tokens < max_tokens) / len(num_tokens) * 100, 2))

293
% 94.52


In [6]:
#Padding işlemi. Bulunan uzunluk değerine göre cümlelerin yeniden düzenlenmesi. Kısa olanların başına sıfır eklenmesi.
#Uzun olanlardan baştan silme yapılması
data_pad = pad_sequences(data_tokens, maxlen=max_tokens)

In [7]:
#Modeli oluşuran fonksiyon, KerasClassifier oluşturmak için gerekli
def create_model():
    #RNN oluşturma, ardışık bir model
    model = Sequential()
    
    #her kelimeye karşılık gelen 50 uzunluğunda bir vektör oluşturulur. (Embedding matrisi)
    embedding_size = 50
    
    #matris kelime sayısı ve embedding büyüklüğünde olacak, yani 10bine 50 uzunluğunda 
    model.add(Embedding(input_dim=num_words,
                        output_dim=embedding_size,
                        input_length=max_tokens,
                        name='embedding_layer'))
    #LSTM layerlerinin eklenmesi
    # 16 nöronlu LSTM (16 outputlu , return_sequences=True demek output'un tamamını ver demek)
    model.add(GRU(units=16, return_sequences=True))
    # 8 nöronlu LSTM (8 outputlu , return_sequences=True demek output'un tamamını ver demek)
    model.add(GRU(units=8, return_sequences=True))
    # 4 nöronlu LSTM (4 outputlu , return_sequences=False yani default değer, tek bir output verecek)
    model.add(GRU(units=4))
    # Tek bir nörondan oluşan output layer'ı
    model.add(Dense(1, activation='sigmoid'))

    #modelin derlenmesi 
    #iki sınıf olduğu için loss fonksiyonu olarak binary_crossentropy 
    #modelin başarısını görmek için accuracy metrics
    #optimizasyon algoritması
    optimizer = Adam(lr=1e-3)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy','Precision','Recall',])
    return model

In [8]:
# Modelin değerlendirilmesi
seed=0
#hold-out 
x_train, x_test, y_train, y_test = train_test_split(data_pad, target, test_size=0.1, random_state=seed)
model=create_model()
model.fit(x_train, y_train, epochs=5, batch_size=256, verbose=0)

#Evaluate fonksiyonu yalnızca accuracy ve loss değerini döndürür
result = model.evaluate(x_test, y_test)

import statistics
dizi = [result[2],result[3]]

print("Accuracy= ",result[1])
print("Precision= ",result[2])
print("Recall= ",result[3])
print("F-measure= ",statistics.harmonic_mean(dizi))

Accuracy=  0.8876000046730042
Precision=  0.8783621191978455
Recall=  0.8941560983657837
F-measure=  0.8861887428539822


In [9]:
# Modelin oluşturulması
# Model eğitimi, bir defa eğitimden geçmesi -> epoch , batch_size -> 256'şar 256'şar beslenecek.
model = KerasClassifier(build_fn=create_model, epochs=5, batch_size=256, verbose=0)

In [10]:
#k-fold cross validation
scoring = ['accuracy', 'precision','recall','f1']
kfold = KFold(n_splits=10, shuffle=False, random_state=seed)
results = cross_validate(model, data_pad, target, cv=kfold, n_jobs=-1, scoring=scoring)

print("Accuracy= ",results['test_accuracy'].mean()," Standart Deviation= ", results['test_accuracy'].std())
print("Precision= ",results['test_precision'].mean()," Standart Deviation= ", results['test_precision'].std())
print("Recall= ",results['test_recall'].mean()," Standart Deviation= ", results['test_recall'].std())
print("F-measure= ",results['test_f1'].mean()," Standart Deviation= ", results['test_f1'].std())

Accuracy=  0.8865999999999999  Standart Deviation=  0.004955804677345533
Precision=  0.8861867033498539  Standart Deviation=  0.01408957500144965
Recall=  0.8876150295862228  Standart Deviation=  0.018960198608535428
F-measure=  0.8866258515354127  Standart Deviation=  0.005897973810504174


In [11]:
#stratified k-fold validation
skfold = StratifiedKFold(n_splits=10, shuffle=False, random_state=seed)
results = cross_validate(model, data_pad, target, cv=skfold, n_jobs=-1, scoring=scoring)

print("Accuracy= ",results['test_accuracy'].mean()," Standart Deviation= ", results['test_accuracy'].std())
print("Precision= ",results['test_precision'].mean()," Standart Deviation= ", results['test_precision'].std())
print("Recall= ",results['test_recall'].mean()," Standart Deviation= ", results['test_recall'].std())
print("F-measure= ",results['test_f1'].mean()," Standart Deviation= ", results['test_f1'].std())

Accuracy=  0.8847799999999999  Standart Deviation=  0.004423075852842682
Precision=  0.8818924860046815  Standart Deviation=  0.013301285355943627
Recall=  0.8891599999999998  Standart Deviation=  0.019947491070307567
F-measure=  0.8852199292697767  Standart Deviation=  0.005404550231402596


In [None]:
"""
#leave-one-out cross validation
#Don’t Use LOOCV: Large datasets or costly models to fit(e.g. neural networks).
loo = LeaveOneOut()
results = cross_val_score(model, data_pad, target, cv=loo, n_jobs=-1)
print(results.mean())
"""