In [90]:
from string import punctuation
from os import listdir
from nltk.corpus import stopwords
from pickle import dump
import pydot
import pydotplus
import graphviz
import re
import string

In [91]:

def load_doc(filename):
    """Read a UTF-8 text file and return its entire contents.

    Args:
        filename: Path of the file to read.

    Returns:
        The file's text as a single ``str``.
    """
    # The context manager closes the handle even when read() raises
    # (for example on a decoding error), which a manual close() would skip.
    with open(filename, 'r', encoding="utf-8") as file:
        return file.read()

In [92]:
def clean_doc(doc):
    """Reduce a raw document to one space-separated string of cleaned tokens.

    The text is split on whitespace and ASCII punctuation characters are
    deleted from every token. A token survives only if it is purely
    alphabetic, is not in NLTK's Turkish stop-word list (compared as-is,
    no case folding is applied) and is longer than one character.

    Args:
        doc: Raw document text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    strip_punct = str.maketrans('', '', punctuation)
    turkish_stops = set(stopwords.words('turkish'))
    kept = []
    for raw in doc.split():
        word = raw.translate(strip_punct)
        if not word.isalpha():
            continue
        if word in turkish_stops:
            continue
        if len(word) <= 1:
            continue
        kept.append(word)
    return ' '.join(kept)

In [93]:
def clean_text(doc):
    """Split a raw document into a list of cleaned tokens.

    Every whitespace-separated piece has its ASCII punctuation removed with a
    regular expression. A piece is kept only if it is purely alphabetic, is
    not in NLTK's Turkish stop-word list (compared as-is, no case folding)
    and has more than one character.

    Args:
        doc: Raw document text.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    punct_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    turkish_stops = set(stopwords.words('turkish'))
    stripped = (punct_pattern.sub('', piece) for piece in doc.split())
    return [
        word
        for word in stripped
        if word.isalpha() and word not in turkish_stops and len(word) > 1
    ]

In [94]:
# Sanity check: load one positive sample, clean it, and show the first
# 100 tokens to eyeball what the cleaning step keeps.
filename = 'emotion_dataset_v1/pos/cv004_11630.txt'
text = load_doc(filename)
tokens = clean_text(text)
print(tokens[:100])

['Ürün', 'gayet', 'güzel', 'ürün', 'kullanış', 'lı', 'Beklentiyi', 'karşılıyor', 'Ara', 'plastik', 'biraz', 'daha', 'uzun', 'olabilirdi', 'Hemen', 'etiketi', 'çıkıyor', 'ürün', 'şimdilik', 'güzel', 'bantlar', 'çıkmazsa', 'Güzel', 'ürün', 'Ürünü', 'beğendim', 'tavsiye', 'ederim', 'güzel', 'alışveriş', 'güzel', 'ürün', 'teşekkürler', 'ürün', 'resmen', 'çöp', 'boşa', 'para', 'vermeyin', 'bantlat', 'yapışmıyor', 'bile', 'aylık', 'kızım', 'dokunmadan', 'bantlar', 'kendiliğinden', 'düştü', 'parama', 'yazık', 'oldu', 'Ürün', 'gayet', 'güzel', 'yaşında', 'yeğenim', 'için', 'almıştım', 'şuan', 'çekmece', 'dolapları', 'açamıyor', 'yapışkanı', 'silikonu', 'güzel', 'fiyatıda', 'çok', 'iyi', 'Super', 'tam', 'resimde', 'ki', 'gibi', 'Super', 'hızlı', 'geldi', 'tam', 'resimde', 'ki', 'gibi', 'Güzel', 'kaliteli', 'Çok', 'Memnunum', 'Güzel', 'hafif', 'bir', 'ürün', 'Pofuduk', 'bir', 'mont', 'istiyordum', 'Nike', 'veya', 'Adidas', 'tarzı', 'bu', 'mont', 'daha', 'hafifAma']


In [95]:
def process_docs(directory, is_trian):
    """Load and clean the files of one directory that belong to one split.

    Files whose name starts with ``'cv9'`` form the held-out split. When
    ``is_trian`` is truthy those files are skipped and every other file is
    returned; when it is falsy only those files are returned.

    Args:
        directory: Folder whose entries are read.
        is_trian: Split selector (the spelling is kept because it is part of
            the signature callers use).

    Returns:
        A list with one cleaned string (the result of ``clean_doc``) per
        selected file, in ``listdir`` order.
    """
    want_train = bool(is_trian)
    cleaned = []
    for name in listdir(directory):
        held_out = name.startswith('cv9')
        # A file is skipped exactly when it is held-out and the training
        # split was requested, or not held-out and the other split was.
        if want_train == held_out:
            continue
        cleaned.append(clean_doc(load_doc(directory + '/' + name)))
    return cleaned

In [96]:

def save_dataset(dataset, filename):
    """Pickle ``dataset`` to ``filename`` and print a confirmation line.

    Args:
        dataset: Any picklable object to store.
        filename: Destination path; it is opened in ``'wb'`` mode, so an
            existing file is overwritten.
    """
    # The context manager flushes and closes the file before the
    # confirmation is printed; the bare open() left the handle dangling.
    with open(filename, 'wb') as out_file:
        dump(dataset, out_file)
    print('Saved: %s' % filename)

In [99]:
def train_pkl():
    """Build the training split and pickle it to ``train.pkl``.

    Negative documents come first (label 0), followed by positive documents
    (label 1). The saved object is the two-element list ``[trainX, trainy]``.
    """
    negative_docs = process_docs('emotion_dataset_v1/neg', True)
    positive_docs = process_docs('emotion_dataset_v1/pos', True)
    trainX = negative_docs + positive_docs
    # One label per document actually loaded. A fixed count of 10 per class
    # silently misaligns documents and labels as soon as a folder holds a
    # different number of selected files.
    trainy = [0] * len(negative_docs) + [1] * len(positive_docs)
    save_dataset([trainX, trainy], 'train.pkl')
def Test_pkl():
    """Build the evaluation split and pickle it to ``test.pkl``.

    Negative documents come first (label 0), followed by positive documents
    (label 1). The saved object is the two-element list ``[testX, testY]``.
    """
    # NOTE(review): both calls pass is_trian=True, i.e. the same selection
    # rule train_pkl uses, and read from 'txt_sentoken' rather than
    # 'emotion_dataset_v1'. Confirm this is the intended held-out data.
    negative_docs = process_docs('txt_sentoken/neg', True)
    positive_docs = process_docs('txt_sentoken/pos', True)
    testX = negative_docs + positive_docs
    # One label per document actually loaded, instead of a fixed 10 per
    # class, so documents and labels always stay aligned.
    testY = [0] * len(negative_docs) + [1] * len(positive_docs)
    save_dataset([testX, testY], 'test.pkl')
# Materialise both pickles on disk; the cells below read train.pkl and
# test.pkl instead of re-processing the raw text files.
train_pkl()
Test_pkl()

Saved: train.pkl
Saved: test.pkl


In [100]:
from pickle import load
import keras
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate

In [101]:
def load_dataset(filename):
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of a pickle file.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # come from a trusted source.
    # The context manager closes the handle that a bare open() left dangling.
    with open(filename, 'rb') as in_file:
        return load(in_file)

In [102]:
def create_tokenizer(lines):
    """Return a new ``Tokenizer`` that has been fitted on ``lines``.

    Args:
        lines: The texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [103]:
def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        The token count of the longest string, or 0 when ``lines`` is empty
        (a bare ``max()`` over an empty sequence raises ``ValueError``).
    """
    return max((len(s.split()) for s in lines), default=0)

In [104]:

def encode_text(tokenizer, lines, length):
    """Integer-encode ``lines`` and pad every sequence to ``length``.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        lines: Texts to encode.
        length: Value passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` called with ``padding='post'``.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

In [105]:
def _build_channel(length, vocab_size, kernel_size):
    """Build one input -> embedding -> conv -> dropout -> pool -> flatten branch.

    Returns:
        A ``(input_tensor, flattened_output)`` pair for the branch.
    """
    channel_input = Input(shape=(length,))
    embedded = Embedding(vocab_size, 100)(channel_input)
    conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedded)
    dropped = Dropout(0.5)(conv)
    pooled = MaxPooling1D(pool_size=2)(dropped)
    return channel_input, Flatten()(pooled)


def define_model(length, vocab_size):
    """Build and compile the three-channel CNN classifier.

    Each channel has its own input and embedding and differs only in the
    ``Conv1D`` kernel size (4, 6 and 8). The flattened channel outputs are
    concatenated and fed through a 10-unit ReLU layer into a single sigmoid
    output. The model is compiled with binary cross-entropy and Adam
    (learning rate 0.01).

    Side effects: prints the model summary and writes the architecture
    diagram to ``multichannel.png``.

    Args:
        length: Number of token ids in every input sequence.
        vocab_size: First argument of each ``Embedding`` layer.

    Returns:
        The compiled ``Model`` with three inputs and one output.
    """
    # Channels are created in the same order as before (kernel 4, 6, 8), so
    # the order of the model's inputs is unchanged for callers.
    channels = [_build_channel(length, vocab_size, size) for size in (4, 6, 8)]
    inputs = [channel_input for channel_input, _ in channels]
    flats = [flat for _, flat in channels]

    merged = concatenate(flats)

    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=inputs, outputs=outputs)

    opt = keras.optimizers.Adam(learning_rate=0.01)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

    # summary() prints the table itself and returns None; wrapping it in
    # print() emitted a stray "None" line after the table.
    model.summary()
    plot_model(model, show_shapes=True, to_file='multichannel.png')
    return model

In [106]:
# Load the pickled training split: the cleaned documents and their labels.
trainLines, trainLabels = load_dataset('train.pkl')
# Fit the tokenizer on the training documents only.
tokenizer = create_tokenizer(trainLines)
# Token count of the longest training document; used below as the padded
# sequence length.
length = max_length(trainLines)
# Vocabulary size: number of entries in the tokenizer's word index, plus one.
vocab_size = len(tokenizer.word_index) + 1
print('Max document length: %d' % length)
print('Vocabulary size: %d' % vocab_size)
# Integer-encode the documents and pad them to the common length.
trainX = encode_text(tokenizer, trainLines, length)
print(trainX.shape)

Max document length: 40587
Vocabulary size: 53947
(20, 40587)


In [107]:
# Build the three-input model for the sequence length and vocabulary size
# computed above.
model = define_model(length, vocab_size)
# Train it; the same encoded matrix is fed to each of the three inputs and
# the label list is converted to an array first.
model.fit([trainX,trainX,trainX], array(trainLabels), epochs=5, batch_size=15)
# Persist the trained model so the evaluation cells can reload it.
model.save('model.h5')

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           (None, 40587)        0                                            
__________________________________________________________________________________________________
input_17 (InputLayer)           (None, 40587)        0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           (None, 40587)        0                                            
__________________________________________________________________________________________________
embedding_16 (Embedding)        (None, 40587, 100)   5394700     input_16[0][0]                   
____________________________________________________________________________________________

In [108]:
# Load both pickled splits: cleaned documents and their labels.
trainLines, trainLabels = load_dataset('train.pkl')
testLines, testLabels = load_dataset('test.pkl')
# Fit the tokenizer on the training documents only; the test documents are
# encoded with this same tokenizer below.
tokenizer = create_tokenizer(trainLines)
# Token count of the longest training document; both splits are padded to it.
length = max_length(trainLines)
# Vocabulary size: number of entries in the tokenizer's word index, plus one.
vocab_size = len(tokenizer.word_index) + 1
print('Max document length: %d' % length)
print('Vocabulary size: %d' % vocab_size)
# Integer-encode and pad both splits to the same length.
trainX = encode_text(tokenizer, trainLines, length)
testX = encode_text(tokenizer, testLines, length)
print(trainX.shape, testX.shape)

Max document length: 40587
Vocabulary size: 53947
(20, 40587) (20, 40587)


In [109]:
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model

In [110]:
# load a clean dataset
def load_dataset(filename):
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of a pickle file.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # come from a trusted source.
    # The context manager closes the handle that a bare open() left dangling.
    with open(filename, 'rb') as in_file:
        return load(in_file)

In [111]:
# fit a tokenizer
def create_tokenizer(lines):
    """Return a new ``Tokenizer`` that has been fitted on ``lines``.

    Args:
        lines: The texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [112]:
# calculate the maximum document length
def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        The token count of the longest string, or 0 when ``lines`` is empty
        (a bare ``max()`` over an empty sequence raises ``ValueError``).
    """
    return max((len(s.split()) for s in lines), default=0)

In [113]:
# encode a list of lines
def encode_text(tokenizer, lines, length):
    """Integer-encode ``lines`` and pad every sequence to ``length``.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        lines: Texts to encode.
        length: Value passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` called with ``padding='post'``.
    """
    # Integer-encode first, then pad the encoded sequences at the end.
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

In [114]:
# Reload both pickled splits: cleaned documents and their labels.
trainLines, trainLabels = load_dataset('train.pkl')
testLines, testLabels = load_dataset('test.pkl')
# create tokenizer (fitted on the training documents only)
tokenizer = create_tokenizer(trainLines)
# calculate max document length
length = max_length(trainLines)
print('Max document length: %d' % length)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)
# encode data
trainX = encode_text(tokenizer, trainLines, length)
testX = encode_text(tokenizer, testLines, length)
# load the model
model = load_model('model.h5')
# Labels are wrapped in array(...) exactly as in the fit call: a plain
# Python list of ints is not accepted as targets by every Keras version.
# evaluate model on training dataset
_, acc = model.evaluate([trainX,trainX,trainX], array(trainLabels), verbose=0)
print('Train Accuracy: %.2f' % (acc*100))
# evaluate model on test dataset
_, acc = model.evaluate([testX,testX,testX], array(testLabels), verbose=0)
print('Test Accuracy: %.2f' % (acc*100))

Max document length: 40587
Vocabulary size: 53947
Train Accuracy: 100.00
Test Accuracy: 50.00
