This notebook describes a classifier for whether the Turkish “de/da” and “ki” suffixes should be
separated or not.

For example, in “Öğrenciler de geldi” it is separated, but in “Öğrencilerde gelişme
var.” it is not separated.

# Imports
*   Import necessary libraries



In [11]:
import numpy as np
import tensorflow
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Providing the Data

In [12]:
# Read the annotated file
with open("annotated_sentences_for_de.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Data pre-processing
sentences = []
labels = []

for line in lines:
    parts = line.strip().split()
    if len(parts) >= 2:
        sentences.append(parts[0])
        labels.append(parts[-1])


# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
vocab_size = len(tokenizer.word_index) + 1

# Convert sentences and labels to sequences
X = tokenizer.texts_to_sequences(sentences)
X = pad_sequences(X)
y = np.array([1 if 'B-ERR' in label else 0 for label in labels])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Define and Compile the Neural Network

In [13]:
# Build the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=X.shape[1]))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training the Neural Network

In [14]:
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.03977125138044357, Test Accuracy: 0.9848034977912903


# Save the Model

In [15]:
import pickle

# Specify the path where you want to save the model
model_save_path = "/content/custom_model/model2.h5"

# Save the model
model.save(model_save_path)

# Optionally, you can also save the tokenizer if you need it for later use
tokenizer_save_path = "/content/custom_model/tokenizer2.pickle"
with open(tokenizer_save_path, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


  saving_api.save_model(


# Test the Model

In [20]:
from keras.models import load_model

# Load the model
loaded_model2 = load_model(model_save_path)

# Optionally, load the tokenizer
with open(tokenizer_save_path, 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)


# Load and predict on sentences from "test_sentences_de.txt"
test_sentences_path = "test_sentences_de.txt"

with open(test_sentences_path, 'r', encoding='utf-8') as file:
    test_sentences = file.readlines()

treshold = 3.5015692567696988e-09

# Predictions
for sentence_to_predict in test_sentences:
  print(f'\nInput Sentence: {sentence_to_predict}')
  X_new = tokenizer.texts_to_sequences([sentence_to_predict])
  X_new = pad_sequences(X_new, maxlen=X.shape[1])

  prediction = model.predict(X_new)[0][0]
  prediction_label = 'Error Detected! Separate it!' if prediction < treshold else 'Correct :)'

  print(f'Prediction: {prediction_label} (Probability: {prediction})')


Input Sentence: Evde yağ kalmamış.

Prediction: Correct :) (Probability: 5.127697022544453e-08)

Input Sentence: Bende gelmek istiyordum.

Prediction: Error Detected! Separate it! (Probability: 1.763664103648921e-09)

Input Sentence: İnsanın bazen çıkıp gezmeside gerekiyor.

Prediction: Error Detected! Separate it! (Probability: 2.192735326644879e-09)

Input Sentence: Anahtarları piknikte kaybetmiş.

Prediction: Correct :) (Probability: 4.3529911408768385e-07)

Input Sentence: Babam geldiğinde gidelim.

Prediction: Correct :) (Probability: 3.5015692567696988e-09)

Input Sentence: Sadece hastalandığımda arıyorsun.

Prediction: Error Detected! Separate it! (Probability: 1.544233407813067e-09)

Input Sentence: Baba benide okula gönder.

Prediction: Correct :) (Probability: 1.6814217573823953e-08)

Input Sentence: Bir şeydende eksik kalsan olmaz mı?

Prediction: Error Detected! Separate it! (Probability: 2.5326956087923236e-09)

Input Sentence: Saatleri uyarsa bende gelebilirim.

Predicti