This notebook describes a classifier for whether the Turkish “de/da” and “ki” suffixes should be
separated or not.

For example, in “Öğrenciler de geldi” the “de” is a conjunction and is written separately,
but in “Öğrencilerde gelişme var.” it is a locative suffix and is attached to the word.

# Imports
*   Import necessary libraries



In [1]:
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np  # Import NumPy for array manipulation

# Providing the Data

In [2]:
# Load the annotated corpus into memory, one raw line per list entry.
with open("annotated_sentences_for_de.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Split every line on whitespace and keep only the lines that have at least
# two fields: the first field becomes the model input text, the last its tag.
# NOTE(review): because only the first whitespace-separated field is kept,
# every entry of `sentences` is a single token rather than a full sentence --
# confirm this matches the layout of the annotation file.
field_rows = [line.strip().split() for line in lines]
field_rows = [fields for fields in field_rows if len(fields) >= 2]
sentences = [fields[0] for fields in field_rows]
tag_names = [fields[-1] for fields in field_rows]

# Translate the string tags into integer class ids and store them as a NumPy
# array. A tag that is missing from the mapping raises KeyError.
label_mapping = {"O": 0, "B-ERR": 1}
labels = np.array([label_mapping[tag] for tag in tag_names])

# Build the vocabulary from the input texts; `total_words` is the number of
# distinct words plus one.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
total_words = 1 + len(tokenizer.word_index)

# Encode each text as a list of word indices, then pad to a common length.
sequences = tokenizer.texts_to_sequences(sentences)
padded_sequences = pad_sequences(sequences)

# Hold out 20% of the examples for validation; the seed keeps the split fixed.
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)


# Define and Compile the Neural Network

In [None]:
# Training hyperparameters used by this cell and by the training cell.
batch_size = 16
# NOTE(review): 0.2 is far above Adam's documented default learning rate of
# 0.001 -- confirm this value is intentional.
learning_rate = 0.2
hidden_size = 100

# Every padded input row has this many positions.
sequence_length = len(padded_sequences[0])

# Embedding -> two stacked LSTMs -> single sigmoid unit for binary output.
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(
        input_dim=total_words,
        output_dim=16,
        input_length=sequence_length,
    )
)
# The first LSTM returns its full output sequence so that a second LSTM can
# be stacked on top of it.
model.add(tf.keras.layers.LSTM(hidden_size, return_sequences=True))
model.add(tf.keras.layers.LSTM(hidden_size))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy loss, optimised with Adam at the learning rate above;
# accuracy is reported during training.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Training the Neural Network
Takes approximately 40 minutes for 20 epochs, about 120 seconds per epoch (on a T4 GPU).

In [4]:
# Fit for 20 epochs with the configured batch size, evaluating on the
# held-out validation split after every epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=20,
    validation_data=(X_val, y_val),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x78bf1e5c9c60>

# Save the Model

In [5]:
import os
import pickle

# Where the trained Keras model is written (.h5 file).
model_save_path = "/content/custom_model/model.h5"

# Where the fitted tokenizer is written; it is needed later to encode new
# text with the same vocabulary the model was trained on.
tokenizer_save_path = "/content/custom_model/tokenizer.pickle"

# Create the output directories up front so that neither write below depends
# on the other one having created the folder as a side effect.
for artifact_path in (model_save_path, tokenizer_save_path):
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

# Persist the model.
model.save(model_save_path)

# Persist the tokenizer next to the model.
with open(tokenizer_save_path, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


  saving_api.save_model(


# Test the Model

In [33]:
from keras.models import load_model

# Restore the trained model from disk.
loaded_model = load_model(model_save_path)

# Restore the tokenizer that was pickled alongside the model.
# NOTE: pickle.load executes code embedded in the file; only load pickles
# produced by this notebook.
with open(tokenizer_save_path, 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)

# Example input to classify.
new_sentence = "Emekli Albay Yıldırım Taşyumruk birazda mesleği gereği ömrü boyu sert ve otoriter bir baba olmuştur."
print(f"Input Sentence: {new_sentence}")

# Encode the sentence with the restored vocabulary and pad it to the same
# length as the training inputs.
# NOTE(review): with pad_sequences' default settings, inputs longer than
# `maxlen` are truncated from the front -- confirm the model actually sees
# the word of interest.
new_sequence = loaded_tokenizer.texts_to_sequences([new_sentence])
new_padded_sequence = pad_sequences(new_sequence, maxlen=len(padded_sequences[0]))

# predict() yields one row per input; take the single score of the single row.
prediction_batch = loaded_model.predict(new_padded_sequence)
predicted_value = prediction_batch[0][0]

# Decision threshold applied to the raw sigmoid score.
# NOTE(review): this is a hand-picked value of roughly 3.45e-9, not the usual
# 0.5 -- confirm how it was chosen.
threshold = 0.0034501181318774e-06

# Scores above the threshold are reported as an error.
if predicted_value > threshold:
    predicted_class = "Error DETECTED! It is a conjunction! "
else:
    predicted_class = "Correct Sentence :)"

print(f"Predicted class: {predicted_class}")



Input Sentence: Emekli Albay Yıldırım Taşyumruk birazda mesleği gereği ömrü boyu sert ve otoriter bir baba olmuştur.
Predicted class: Error DETECTED! It is a conjunction! 


In [32]:
import string

from keras.models import load_model

# Restore the trained model and the fitted tokenizer from disk.
loaded_model = load_model(model_save_path)

# NOTE: pickle.load executes code embedded in the file; only load pickles
# produced by this notebook.
with open(tokenizer_save_path, 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)

# Read the sentences to check, one per line. Trailing newlines are removed
# and blank lines are skipped so they do not show up as empty "sentences".
test_sentences_path = "test_sentences_de.txt"

with open(test_sentences_path, 'r', encoding='utf-8') as file:
    test_sentences = [line.strip() for line in file if line.strip()]

# Decision threshold applied to the raw sigmoid score.
# NOTE(review): hand-picked value (about 1.19e-3) -- confirm how it was chosen.
threshold = 0.00119469075778033584

# Word endings that mark a word as a candidate for inspection.
candidate_endings = ("de", "da", "te", "ta")

# Characters removed from both ends of a token before the ending check, so
# that e.g. a token followed by "," or "." is still recognised.
edge_punctuation = string.punctuation + "“”‘’…"

# Padding length is the same for every word, so compute it once.
max_len = len(padded_sequences[0])

for sentence in test_sentences:
    print(f"Input sentence: {sentence}")

    # Inspect every whitespace-separated token of the sentence.
    for token in sentence.split():
        word = token.strip(edge_punctuation)

        # Only words with one of the candidate endings are scored; the check
        # is case-insensitive.
        if not word.lower().endswith(candidate_endings):
            continue
        print(f"Inspecting Word: {word}")

        # Encode the word with the restored vocabulary.
        new_sequence = loaded_tokenizer.texts_to_sequences([word])

        # An empty sequence means nothing in the word is in the tokenizer's
        # vocabulary; padding it would feed the model nothing but padding,
        # so the score would not reflect the word at all.
        if not new_sequence[0]:
            print(f"Skipping (not in vocabulary): {word}")
            continue

        new_padded_sequence = pad_sequences(new_sequence, maxlen=max_len)

        # One input row -> one score; verbose=0 avoids a progress bar per word.
        predicted_value = loaded_model.predict(new_padded_sequence, verbose=0)[0][0]
        print(f"Predicted value: {predicted_value}")

        # Scores above the threshold are reported as suspicious.
        if predicted_value > threshold:
            print(f"Suspicious Word: {word}")

    # Separator between the reports of consecutive sentences.
    print("===")


Input sentence: Evde yağ kalmamış.

Inspecting Word: Evde


KeyError: 'Evde'