In [31]:
from nltk.stem import SnowballStemmer
from TurkishStemmer import TurkishStemmer

# One stemmer per language; both are reused by the cells further down.
# English: NLTK's Snowball stemmer, followed by a quick smoke call.
english_stemmer = SnowballStemmer("english")
english_stemmer.stem("running")  # → "run"

# Turkish: the TurkishStemmer package, followed by a quick smoke call on a
# heavily suffixed word.
turkish_stemmer = TurkishStemmer()
turkish_stemmer.stem("doktoruymuşsunuz")

# Parked design note, kept as a bare string literal rather than executable code:
# it sketches replacing the stemmers with spaCy (English lemmas) and zeyrek
# (Turkish morphological analysis). Nothing inside the string runs.
# Because the string is the last expression of the cell, the notebook echoes its
# repr as the cell output.
"""
In the future, we should use a proper stemmer, like this:
import spacy

# Load the English model
# TODO, we can improve this by actually using the whole english sentence as context
english_nlp = spacy.load('en_core_web_sm')

def english_dict_entry(word):
    output = english_nlp(word)
    if len(output) != 1:
        return None
    return output[0].lemma_

# load the turkish one
import zeyrek

analyzer = zeyrek.MorphAnalyzer()
analysis = analyzer.analyze('benim')
for parse in analysis:
    print(parse)
"""

"\nIn the future, we should use a proper stemmer, like this:\nimport spacy\n\n# Load the English model\n# TODO, we can improve this by actually using the whole english sentence as context\nenglish_nlp = spacy.load('en_core_web_sm')\n\ndef english_dict_entry(word):\n    output = english_nlp(word)\n    if len(output) != 1:\n        return None\n    return output[0].lemma_\n\n# load the turkish one\nimport zeyrek\n\nanalyzer = zeyrek.MorphAnalyzer()\nanalysis = analyzer.analyze('benim')\nfor parse in analysis:\n    print(parse)\n"

In [32]:
# read input file
from collections import Counter

# Turkish stem -> Counter of the English stems it was aligned with.
# Kept under its own name so the final `output_dictionary` below only ever has
# one shape (Turkish stem -> {English word: frequency}).
turkish_to_english_stem_counts = {}

# English stem -> Counter of the surface forms that produced that stem; used
# below to present a readable word instead of a bare stem.
english_stems_to_forms = {}

# Each whitespace-separated token in the file is "<turkish word><sep><english word>".
# The encoding is stated explicitly so decoding does not depend on the platform
# default (assumes the alignment file is UTF-8 — TODO confirm).
with open("tatoeba_complete.tr-en.aligned.words", "rt", encoding="utf-8") as input_file:
    # Iterate the file lazily; readlines() would load the whole corpus at once.
    for line in input_file:
        for word_alignment in line.split():
            from_word, to_word = word_alignment.split("<sep>")
            from_stem = turkish_stemmer.stem(from_word)
            to_stem = english_stemmer.stem(to_word)
            # Test for a missing stem *before* lowercasing: calling .lower() on
            # None would raise AttributeError and the check could never fire.
            if from_stem is None or to_stem is None:
                continue
            from_stem = from_stem.lower()
            to_stem = to_stem.lower()
            # Alignments to "the" are dropped from the dictionary.
            if to_stem == "the":
                continue

            # Count the stem-level translation.
            if from_stem not in turkish_to_english_stem_counts:
                turkish_to_english_stem_counts[from_stem] = Counter()
            turkish_to_english_stem_counts[from_stem][to_stem] += 1
            # Remember which surface form produced this English stem.
            if to_stem not in english_stems_to_forms:
                english_stems_to_forms[to_stem] = Counter()
            english_stems_to_forms[to_stem][to_word] += 1

# Replace every English stem by its most frequent surface form, keeping the
# translations ordered from most to least frequent (dicts preserve insertion
# order). Counter.most_common() is the same stable descending sort by count as
# sorted(items, key=count, reverse=True).
output_dictionary = {
    turkish: {
        english_stems_to_forms[english_stem].most_common(1)[0][0]: frequency
        for english_stem, frequency in translations.most_common()
    }
    for turkish, translations in turkish_to_english_stem_counts.items()
}

In [30]:
# Look up a single Turkish word: stem it the same way the dictionary keys were
# built, then show the raw entry followed by its dominant translations.
turkish_stem = turkish_stemmer.stem("yan").lower()
search_results = output_dictionary.get(turkish_stem)
print(turkish_stem, search_results)
if search_results is not None:
    # Total number of alignments recorded for this stem.
    total = sum(search_results.values())
    print(total)
    for meaning, frequency in search_results.items():
        # Skip translations that do not exceed 15% of all alignments.
        if not (frequency > total * 0.15):
            continue
        print(f"{meaning} ({frequency})")

geçirmek {'spend': 9, 'to': 6, 'killed': 3, 'passed': 2, 'regret': 1, 'garden': 1, 'go': 1, 'restored': 1}
24
spend (9)
to (6)


In [26]:
# export to json
import json
# Shape written to disk: {turkish: [{"english": ..., "frequency": ...}, ...]},
# each list ordered from most to least frequent translation.
# The payload is built before the file is opened so that a failure here does not
# leave a truncated output file behind.
export_payload = {
    turkish: [
        {"english": english, "frequency": frequency}
        for english, frequency in sorted(translations.items(), key=lambda a: a[1], reverse=True)
    ]
    for turkish, translations in output_dictionary.items()
}

# ensure_ascii=False emits Turkish characters verbatim, so the file is opened
# with an explicit UTF-8 encoding instead of the platform default, which may be
# unable to encode them.
# NOTE(review): the input cell reads "tatoeba_complete.tr-en.aligned.words" while
# this writes a "tatoeba_head100k..." file — confirm the output name is intended.
with open("tatoeba_head100k.tr-en.dictionary.json", "wt", encoding="utf-8") as output_file:
    json.dump(export_payload, output_file, ensure_ascii=False, indent=2)