In [11]:
pip install textdistance

Collecting textdistance
  Downloading textdistance-4.5.0-py3-none-any.whl (31 kB)
Installing collected packages: textdistance
Successfully installed textdistance-4.5.0
Note: you may need to restart the kernel to use updated packages.


In [12]:
import textdistance  

In [5]:
import re
from collections import defaultdict

# Load bigram data from a text file
# Load bigram data from a text file
bigram_data = {}
with open('/kaggle/input/tscorpus-bigram-freq-dict/bigram_freq_dist.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line:
            # Use a regex pattern to extract bigram and count
            match = re.match(r"\('(.+)', '(.+)'\): (\d+)", line)
            if match:
                bigram = (match.group(1), match.group(2))
                count = int(match.group(3))
                bigram_data[bigram] = count

In [6]:
# Preprocess the input text
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zğüşıöç ]', '', text)  # Remove non-alphanumeric characters
    words = text.split()
    return words

In [26]:
def generate_candidates(word):
    alphabet = "abcçdefgğhıijklmnoöprsştuüvyz"
    candidates = set()

    # Generate candidates by insertion
    for i in range(len(word) + 1):
        for char in alphabet:
            candidates.add(word[:i] + char + word[i:])

    # Generate candidates by deletion
    for i in range(len(word)):
        candidates.add(word[:i] + word[i+1:])

    # Generate candidates by substitution
    for i in range(len(word)):
        for char in alphabet:
            candidates.add(word[:i] + char + word[i+1:])

    # Generate candidates by transposition
    for i in range(len(word) - 1):
        candidates.add(word[:i] + word[i+1] + word[i] + word[i+2:])

    return list(candidates)

In [28]:
# Function to score a candidate based on bigram frequency
def score_candidate(candidate, context):
    bigram = (context, candidate)
    return bigram_data.get(bigram, 0)

In [32]:
def spell_check(input_text, threshold=0):
    words = preprocess(input_text)
    corrected_text = []

    for word in words:
        if word in bigram_data:  # Check if the word is in the bigram data (unigram check)
            corrected_word = word
        else:
            # Generate candidate corrections
            candidates = generate_candidates(word)
            
            # Score candidates based on bigram frequency with context
            context = corrected_text[-1] if corrected_text else ''
            scores = {candidate: score_candidate(candidate, context) for candidate in candidates}
            
            # Filter candidates by the threshold
            filtered_candidates = [candidate for candidate in candidates if scores[candidate] >= threshold]
            
            if filtered_candidates:
                # Choose the candidate with the highest score among filtered candidates
                corrected_word = max(filtered_candidates, key=lambda candidate: scores[candidate], default=word)
            else:
                # If no candidates meet the threshold, keep the original word
                corrected_word = word
        
        corrected_text.append(corrected_word)

    return ' '.join(corrected_text)

In [126]:
input_text = "müşteri vri kümesi banka tarafından hizmet verilen veya  hizmet alma potansiyeli olan tüm gerçek ve tüzel müşterileri ve bu müşterileri tanımlayan verileri kapsamaktadır"
corrected_text = spell_check(input_text, threshold=3)
print(corrected_text)

müşteri veri kümesi banka tarafından hizmete verilen veya hizmet almak potansiyeli olan tüm gerçek ve güzel müşterileri de bu müşterilerin tanımlayan verileri kapsamaktadır


In [103]:
input_text = "yönetim raporlar dışındaki tüm üye işyeri verisi"
corrected_text = spell_check(input_text, threshold=3)
print(corrected_text)

yönetim raporları dışındaki tüm üye işyeri verisi


In [102]:
input_text = "teknoloji desteğ ile "
corrected_text = spell_check(input_text, threshold=3)
print(corrected_text)

teknoloji desteği ile


In [125]:
input_text = "iç yönetim yapılrı"
corrected_text = spell_check(input_text, threshold=3)
print(corrected_text)

iç yönetim yapıları
