In [4]:
# ========================================
# Autocomplete & Autocorrect - Oasis Infobyte Task
# Fixed Version with punkt_tab
# ========================================

# Step 1: Install required libraries
!pip install nltk textdistanceg

# Step 2: Import libraries
import nltk
import re
import textdistance
from collections import Counter, defaultdict
from nltk.util import ngrams

# Step 3: Download NLTK data (fixed)
nltk.download('reuters')
nltk.download('punkt')
nltk.download('punkt_tab')  # <-- Fix for ValueError

# Step 4: Load a sample text dataset (Reuters corpus)
from nltk.corpus import reuters
corpus_text = " ".join([" ".join(sent) for sent in reuters.sents()])

# Step 5: Preprocess text
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # Keep only letters and spaces
    tokens = nltk.word_tokenize(text)
    return tokens

tokens = preprocess(corpus_text)
print(f"Sample tokens: {tokens[:20]}")
print(f"Total tokens: {len(tokens)}")

# Step 6: Build word frequency dictionary for autocorrect
word_freq = Counter(tokens)

# Step 7: Autocorrect function
def autocorrect(word):
    if word in word_freq:
        return word
    candidates = [(w, textdistance.levenshtein(word, w)) for w in word_freq.keys()]
    candidates = sorted(candidates, key=lambda x: (x[1], -word_freq[x[0]]))
    return candidates[0][0] if candidates else word

# Step 8: Build bigram model for autocomplete
bigram_model = defaultdict(Counter)
for w1, w2 in ngrams(tokens, 2):
    bigram_model[w1][w2] += 1

def autocomplete(word, top_n=5):
    if word not in bigram_model:
        return []
    suggestions = bigram_model[word].most_common(top_n)
    return [w for w, _ in suggestions]

# Step 9: Phrase-based autocomplete
def autocomplete_phrase(phrase, top_n=5):
    words = preprocess(phrase)
    if not words:
        return []
    last_word = words[-1]
    completions = autocomplete(last_word, top_n)
    return [phrase + " " + c for c in completions]

# Step 10: Tests
print("\n--- Autocorrect Tests ---")
print("hte ->", autocorrect("hte"))
print("recieve ->", autocorrect("recieve"))

print("\n--- Autocomplete Tests ---")
print("government ->", autocomplete("government"))
print("\nPhrase autocomplete for 'the government':")
print(autocomplete_phrase("the government"))

# Step 11: Interactive Mode
while True:
    user_input = input("\nEnter a word or phrase (or 'exit' to stop): ").lower()
    if user_input == 'exit':
        break
    print("Autocorrect (last word):", autocorrect(user_input.split()[-1]))
    print("Autocomplete (next word):", autocomplete(user_input.split()[-1]))
    print("Autocomplete phrase:", autocomplete_phrase(user_input))




[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Sample tokens: ['asian', 'exporters', 'fear', 'damage', 'from', 'u', 's', 'japan', 'rift', 'mounting', 'trade', 'friction', 'between', 'the', 'u', 's', 'and', 'japan', 'has', 'raised']
Total tokens: 1330383

--- Autocorrect Tests ---
hte -> he
recieve -> recieve

--- Autocomplete Tests ---
government -> ['s', 'to', 'has', 'securities', 'officials']

Phrase autocomplete for 'the government':
['the government s', 'the government to', 'the government has', 'the government securities', 'the government officials']

Enter a word or phrase (or 'exit' to stop): asain
Autocorrect (last word): again
Autocomplete (next word): []
Autocomplete phrase: []

Enter a word or phrase (or 'exit' to stop): officials
Autocorrect (last word): officials
Autocomplete (next word): ['said', 'have', 'of', 'told', 'in']
Autocomplete phrase: ['officials said', 'officials have', 'officials of', 'officials told', 'officials in']

Enter a word or phrase (or 'exit' to stop): told
Autocorrect (last word): told
Autocompl