In [None]:
%pip install nltk

import os
import math
from collections import defaultdict
import re
import nltk
nltk.download('all')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Fetch only the NLTK resources the preprocessing code in this notebook uses.
# NOTE(review): the imports cell already calls nltk.download('all'); if that
# call is kept, these targeted downloads are redundant (and vice versa).
nltk.download('punkt')       # tokenizer models, presumably for word_tokenize
nltk.download('stopwords')   # corpus read via stopwords.words('english')
nltk.download('wordnet')     # lexical database, presumably for WordNetLemmatizer
nltk.download('punkt_tab')   # presumably the tabular punkt data newer NLTK releases expect — confirm

In [None]:
def preprocess(text):
    """Turn raw text into a flat list of normalized terms plus phonetic codes.

    Pipeline, in order:
      1. lowercase the input;
      2. delete every character that is neither a word character nor
         whitespace (underscores survive because ``\\w`` matches them);
      3. delete runs of digits;
      4. tokenize with ``word_tokenize``;
      5. drop English stopwords;
      6. lemmatize each remaining token, then Porter-stem the lemma;
      7. after each stemmed term, append its ``soundex`` code when the code
         differs from the term itself.

    Args:
        text: the string to process.

    Returns:
        A list of strings: each stemmed term, immediately followed by its
        Soundex code (when different from the term).
    """
    # Steps 1-3: case-fold, strip punctuation, strip digits.
    lowered = text.lower()
    no_punct = re.sub(r'[^\w\s]', '', lowered)
    cleaned = re.sub(r'\d+', '', no_punct)

    # Step 4.
    tokens = word_tokenize(cleaned)

    # NOTE(review): the stopword set, lemmatizer and stemmer are rebuilt on
    # every call; consider hoisting them if this runs once per document.
    english_stops = set(stopwords.words('english'))
    lemmatize = WordNetLemmatizer().lemmatize
    stem = nltk.stem.PorterStemmer().stem

    terms = []
    for token in tokens:
        if token in english_stops:
            continue
        term = stem(lemmatize(token))
        terms.append(term)  # the normalized term itself
        phonetic = soundex(term)
        if phonetic != term:  # skip when the code would just repeat the term
            terms.append(phonetic)

    return terms

In [None]:
def soundex(word):
    """Return the 4-character American Soundex code for ``word``.

    The code is the upper-cased first character followed by three digits
    that encode the consonants after it:

    * consonants map to the digits 1-6 (see ``mapping`` below);
    * adjacent letters with the same digit are encoded once, and this
      includes the first letter (so "Pfister" -> "P236");
    * ``H`` and ``W`` are transparent: equal-coded letters separated only
      by them still count as adjacent ("Ashcraft" -> "A261");
    * a vowel (or any other un-coded character) separates letters, so the
      same digit on both sides of it is encoded twice ("Tymczak" -> "T522");
    * the result is right-padded with zeros or truncated to 4 characters.

    Args:
        word: the string to encode; case is ignored.

    Returns:
        The 4-character code, or ``""`` when ``word`` is empty/falsy.
    """
    if not word:
        return ""

    word = word.upper()

    # Consonant -> digit table; vowels, H, W and Y are deliberately absent.
    mapping = {
        'B': '1', 'F': '1', 'P': '1', 'V': '1',
        'C': '2', 'G': '2', 'J': '2', 'K': '2', 'Q': '2', 'S': '2', 'X': '2', 'Z': '2',
        'D': '3', 'T': '3',
        'L': '4',
        'M': '5', 'N': '5',
        'R': '6'
    }

    soundex_code = word[0]
    # Seed with the first letter's digit so an immediately following letter
    # with the same digit is not encoded a second time.
    previous = mapping.get(word[0], '')

    for char in word[1:]:
        digit = mapping.get(char)
        if digit is not None:
            if digit != previous:
                soundex_code += digit
                if len(soundex_code) == 4:
                    break  # code is complete; later letters cannot matter
            previous = digit
        elif char not in 'HW':
            # A vowel (or other un-coded character) ends the current run, so
            # the same digit may be emitted again afterwards. H and W do not.
            previous = ''

    # Pad with zeros (the loop already stops at 4 characters).
    return (soundex_code + '000')[:4]