In [24]:
# import the libraries
import nltk
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# download nltk resources for query preprocessing
# 'punkt' / 'punkt_tab': tokenizer data used by word_tokenize. Recent NLTK
# releases look up 'punkt_tab' instead of 'punkt', so fetch both to keep the
# notebook runnable on old and new installs.
nltk.download('punkt')
nltk.download('punkt_tab')
# 'stopwords': word lists read through nltk.corpus.stopwords
nltk.download('stopwords')
# 'wordnet': dictionary data needed by WordNetLemmatizer
nltk.download('wordnet')

from spellchecker import SpellChecker
from gensim.models import Word2Vec
import gensim.downloader
import spacy
from spacy.tokens import Span
from spacy.matcher import Matcher

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akshithabhashetty/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akshithabhashetty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/akshithabhashetty/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


B Requirements

In [13]:
# initialize Porter stemmer
stemmer = PorterStemmer()

# initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# the original notebook spelled this name 'lemmitizer'; keep that name bound to
# the same object so cells that still reference it keep working
lemmitizer = lemmatizer

In [6]:
# build the stopword set from NLTK's English list, minus a handful of
# relational/descriptive words that can carry context or intent in a query
# (words in the exclusion set that are not in NLTK's list are simply ignored)
stop_words = set(stopwords.words('english')).difference(
    {'like', 'with', 'without', 'for', 'similar', 'to', 'in', 'by', 'as'}
)

In [7]:
# flatten feature values to terms mapping (feature_mapping)

In [8]:
# remove stopwords from tokenized query
def remove_stopwords(tokens, stopwords):
    """Return the tokens that are not stopwords, preserving their order.

    Args:
        tokens: iterable of token strings.
        stopwords: container of words to drop. Membership is tested with
            ``in``, so a set gives the cheapest lookups. Note that inside this
            function the parameter name shadows the ``nltk.corpus.stopwords``
            import; the name is kept so existing keyword callers still work.

    Returns:
        list: every token from ``tokens`` that is not in ``stopwords``.
    """
    # the filtered list must be returned explicitly; without a return the
    # function yields None and downstream steps cannot iterate over the result
    return [token for token in tokens if token not in stopwords]

In [12]:
# textually normalizes the tokens (removes suffixes or finds root words)
def textually_normalize_tokens(tokens, normalizer, stem=True):
    """Stem or lemmatize every token and return the normalized list.

    Args:
        tokens: iterable of token strings.
        normalizer: object providing ``stem(token)`` when ``stem`` is True,
            or ``lemmatize(token)`` when ``stem`` is False.
        stem: True to call ``normalizer.stem``; False to call
            ``normalizer.lemmatize``.

    Returns:
        list: one normalized token per input token, in the same order.
    """
    # the result must be returned explicitly; without a return the function
    # yields None instead of the normalized tokens
    if stem:
        return [normalizer.stem(token) for token in tokens]
    return [normalizer.lemmatize(token) for token in tokens]

In [14]:
# preprocess query
def preprocess_query(query, normalizer, stem=True):
    """Lowercase, tokenize, stopword-filter and normalize a raw query string.

    Args:
        query: the raw query text.
        normalizer: stemmer (when ``stem`` is True) or lemmatizer (when
            ``stem`` is False) handed to ``textually_normalize_tokens``.
        stem: True to stem the tokens, False to lemmatize them.

    Returns:
        The value returned by ``textually_normalize_tokens`` for the filtered
        tokens.
    """
    # lowercase the query and then tokenize using nltk word tokenizer
    query_tokens = word_tokenize(query.lower())

    # remove stopwords from the query tokens; remove_stopwords takes the
    # stopword collection as a required second argument, so pass the
    # notebook-level stop_words set
    filtered_tokens = remove_stopwords(query_tokens, stop_words)

    # normalize tokens with lemmatizer or stemmer
    normalized_tokens = textually_normalize_tokens(filtered_tokens, normalizer, stem)

    return normalized_tokens


In [16]:
# spell checking the query
def autocorrect_query(tokens, spell=None):
    """Replace misspelled tokens with their suggested correction, in place.

    Args:
        tokens: mutable list of token strings. It is modified in place and
            also returned.
        spell: optional spell checker supporting ``token in spell`` and
            ``spell.correction(token)``. When omitted a new ``SpellChecker()``
            is created; pass one in to avoid rebuilding it on every call.

    Returns:
        list: the same ``tokens`` list, with unknown tokens replaced wherever
        a correction was available.
    """
    if spell is None:
        spell = SpellChecker()
    # iterate through the tokens and if the token is misspelled (not known to
    # the spell checker), then correct and replace it in the list of tokens
    for index, token in enumerate(tokens):
        if token not in spell:
            correction = spell.correction(token)
            # correction() can return None when it has no candidate; keep the
            # original token rather than putting None into the query
            if correction is not None:
                tokens[index] = correction
    return tokens

In [22]:
# load the pre-trained word2vec model, potentially fine-tune later
# (the model name is kept in a constant so it is easy to find and swap)
WORD2VEC_MODEL_NAME = 'word2vec-google-news-300'
word2vec_model = gensim.downloader.load(WORD2VEC_MODEL_NAME)



In [23]:
# find similar words from the word2vec model
def add_similar_query_tokens(tokens, model, topn=3, max_terms_per_token=2, min_similarity=0.7):
    """Collect expansion terms that are similar to the query tokens.

    Despite the name, the input list is not modified: only the newly found
    similar terms are returned.

    Args:
        tokens: list of query token strings.
        model: object exposing ``most_similar(key, topn=...)`` that returns
            ``(term, similarity_score)`` pairs, or an object holding such
            vectors under a ``wv`` attribute.
        topn: how many candidates to request from the model per token.
        max_terms_per_token: maximum number of candidates kept per token.
        min_similarity: candidates scoring below this value are discarded.

    Returns:
        list: the accepted similar terms, grouped in token order. Terms
        already present in ``tokens`` are never included; the same term can
        appear more than once if several tokens produce it.
    """
    # use the vectors under .wv when the model has them, otherwise the model itself
    vectors = getattr(model, 'wv', model)

    similar_terms = []
    for token in tokens:
        try:
            # topn must be passed by keyword: the second positional parameter
            # of most_similar is `negative`, not the number of results
            candidates = vectors.most_similar(token, topn=topn)
        except KeyError:
            # token is not in the model vocabulary, so it has no neighbours
            continue

        kept_for_token = 0
        for term, similarity_score in candidates:
            if kept_for_token >= max_terms_per_token:
                break
            # remove any terms that are already in the query or are too
            # different (low similarity)
            if term not in tokens and similarity_score >= min_similarity:
                similar_terms.append(term)
                kept_for_token += 1
    # return the similar terms for this query
    return similar_terms