## Similar sentences generator.

- This notebook tries to generate sentences that are similar to a given input sentence.
- It makes use of WordNet and GloVe embeddings to arrive at substitute words for candidate words in a sentence.
- The similarity threshold and the number of sentences generated can be controlled.

#### Load GloVe Vectors

In [31]:
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec

In [None]:
# Path to the GloVe vectors. The file is loaded further down with
# load_word2vec_format(..., binary=False), so it must already be in
# word2vec text format.
glove_path = "../Word2vec/gensim_glove.6B.50d.txt"

In [None]:
# Load the GloVe vectors from the word2vec-format text file (binary=False)
# into a gensim KeyedVectors object.
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format(glove_path, binary=False)

In [None]:
# fetch the vocab
# KeyedVectors only exposes the `.wv` alias in gensim < 4.0 (where it simply
# returns the object itself); fall back to the object when the alias is absent
# so this cell works with both gensim 3.x and 4.x.
vocab = getattr(glove_model, 'wv', glove_model)

In [None]:
# check the GloVe vectors: number of words loaded
# gensim >= 4.0 replaced the `.vocab` dict with `.key_to_index`; use whichever
# one this gensim version provides.
len(vocab.key_to_index) if hasattr(vocab, 'key_to_index') else len(vocab.vocab)

#### NLTK + WordNet

In [None]:
# nltk is used to perform POS tagging
import nltk 
from nltk.corpus import wordnet
from nltk.corpus import stopwords
# English stopwords, kept in a set for membership tests
stop_words = set(stopwords.words('english')) 
from nltk.stem import WordNetLemmatizer
# single lemmatizer instance shared by the functions defined below
lemmatizer = WordNetLemmatizer()

Utility method to return "simple" POS tag for a given POS tag.

In [None]:
def fetch_pos_identity(pos_tag):
    '''
    Collapse a detailed POS tag into a one- or two-letter "simple" tag.

    args:

    pos_tag = detailed POS tag (e.g. 'NN', 'VBD', 'JJR')

    returns:

    'np' for proper nouns (NNP, NNPS)
    'n'  for all other nouns (NN, NNS)
    'a'  for adjectives (JJ, JJR, JJS)
    'v'  for verbs (VB, VBD, VBG, VBN, VBP, VBZ)
    'r'  for adverbs (RB, RBR, RBS)
    None for every other tag
    '''

    # (simple tag, detailed tags that map to it)
    tag_groups = (
        ('np', ('NNP', 'NNPS')),
        ('n', ('NN', 'NNS')),
        ('a', ('JJ', 'JJR', 'JJS')),
        ('v', ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')),
        ('r', ('RB', 'RBR', 'RBS')),
    )

    for simple_tag, detailed_tags in tag_groups:
        if pos_tag in detailed_tags:
            return simple_tag

    return None

Function to return most similar words.

In [None]:
def get_related_words(word, pos_tag, similarity_threshold):
    
    '''
    This method returns the WordNet synonyms of a word whose GloVe vector is
    close enough to the vector of the word itself.
    
    args:
    
    word = input word
    pos_tag = Simple POS tag of the word ('n', 'a', 'v', 'r'; 'np' is treated as 'n')
    similarity_threshold (float) = Value between 0 and 1. A synonym is kept only if
                                   its cosine similarity to the word is above this value
    
    returns:
    
    a list whose first element is the lemmatized input word, followed by the
    accepted synonyms. If the lemmatized word has no GloVe vector, the list
    contains only that word.
    '''

    # WordNet has no proper-noun category, so look proper nouns up as nouns
    wordnet_pos = 'n' if pos_tag == 'np' else pos_tag

    # Lemmatize the word
    word = lemmatizer.lemmatize(word, wordnet_pos)
    synonyms = [word]

    # `.wv` only exists on KeyedVectors in gensim < 4.0 (as an alias for the
    # object itself); fall back to the object so lookups work on either version
    vectors = getattr(glove_model, 'wv', glove_model)

    try:
        vector_check = vectors.get_vector(word)
    except KeyError:
        # If the word does not exist in the GloVe model, return
        return synonyms

    # Only consider synsets with the same part of speech as the word, so that
    # e.g. a verb is never replaced by a noun sense of the same spelling
    for syn in wordnet.synsets(word, pos=wordnet_pos):

        for lemma in syn.lemmas():

            candidate = lemma.name()
            if candidate in synonyms:
                continue

            try:
                # Get the vector of the synonym
                vector_prospect = vectors.get_vector(candidate)
            except KeyError:
                # Synonym is not in the GloVe vocabulary (e.g. multi-word lemma)
                continue

            # cosine_similarities returns one value per vector in vectors_all
            similarity = vectors.cosine_similarities(vector_1=vector_check, vectors_all=[vector_prospect])[0]

            if similarity > similarity_threshold:
                synonyms.append(candidate)

    return synonyms

#### Utility Functions

In [None]:
def get_next_position(total_synonym_array, position_array, last_position):
    
    '''
    Pick the next word position to replace, scanning round-robin from the
    position that follows last_position.
    
    args:
    
    total_synonym_array = list with the number of candidate words per position
    position_array = list with the current index per position (-1 marks a
                     position that is never replaced)
    last_position = integer index of the position replaced most recently
    
    returns:
    
    the index of the first position that is neither marked -1 nor has a
    current index equal to its candidate count; -1 if no such position exists
    '''
    slot_count = len(total_synonym_array)
    
    # visit every slot exactly once, wrapping around the end of the sentence
    for step in range(1, slot_count + 1):
        candidate = (last_position + step) % slot_count
        current_index = position_array[candidate]
        
        is_fixed = current_index == -1
        is_exhausted = current_index == total_synonym_array[candidate]
        
        if not (is_fixed or is_exhausted):
            return candidate

    return -1

In [None]:
def get_position_arrays(sentence_combination):
    
    '''
    Build the two bookkeeping lists used while generating sentences.
    
    args:
    
    sentence_combination = list of candidate-word lists, one per word,
                           e.g. [[word], [word1, word2, ]]
    
    returns:
    
    (total_synonym_array, initial_position_array) where the first holds the
    length of each candidate list and the second holds -1 for lists of
    length 1 and 0 for every other list
    '''
    total_synonym_array = [len(candidates) for candidates in sentence_combination]
    
    # -1 flags a word with a single candidate; everything else starts at index 0
    initial_position_array = [-1 if count == 1 else 0 for count in total_synonym_array]
    
    return total_synonym_array, initial_position_array

#### Function to provide an alternate sentence

In [None]:
def provide_alternate_sentence(sentence, num_versions=1, max_changes=1, similarity_threshold=0.7, ignore_stopwords=True, ignore_proper_nouns=True):
    
    '''
    This method returns alternate version(s) of the sentence passed by replacing words with their closest synonyms.
    
    Each version differs from the previous one by exactly one word: positions
    that have synonyms are visited round-robin, and every visit moves that
    position on to its next unused synonym. Words that have not been visited
    yet keep their original form.
    
    args:
    
    sentence (String) = the input sentence
    num_versions (int) = the number of alternate versions required
    max_changes (int) = accepted for backward compatibility; currently not used
    similarity_threshold (float) = Value between 0 and 1. Indicates the similarity threshold to consider while replacing words
    ignore_stopwords (bool) = If True, stopwords will not be considered for replacement
    ignore_proper_nouns (bool) = If True, proper nouns will be ignored for replacement
    
    returns:
    
    list of alternate sentence(s); it holds fewer than num_versions entries
    (possibly none) when the synonyms run out first
    '''
    
    # One candidate list per word; slot 0 is always the word as it appears in
    # the input, the remaining slots are its accepted synonyms.
    sentence_combination = []
    
    # split the sentence into words and pos tag them
    for word, pos_tag in nltk.pos_tag(sentence.split()):
        
        short_pos = fetch_pos_identity(pos_tag)
        
        # only nouns, adjectives, adverbs and verbs are candidates for
        # replacement; proper nouns only when the caller asks for them
        if short_pos is None or (ignore_proper_nouns and short_pos == 'np'):
            sentence_combination.append([word])
            continue
        
        # WordNet has no proper-noun category, so treat proper nouns as nouns
        wordnet_pos = 'n' if short_pos == 'np' else short_pos
        
        # ignore stopwords (checked on both the surface form and the lemma)
        if ignore_stopwords and (word in stop_words or lemmatizer.lemmatize(word, wordnet_pos) in stop_words):
            sentence_combination.append([word])
            continue
        
        # get_related_words returns the lemmatized word first; keep the
        # original surface form instead so unchanged words are not altered
        related = get_related_words(word, wordnet_pos, similarity_threshold)
        sentence_combination.append([word] + [candidate for candidate in related[1:] if candidate != word])
    
    total_synonym_array, _ = get_position_arrays(sentence_combination)
    
    # position_array[i] is the index of the NEXT unused candidate at position i.
    # It starts at 1 because slot 0 is the original word; -1 marks positions
    # that are never replaced. A position is exhausted once its value reaches
    # the number of candidates, which is what get_next_position checks for.
    position_array = [1 if count > 1 else -1 for count in total_synonym_array]
    
    alternate_sentences = []
    last_position = -1
    
    for _ in range(num_versions):
        
        # get the position to replace
        position = get_next_position(total_synonym_array, position_array, last_position)
        
        # every replaceable position has run out of synonyms
        if position == -1:
            break
        
        position_array[position] += 1
        last_position = position
        
        # (next unused index - 1) is the candidate currently in use; fixed
        # positions (-1) and untouched positions both resolve to slot 0
        alternate_sentences.append(' '.join(
            candidates[max(next_index - 1, 0)]
            for candidates, next_index in zip(sentence_combination, position_array)
        ))
    
    return alternate_sentences

#### Test the Generator

In [None]:
# Demo: request up to 4 alternate versions, using a similarity threshold of 0.60
provide_alternate_sentence('We collect your information regularly', num_versions=4, similarity_threshold=0.60)