# Text Similarity Models
This script builds models that assess the similarity between two pieces of text. These models will be used in the SimplifyIT app to check that generated simplified text accurately captures the meaning of the input text.

In [None]:
# Import packages for evaluating text similarity
import pandas as pd
import spacy
import os

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# Download nltk resources for tokenization and part of speech tagging.
# One-time setup: uncomment and run once per environment. `import nltk` must be
# added first, because the imports in this file bind only names taken from
# nltk submodules, not the `nltk` package itself.
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')

# Locations of clean and raw data. File names are appended to these with
# plain string concatenation, so the trailing slash is required.
ClnDat = '../data_clean/'
RawDat = '../data_raw/'

# Location to save trained models (same trailing-slash convention)
TrnMod = '../trained_models/'

In [None]:
# Define functions to process data for modeling

# Function takes a list of words and returns a list in which all stop words are removed
def remove_stop_words(wordlist, stops=None):
    """Return the words of *wordlist* that are not stop words.

    Args:
        wordlist: Iterable of word tokens. Matching is exact and
            case-sensitive, so tokens should already be lower-cased when the
            default English list is used.
        stops: Optional container of stop words to filter out. Pass a
            prebuilt ``set``/``frozenset`` to avoid reloading the NLTK list
            on every call (e.g. when this function is applied row by row).
            When omitted, NLTK's English stop-word list is used.

    Returns:
        A new list with the surviving words in their original order.
    """
    if stops is None:
        # A set gives O(1) membership tests for the filter below
        stops = set(stopwords.words("english"))
    return [w for w in wordlist if w not in stops]

# Function takes a list of words and returns a list of word stems
def stem_words(wordlist, stemmer=None):
    """Return the stem of every word in *wordlist*.

    Args:
        wordlist: Iterable of word tokens.
        stemmer: Optional object exposing a ``stem(word)`` method. Pass a
            shared instance to avoid constructing a new stemmer on every
            call, or a different stemmer to change the algorithm. When
            omitted, a fresh NLTK ``PorterStemmer`` is used.

    Returns:
        A new list holding one stem per input word, in the same order.
    """
    if stemmer is None:
        stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in wordlist]

In [None]:
# Load wikipedia sentence text
SentDF = pd.read_csv(ClnDat + 'wiki_sentence.csv').drop(columns=['same'])

# Melt data: every column other than sent_id/topic is unpivoted into rows,
# and the resulting variable/value columns are renamed to level/text
SentDF = SentDF.melt(id_vars=['sent_id', 'topic'])
SentDF.columns = ['name', 'topic', 'level', 'text']

# Subset data. Copy so that the column added below lands on an independent
# frame instead of a possible view (avoids pandas' SettingWithCopyWarning).
SentDF = SentDF[['name', 'level', 'text']].copy()

# Remove numbers and symbols and convert string to lower.
# regex=True is mandatory: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the text untouched.
# '+' is deliberately not part of the character class, so plus signs are
# stripped like every other symbol.
SentDF['text_c'] = (SentDF['text']
                    .str.replace(r'[^a-zA-Z\s]', '', regex=True)
                    .str.lower())

# Replace runs of two or more whitespace characters with a single space
SentDF['text_c'] = SentDF['text_c'].str.replace(r'\s{2,}', ' ', regex=True)

# Tokenize text entries
SentDF['text_c'] = SentDF['text_c'].apply(word_tokenize)

# Remove stop words BEFORE stemming: the Porter stemmer rewrites many stop
# words (e.g. "this" -> "thi"), after which they no longer match the
# stop-word list and would slip through the filter.
SentDF['text_c'] = SentDF['text_c'].apply(remove_stop_words)

# Stem the remaining words
SentDF['text_c'] = SentDF['text_c'].apply(stem_words)

In [None]:
# Wrap every tokenized sentence in a TaggedDocument whose only tag is the
# sentence's position in the corpus
tagged_data = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(SentDF['text_c'])
]

In [None]:
# Train doc2vec model on wikipedia sentence data set

# Specify model parameters
max_epochs = 10     # number of passes over the corpus
vec_size = 50       # dimensionality of the document/word vectors
alpha = 0.025       # initial learning rate

# Define model.
# `vector_size` and `epochs` are the current gensim keyword names; the older
# `size` keyword and `model.iter` attribute were removed in gensim 4.0.
model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,   # learning rate reached after the last epoch
                min_count=1,         # keep every word, however rare
                dm=1,                # distributed-memory (PV-DM) training
                epochs=max_epochs)

# Add vocabulary to model
model.build_vocab(tagged_data)

# Train model with a single call. gensim decays the learning rate linearly
# from `alpha` to `min_alpha` over all epochs by itself; calling train()
# repeatedly in a hand-written loop while adjusting alpha is discouraged by
# the gensim documentation and multiplies the number of passes actually run.
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

# Save model to trained_models directory (created first if it is missing,
# otherwise the save would fail)
os.makedirs(TrnMod, exist_ok=True)
model.save(TrnMod + "d2v.model")
print("Model Saved")

In [None]:
# Restore the doc2vec model previously written to the trained_models directory
model = Doc2Vec.load(os.path.join(TrnMod, "d2v.model"))

In [None]:
# Token lists of the rows whose `name` is 3, 4, 5 or 6, renumbered from zero
# so they can be addressed by position
SentComp = SentDF.loc[SentDF['name'].isin([3, 4, 5, 6]), 'text_c'].reset_index(drop=True)

In [None]:
# Cosine similarity between two tokenized entries, computed by the model's
# word vectors from the two token lists
first_entry = SentComp[1]
second_entry = SentComp[5]
model.wv.n_similarity(first_entry, second_entry)

In [None]:
# Load the OSE text-feature table; this replaces the Wikipedia sentence frame
# previously held in SentDF
SentDF = pd.read_csv(ClnDat + 'OSE_TextFeat.csv')

# Tokenize the raw text of every row into a list of word tokens.
# NOTE: unlike the Wikipedia pipeline earlier in this file, no lower-casing,
# symbol stripping, stop-word removal or stemming is applied here.
SentDF['tok_adv'] = SentDF['text'].apply(word_tokenize)

# Preview the first rows (rendered as the cell output)
SentDF.head()

In [None]:
# NOTE(review): SentDF was just reloaded from OSE_TextFeat.csv and the cell
# above only creates 'tok_adv'. Confirm the CSV really carries a 'text_c'
# column of token lists -- a column read back from CSV holds plain strings,
# which Word2Vec would iterate character by character. 'tok_adv' may be the
# intended input.
sentences = SentDF['text_c']

# Train a small word2vec model: ignore words seen fewer than 5 times, use a
# context window of 5 words and 10-dimensional vectors.
# gensim >= 4.0 names the dimensionality `vector_size` (formerly `size`).
model = Word2Vec(sentences, min_count=5, window=5, vector_size=10)

In [None]:
# Probe word whose learned embedding is inspected below
wordcheck = 'government'

# Show the vector learned for the probe word (raises KeyError when the word
# is not in the model's vocabulary)
model.wv.get_vector(wordcheck)

In [None]:
# List the ten vocabulary words whose vectors are most similar to the probe word
model.wv.most_similar(positive=[wordcheck], topn=10)