# Script in development for SimplifyIT Webapp

In [1]:
# Load packages
import pandas as pd
import numpy as np
import dill
import sklearn
import spacy
import textstat
import gpt_2_simple as gpt2
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim.models import Word2Vec, Doc2Vec
import tensorflow as tf

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [2]:
# Demonstration passage that the rest of the notebook simplifies.
# The literal starts and ends with a newline, exactly as typed.
InputText = '''
When a force acts upon an object to cause a displacement of the object, it is said that work was done upon the object. In order for a force to qualify as having done work on an object, there must be a displacement and the force must cause the displacement. 
'''

In [3]:
# Show the full contents of text columns instead of truncating them.
# The fully-qualified option name is used: abbreviated names are resolved by
# pattern matching and stop working if they ever become ambiguous.
pd.set_option('display.max_colwidth', None)

# Directory that holds the trained models loaded further down
TrnMod = '../trained_models/'

In [4]:
# Loaded spaCy pipelines keyed by model name, so a model is read from disk only once
_SPACY_PIPELINES = {}

# Function splits text, t, into sentences
def sent_break(t):
    """Split text into sentences with the spaCy 'en_core_web_sm' pipeline.

    Args:
        t: Text to segment.

    Returns:
        ``doc.sents`` for the parsed text. Callers in this notebook wrap it in
        ``list`` or iterate over it once; do the same if the sentences are
        needed more than one time.
    """
    nlp = _SPACY_PIPELINES.get('en_core_web_sm')
    if nlp is None:
        # Loading the pipeline is the expensive step; previously it was repeated
        # on every call (and sent_count calls this once per row).
        nlp = spacy.load('en_core_web_sm')
        _SPACY_PIPELINES['en_core_web_sm'] = nlp
    return nlp(t).sents

# Function counts the tokens in text t, summed over its sentences
def word_count(t):
    """Return the total number of tokens across all sentences of ``t``.

    Every token yielded by each sentence from ``sent_break`` is counted; no
    filtering is applied here.
    """
    return sum(len(list(sentence)) for sentence in sent_break(t))

# Function counts the number of sentences in text t
def sent_count(t):
    """Return how many sentences ``sent_break`` finds in ``t``."""
    total = 0
    for _ in sent_break(t):
        total += 1
    return total

# Load the trained text-difficulty model used by text_difficulty().
# NOTE: dill/pickle deserialization can execute arbitrary code -- only load
# model files that come from a trusted source.
# The context manager closes the file handle once the model is read.
with open(TrnMod + 'text_difficulty.pickle', 'rb') as model_file:
    text_diff_mod = dill.load(model_file)

In [5]:
# Function evaluates text difficulty: input is text and output is a difficulty level (advanced, elementary, intermediate)
def text_difficulty(t):
    """Classify the reading level of a text with ``text_diff_mod``.

    A one-row feature frame is built from the text (Flesch reading ease,
    words per sentence, syllables per word, syllable count) and handed to the
    loaded model; the predicted class index is mapped to a label.

    Args:
        t: Text to evaluate.

    Returns:
        'advanced', 'elementary' or 'intermediate'.
    """
    # Class index -> descriptor of the level
    labels = {0: 'advanced', 1: 'elementary', 2: 'intermediate'}
    # Columns passed to the model, in this order
    feature_cols = ['difficulty', 'lex_sent', 'syll_lex', 'n_syll']

    frame = pd.DataFrame([t], columns = ['text'])
    text_col = frame['text']

    frame['difficulty'] = text_col.apply(textstat.flesch_reading_ease)  # Flesch reading ease
    frame['n_sent'] = text_col.apply(sent_count)                        # Number of sentences
    frame['n_syll'] = text_col.apply(textstat.syllable_count)           # Number of syllables
    frame['n_lex'] = text_col.apply(textstat.lexicon_count)             # Number of words
    frame['lex_sent'] = frame['n_lex'] / frame['n_sent']                # Words per sentence
    frame['syll_lex'] = frame['n_syll'] / frame['n_lex']                # Syllables per word

    predicted = text_diff_mod.predict(frame[feature_cols])[0]
    return labels[predicted]

In [6]:
# Sentences of the demonstration text, materialised as a list
sent_list = list(sent_break(InputText))
sent_list

[
 When a force acts upon an object to cause a displacement of the object, it is said that work was done upon the object.,
 In order for a force to qualify as having done work on an object, there must be a displacement and the force must cause the displacement. ]

In [7]:
# Load the trained Doc2Vec model ("d2v.model" inside the TrnMod directory).
# text_similarity() below reads its word vectors to score sentence similarity.
similarity_mod = Doc2Vec.load(TrnMod + "d2v.model")

# Function takes a list of words and returns a list in which all stop words are removed
def remove_stop_words(wordlist):
    """Return the words of ``wordlist`` that are not NLTK English stop words.

    Order is preserved; matching is exact (no case folding is done here).
    """
    english_stops = frozenset(stopwords.words("english"))
    kept = []
    for word in wordlist:
        if word not in english_stops:
            kept.append(word)
    return kept

# Function takes a list of words and returns a list of word stems
def stem_words(wordlist):
    """Return the Porter stem of every word in ``wordlist``, in order."""
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, wordlist))

def text_similarity(t1, t2):
    """Score how similar two texts are using the loaded Doc2Vec model.

    Both texts are cleaned (non-letters removed, lower-cased, newlines and
    repeated whitespace collapsed), tokenized, stemmed and stripped of stop
    words; the two token lists are then compared with
    ``similarity_mod.wv.n_similarity``.

    Args:
        t1: First text (converted with ``str``).
        t2: Second text (converted with ``str``).

    Returns:
        The value returned by ``n_similarity`` for the two token lists.
    """
    texts = pd.Series([str(t1), str(t2)])

    # regex= is passed explicitly: the patterns below are regular expressions, and
    # pandas 2.0 changed the default of Series.str.replace to literal matching,
    # which would silently skip the cleaning.
    cleaned = (
        texts
        .str.replace(r'[^a-zA-Z\s+]', '', regex=True)  # Remove numbers and symbols
        .str.lower()
        .str.replace('\n', '', regex=False)            # Drop newlines
        .str.replace(r'\s+\s+', ' ', regex=True)       # Collapse runs of whitespace to one space
        # NOTE(review): reason for stripping the literal 'mss' is not visible here -- confirm it is still needed
        .str.replace('mss', '', regex=False)
    )

    # NOTE(review): stemming runs before stop-word removal, so a stop word whose
    # stem differs from its dictionary form would not be removed. The order is
    # kept as-is because it presumably mirrors how the model was trained -- confirm.
    tokens = (
        cleaned
        .apply(word_tokenize)      # Tokenize text entries
        .apply(stem_words)         # Stem text entries
        .apply(remove_stop_words)  # Remove stop words
    )

    # Cosine similarity between the two token lists
    return similarity_mod.wv.n_similarity(tokens[0], tokens[1])

In [8]:
# Function that calculates a "fit score" for each sentence as a function of whether or not meaning is maintained
# and syllables, words, and syllable to word ratio are reduced.
def sentence_fit(gen_text, orig_text, sim_weight=0.7, simp_weight=0.3):
    """Score generated sentences against the sentence they should replace.

    Args:
        gen_text: List of generated sentences.
        orig_text: The original sentence.
        sim_weight: Weight of the similarity term in the fit score (default 0.7).
        simp_weight: Weight of the simplicity term in the fit score (default 0.3).

    Returns:
        DataFrame with one row per generated sentence and the columns
        'Generated', 'Similarity', 'Simplicity', 'Fit Score', 'Original'.
        'Simplicity' is the share (0, 1/3, 2/3, 1) of three checks passed:
        fewer syllables, fewer words, lower syllable-to-word ratio than the original.
    """
    # Statistics of the original are loop-invariant: compute them once
    orig_syll = textstat.syllable_count(orig_text)
    orig_lex = textstat.lexicon_count(orig_text)
    # An original without words has no ratio; 0.0 means no candidate can beat it
    # (previously this case raised ZeroDivisionError)
    orig_ratio = orig_syll / orig_lex if orig_lex else 0.0

    df = pd.DataFrame(gen_text, columns = ['generated']) # Text generated from GPT2 stored in dataframe
    # Remove spaces in front of punctuation; regex=True is explicit because pandas 2.0
    # treats the pattern literally by default
    df['generated'] = (
        df['generated']
        .str.replace(r' +,', ',', regex=True)
        .str.replace(r' +\.', '.', regex=True)
    )
    df['similarity'] = df['generated'].apply(lambda x: text_similarity(orig_text, x)) # Cosine similarity to original
    df['n_syll'] = df['generated'].apply(textstat.syllable_count) # Count number of syllables
    df['n_lex'] = df['generated'].apply(textstat.lexicon_count) # Count number of words
    df['syll_lex'] = df['n_syll']/df['n_lex'] # Syllable to word ratio

    # Flags to indicate whether generated text has fewer syllables, fewer words, or a lower syllable to word ratio
    df['rel_syll'] = np.where(df['n_syll'] < orig_syll, 1, 0)
    df['rel_lex'] = np.where(df['n_lex'] < orig_lex, 1, 0)
    df['rel_rat'] = np.where(df['syll_lex'] < orig_ratio, 1, 0)

    # Average the binary indicators of relative sentence simplicity
    df['rel_simp'] = (df['rel_syll'] + df['rel_lex'] + df['rel_rat'])/3

    # Fit score is weighted sum of similarity and relative sentence simplicity
    # Highest score will be chosen
    df['fit_score'] = sim_weight*df['similarity'] + simp_weight*df['rel_simp']

    # Subset data and rename columns; .copy() so adding 'Original' below writes to an
    # independent frame rather than a slice (avoids SettingWithCopyWarning)
    result = df[['generated', 'similarity', 'rel_simp', 'fit_score']].copy()
    result.columns = ['Generated', 'Similarity', 'Simplicity', 'Fit Score']
    result['Original'] = orig_text

    return result

In [9]:
# Function takes model specification for SimpleGPT2 model and input text then returns new text
def generate_text(size, mod_dir, ft_dir, ft_dat, n_steps, input_text, n_new_sent):
    """Generate simplified replacements for every sentence of ``input_text``.

    Args:
        size: GPT-2 model size, '124M' or '355M'.
        mod_dir: Location of the pre-trained GPT-2 models. Not used by this
            function; kept so existing callers keep working.
        ft_dir: Checkpoint directory that holds the fine-tuned models.
        ft_dat: Name of the data set the model was fine-tuned on.
        n_steps: Number of fine-tuning steps (part of the run name).
        input_text: Text to simplify; it is split into sentences.
        n_new_sent: Number of candidates to generate per sentence.

    Returns:
        Tuple ``(all_new_sent, best_new_sent)``:
        all_new_sent holds every retained candidate with its scores and 'SentNo';
        best_new_sent has one row per sentence with 'SentNo', 'Original', 'Generated'.

    Raises:
        ValueError: if ``size`` is not a supported model size.
    """
    # The fine-tuned models were trained with different sentence delimiters
    sent_delims = {'355M': '|<EndSentence1>|', '124M': '<|textdelim|>'}
    if size not in sent_delims:
        # Previously an unknown size surfaced later as an unbound-variable error
        raise ValueError("size must be one of %s, got %r" % (sorted(sent_delims), size))
    sent_delim = sent_delims[size]

    # Run name of the fine-tuned model inside ft_dir
    ft_mod = ft_dat + '_' + size + '_' + str(n_steps)

    # Initiate tf session and load fine-tuned GPT2 model
    tf.reset_default_graph()
    sess = gpt2.start_tf_sess()

    all_frames = []  # scored candidates, one frame per sentence
    best_rows = []   # best candidate per sentence

    try:
        gpt2.load_gpt2(sess,
                       run_name = ft_mod,
                       checkpoint_dir = ft_dir)

        # Split the text passed in by the caller (not the notebook-level InputText)
        for sent_no, s in enumerate(sent_break(input_text), start = 1):
            orig_text = str(s).strip()

            gen_text = gpt2.generate(sess,
                                     nsamples = n_new_sent,
                                     prefix = '<|startoftext|>' + orig_text + sent_delim,
                                     truncate = '<|endoftext|>',
                                     include_prefix = False,
                                     run_name = ft_mod,
                                     checkpoint_dir = ft_dir,
                                     return_as_list = True,
                                     temperature = 0.8)

            AllSent = sentence_fit(gen_text, orig_text).reset_index(drop = True)
            AllSent['SentNo'] = 'Sent' + str(sent_no)
            AllSent = AllSent.drop_duplicates()
            # Keep candidates that differ from the original and preserve its meaning
            AllSent = AllSent[(AllSent['Generated'] != AllSent['Original']) & (AllSent['Similarity'] > 0.9)]

            if AllSent.empty:
                # No acceptable candidate: keep the original sentence instead of
                # failing with IndexError
                Sent = orig_text
            else:
                # First row with the highest fit score
                Sent = AllSent.loc[AllSent['Fit Score'].idxmax(), 'Generated']

            all_frames.append(AllSent)
            best_rows.append({'SentNo': 'Sent' + str(sent_no),
                              'Original': orig_text,
                              'Generated': Sent})
    finally:
        # Release the session's resources; it is not used after this call
        sess.close()

    # Build the result frames once instead of appending inside the loop
    # (DataFrame.append is deprecated and removed in pandas 2.0)
    all_new_sent = pd.concat(all_frames, ignore_index = True) if all_frames else pd.DataFrame()
    best_new_sent = pd.DataFrame(best_rows, columns = ['SentNo', 'Original', 'Generated'])

    return all_new_sent, best_new_sent


In [10]:
# Settings handed to generate_text for the fine-tuned GPT2 model
ft_dat = 'wiki_sentence'                  # Data set on which GPT2 was fine-tuned
n_steps = 2000                            # Number of steps used to train model
gpt2size = '124M'                         # Either '124M' or '355M'
loaddir = '../trained_models/checkpoint'  # Location where fine-tuned gpt2 models stored
gpt2dir = '../gpt2models'                 # Location where pre-trained gpt2 models stored

# Three candidates are generated for each sentence of InputText
all_new, best_new = generate_text(
    gpt2size, gpt2dir, loaddir, ft_dat, n_steps,
    input_text = InputText,
    n_new_sent = 3,
)

Loading checkpoint ../trained_models/checkpoint/wiki_sentence_124M_2000/model-2000
INFO:tensorflow:Restoring parameters from ../trained_models/checkpoint/wiki_sentence_124M_2000/model-2000


In [11]:
best_new

Unnamed: 0,SentNo,Original,Generated
0,Sent1,"When a force acts upon an object to cause a displacement of the object, it is said that work was done upon the object.","When a force acts upon an object to cause a displacement of the object, it is said that work was done upon the object."
1,Sent2,"In order for a force to qualify as having done work on an object, there must be a displacement and the force must cause the displacement.",If there is a force in a force then it must be able to move the object.


In [9]:
# Settings for the fine-tuned gpt2 model used by the scratch cells below
gpt2size = '124M'                         # GPT2 model size (either 124M or 355M)
n_steps = 2000                            # Number of steps performed to train model
gpt2dir = '../gpt2models'                 # Location in which to save pretrained GPT2 model
loaddir = '../trained_models/checkpoint'  # Location from which to load fine-tuned models
ft_dat = 'wiki_sentence'                  # Data used to fine-tune model

# Name of fine-tuned model: <data>_<size>_<steps>
ft_mod = '_'.join([ft_dat, gpt2size, str(n_steps)])

# Start from a clean graph, open a session and load the fine-tuned model into it
tf.reset_default_graph()
sess = gpt2.start_tf_sess()
gpt2.load_gpt2(sess, run_name = ft_mod, checkpoint_dir = loaddir)

Loading checkpoint ../trained_models/checkpoint/wiki_sentence_124M_2000/model-2000
INFO:tensorflow:Restoring parameters from ../trained_models/checkpoint/wiki_sentence_124M_2000/model-2000


In [10]:
# Generate five candidates for every sentence in sent_list, score them with
# sentence_fit and keep the best-scoring candidate per sentence.
# NOTE(review): the delimiter below is hard-coded to '|<EndSentence1>|' while
# generate_text uses '<|textdelim|>' for the 124M model -- confirm which one the
# loaded checkpoint expects.
all_frames = []  # scored candidates, one frame per sentence
best_rows = []   # best candidate per sentence

for x, s in enumerate(sent_list, start = 1):
    orig_text = str(s).replace('\n', '')

    gen_text = gpt2.generate(sess,
                             nsamples = 5,
                             prefix = '<|startoftext|>' + orig_text + '|<EndSentence1>|',
                             truncate = '<|endoftext|>',
                             include_prefix = False,
                             run_name = ft_mod,
                             checkpoint_dir = loaddir,
                             return_as_list = True,
                             temperature = 0.8)

    AllSent = (
        sentence_fit(gen_text, orig_text)
        .sort_values(by = ['Fit Score'], ascending = False)
        .reset_index(drop = True)
    )
    AllSent['SentNo'] = 'Sent' + str(x)
    AllSent = AllSent.drop_duplicates()

    # sentence_fit names the text column 'Generated' ('Generated Text' raised KeyError)
    Sent = AllSent.loc[AllSent['Fit Score'].idxmax(), 'Generated']

    all_frames.append(AllSent)
    best_rows.append({'SentNo': 'Sent' + str(x), 'Sentence': Sent})

# Build the frames once after the loop; DataFrame.append is deprecated and was
# removed in pandas 2.0
all_new_sent = pd.concat(all_frames, ignore_index = True) if all_frames else pd.DataFrame()
best_new_sent = pd.DataFrame(best_rows, columns = ['SentNo', 'Sentence'])
    

In [22]:
# Best sentence chosen for each original sentence, in order.
# The frame built by the generation loop is best_new_sent; the previously used
# name best_replacements is not defined in any visible cell and fails on a fresh kernel.
bestsents = list(best_new_sent['Sentence'])

In [25]:
# Join the selected sentences with single spaces into one passage
' '.join(bestsents)

'When a force is applied to an object, it is said that work was done upon it. These must be called displacement and the force must cause the displacement.'

In [20]:
# Pick the best candidate of the last processed sentence.
# Filter on similarity first and take the maximum afterwards, as generate_text does;
# taking the overall maximum first can leave nothing once the similarity filter is applied.
# sentence_fit names the text column 'Generated'.
similar_enough = AllSent[AllSent['Similarity'] > 0.9]
Sent = similar_enough.loc[similar_enough['Fit Score'].idxmax(), 'Generated']

Sent

'These must be called displacement and the force must cause the displacement.'

In [15]:
# Generate five replacement candidates for test_text from the fine-tuned gpt2 model.
# NOTE(review): test_text is not defined in any visible cell -- it must be set
# before this cell runs on a fresh kernel.
# NOTE(review): the prefix uses the '|<EndSentence1>|' delimiter; generate_text uses
# '<|textdelim|>' for the 124M model -- confirm which one the loaded checkpoint expects.
gen_text = gpt2.generate(sess, 
                         nsamples = 5,
                         prefix = '<|startoftext|>' + test_text + '|<EndSentence1>|',
                         truncate = '<|endoftext|>',
                         include_prefix = False,
                         run_name = ft_mod,
                         checkpoint_dir = loaddir,
                         return_as_list = True,
                         temperature = 0.8)

In [21]:
# Display the candidate table without duplicate rows (the result is shown, not stored)
AllSent.drop_duplicates()

Unnamed: 0,Generated Text,Similarity,Simplicity,Fit Score,SentNo
0,These must be called displacement and the force must cause the displacement.,0.933517,1.0,0.953462,Sent2
1,"In order for a force to qualify as having done work on an object, there must be a displacement and the force must cause the displacement.",1.0,0.666667,0.9,Sent2
4,"In order for a force to qualify as having done work on an object, there must be a displacement and the force must create the displacement.",0.99224,0.0,0.694568,Sent2


In [120]:
# Score the candidates against test_text and list them best-first
SentFit = (
    sentence_fit(gen_text, test_text)
    .sort_values(by = ['Fit Score'], ascending = False)
    .reset_index(drop = True)
)
SentFit

Unnamed: 0,Generated Text,Similarity,Simplicity,Fit Score
0,"When a force causes a displacement of the object, it is said that work was done on the object.",0.978373,1.0,0.984861
1,"When a force acts upon an object, it is said that work was done upon the object and that the displacement of the object would have had to have been done.",0.994325,0.333333,0.796028
2,"This is translated from the Greek word `` work '', which means `` to work '', `` to work '' and `` to work ''.",0.777808,0.666667,0.744465
3,"When a force acts upon an object to cause a displacement of the object, it is said that work was done upon the object.",1.0,0.0,0.7
4,"When a force acts upon an object to cause a displacement of the object, it is said that work was done upon the object.",1.0,0.0,0.7


In [128]:
Sent = list(SentFit[SentFit['Fit Score'] == SentFit['Fit Score'].max()]['Generated Text'])[0]

In [131]:
# One-row frame pairing the sentence label with the chosen sentence
pd.DataFrame({'SentNo': ['Sent1'], 'Sentence': [Sent]})

Unnamed: 0,SentNo,Sentence
0,Sent1,"When a force causes a displacement of the object, it is said that work was done on the object."


In [11]:
# Longer practice passage (six sentences about wave behaviour) used to try out sentence splitting
prac_text = '''A wave doesn't just stop when it reaches the end of the medium. Rather, a wave will undergo certain behaviors when it encounters the end of the medium. Specifically, there will be some reflection off the boundary and some transmission into the new medium. The transmitted wave undergoes refraction (or bending) if it approaches the boundary at an angle. If the boundary is merely an obstacle implanted within the medium, and if the dimensions of the obstacle are smaller than the wavelength of the wave, then there will be very noticeable diffraction of the wave around the object. Each one of these behaviors - reflection, refraction and diffraction - is characterized by specific conceptual principles and mathematical equations'''

In [18]:
# Sentences of the practice text as plain strings, shown one per row.
# This rebinds sent_list, which earlier held the sentences of InputText.
sent_list = list(map(str, sent_break(prac_text)))
pd.DataFrame(sent_list, columns = ['sentences'])

Unnamed: 0,sentences
0,A wave doesn't just stop when it reaches the end of the medium.
1,"Rather, a wave will undergo certain behaviors when it encounters the end of the medium."
2,"Specifically, there will be some reflection off the boundary and some transmission into the new medium."
3,The transmitted wave undergoes refraction (or bending) if it approaches the boundary at an angle.
4,"If the boundary is merely an obstacle implanted within the medium, and if the dimensions of the obstacle are smaller than the wavelength of the wave, then there will be very noticeable diffraction of the wave around the object."
5,"Each one of these behaviors - reflection, refraction and diffraction - is characterized by specific conceptual principles and mathematical equations"
