In [1]:
import spacy
import pandas as pd

In [2]:
# Example queries fed to find_similar() at the end of the notebook; two are
# under five words and two are longer, so both corpora get exercised.
phrase_list = [
    "Marriage fighting",
    "I'm not sure what to do",
    "my health",
    "Traveling on the subway to work",
]

In [3]:
# Load the survey responses. Later cells read the Stress1-3, Stress1Ex-3Ex and
# Strategy1-3 columns from this frame.
df = pd.read_csv('stress_management_mturk.csv')

In [4]:
# Replace missing example text with a placeholder so every entry in
# 'Stress3Ex' is a string before it is handed to the spaCy pipeline.
# Rows are selected with isna() rather than a hard-coded row number, so the
# patch keeps working if the CSV is reordered or gains rows.
df.loc[df['Stress3Ex'].isna(), 'Stress3Ex'] = 'None given'

In [5]:
# Stack the three numbered columns of each kind (stressor, example, strategy)
# end to end into one Series with a fresh 0..n-1 index. All three use the same
# column order, so position i refers to the same response in each of them.
stresses = pd.concat(
    [df[col] for col in ('Stress1', 'Stress2', 'Stress3')],
    axis=0,
    ignore_index=True,
)
examples = pd.concat(
    [df[col] for col in ('Stress1Ex', 'Stress2Ex', 'Stress3Ex')],
    axis=0,
    ignore_index=True,
)
strategy = pd.concat(
    [df[col] for col in ('Strategy1', 'Strategy2', 'Strategy3')],
    axis=0,
    ignore_index=True,
)

In [6]:
# Load the spaCy pipeline used for all similarity comparisons below.
nlp = spacy.load('en_core_web_lg')

# Workaround for the spaCy issue where the model's vocabulary entries are not
# flagged as stop words: set the flag explicitly for every default stop word.
for word in nlp.Defaults.stop_words:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

In [7]:
def find_similar_phrase(search_phrase, phrases):
    """Score every candidate phrase against a search phrase.

    The search phrase and each candidate are run through the module-level
    spaCy pipeline ``nlp`` and compared with ``Doc.similarity``.

    Args:
        search_phrase (str): the phrase to look up
        phrases (iterable of str): candidate phrases to compare with the
            search_phrase

    Returns:
        results (list): one similarity score per candidate, in the same
            order as ``phrases``

    """
    query_doc = nlp(search_phrase)  # parsed once, reused for every comparison
    return [nlp(candidate).similarity(query_doc) for candidate in phrases]

In [8]:
# Two search corpora: the short stressor names and the longer example texts.
# Each is the three numbered columns stacked end to end and re-indexed from 0
# (the same data as `stresses` / `examples` built earlier in the notebook).
short_phrases = pd.concat(
    [df[col] for col in ('Stress1', 'Stress2', 'Stress3')],
    ignore_index=True,
)

long_phrases = pd.concat(
    [df[col] for col in ('Stress1Ex', 'Stress2Ex', 'Stress3Ex')],
    ignore_index=True,
)

In [9]:
def find_similar(similar_phrase, top_n=5, long_phrase_min_words=5):
    """Print the strategies attached to the phrases most similar to a query.

    Queries with fewer than ``long_phrase_min_words`` words are matched
    against ``short_phrases``; all others are matched against
    ``long_phrases``. For each of the best matches the function prints the
    strategy stored at the same position in ``strategy`` followed by the
    matching phrase itself.

    Args:
        similar_phrase (str): the query phrase
        top_n (int): how many best matches to print (default 5); expected to
            be non-negative
        long_phrase_min_words (int): word count at which a query is treated
            as a long phrase (default 5)

    Returns:
        None. Results are written to stdout.
    """
    # str.split() with no argument collapses runs of whitespace, so doubled,
    # leading or trailing spaces do not inflate the word count and push a
    # short query into the long-phrase corpus.
    is_short = len(similar_phrase.split()) < long_phrase_min_words
    if is_short:
        corpus, label = short_phrases, 'matching short phrase:'
    else:
        corpus, label = long_phrases, 'matching long phrase:'

    results = find_similar_phrase(similar_phrase, corpus)

    # Rank corpus positions by descending score; equal scores keep corpus order.
    best_matches = sorted(enumerate(results), key=lambda x: -x[1])[:top_n]

    print('original phrase', similar_phrase)
    for index, _score in best_matches:
        # strategy, short_phrases and long_phrases share the same 0..n-1 index
        print('strategy:', strategy[index])
        print(label, corpus[index])
        print()
    print()

In [10]:
# Print the best matches and their strategies for each example query.
for phrase in phrase_list:
    find_similar(phrase)

original phrase Marriage fighting
strategy: Focus on the good things. Let the bad things slide.
matching short phrase: Marriage

strategy: prayer
matching short phrase: Marriage

strategy: Seeking marriage counseling
matching short phrase: Marriage

strategy: Taking a drive is my solution.
matching short phrase: Marriage

strategy: listening soft music
matching short phrase: marriage


original phrase I'm not sure what to do
strategy: Work more
matching long phrase: Not enough to do what I want to

strategy: I try to stick to a schedule in order too get tasks done in a timely manner
matching long phrase: I often feel like I don't have enough time to get everything done  

strategy: Go anywhere to be alone.
matching long phrase: He's aways thinking i'mdoing things that i'm don't need to be doing.

strategy: Try to anticipate what's going to be required of me beforehand so I'm ready for it, and always remember to be malleable
matching long phrase: there is always something going on here,