# Replication Results - Final Analysis
Let's take a closer look at the AutoPhrase results.

In [None]:
from os import listdir
import pandas as pd
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
import re
from gensim.models import Word2Vec
import pickle

# Silence all warnings (mismatched library versions in this environment emit noisy FutureWarnings)
import warnings
warnings.filterwarnings("ignore") 

In [None]:
# Root of the project's data folder, relative to this notebook.
data_dir = '../data/'
# Folder holding the AutoPhrase output files that are parsed below.
out_file_dir = '../data/out/AutoPhrase_Result/'

In [None]:
# Parse the AutoPhrase multi-word output into a (phrase, score) table.
# Each line is read as "<score> <word> <word> ...": the first whitespace-separated
# token is the quality score, the remaining tokens form the phrase.
phrases = []
scores = []
for f in listdir(out_file_dir):
    if 'multi-words' not in f:
        continue
    # The context manager closes the file even if a line fails to parse.
    with open(out_file_dir + f, 'r') as multi_words:
        for line in multi_words:
            line_txt = line.split()
            if not line_txt:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            scores.append(float(line_txt[0]))
            phrases.append(' '.join(line_txt[1:]))
    # Only the first matching file is used.
    break
all_scores = pd.DataFrame({'phrase': phrases, 'score': scores})

## 1. Randomly draw 100 multi-word phrases and manually check if they are high-quality phrases
- Show the result after manual checking.
- If the phrase is high-quality, its label is 1, otherwise 0.

In [None]:
# Load the 100 sampled phrases together with their manual labels
# (the `phrase`, `score` and `Label` columns are used by later cells).
selected_phrases = pd.read_csv('../references/sample_scores.csv')
selected_phrases

#### After manual checking, we found 80 high-quality phrases among the 100 randomly drawn phrases.
### The percentage of high-quality phrases is 80%.

## 2. Plot the Precision-recall Curve for 100 random multi-word phrases in task 1

In [None]:
# Ground truth: the manual 0/1 labels. Ranking signal: the phrase score column.
y_true = selected_phrases['Label'].values
y_scores = selected_phrases['score'].values

# Precision and recall at each candidate score threshold.
precision, recall, thresholds = precision_recall_curve(y_true, y_scores)

In [None]:
# Precision-recall curve of the phrase scores against the manual labels.
fig, ax = plt.subplots(figsize=(6,6))
# scalex/scaley=False skips autoscaling, so both axes keep the default 0-1 view.
# The label names what is actually plotted (AutoPhrase scores, not a logistic regression).
ax.plot(recall, precision, label='AutoPhrase score', scalex=False, scaley=False)
ax.set_title('Precision-recall Curve')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
# Draw the legend so the curve label is actually visible.
ax.legend();

## 3. Use segmentation results & word2vec
- The segmentation results wrap each phrase in `<phrase>...</phrase>` tags, so we preprocess the data before training our model
    - Example: `"... <phrase>support vector machine</phrase> ..."` --> `"... support_vector_machine ..."`
- We obtain phrase embedding by word2vec

In [None]:
# Load a previously saved gensim Word2Vec model from disk.
# NOTE(review): presumably trained on the underscore-joined segmentation output described above; confirm.
model = Word2Vec.load("../data/report/word2vec.model")

## 4. Find similar multi-word phrases
- Randomly pick 3 high-quality phrases (label is 1) from the previous annotations
- Run a similarity search among all multi-word phrases whose scores are greater than 0.5
- Report the top-5 results 

In [None]:
# Randomly select 3 phrases that were manually labelled high-quality (Label == 1).
# A fixed random_state keeps the selection identical across Restart & Run All.
selected_high_quality_phrases = (
    selected_phrases.query('Label == 1').sample(3, random_state=42).phrase.values
)
# Join the words of each phrase with '_' (e.g. "support vector machine" -> "support_vector_machine").
selected_high_quality_phrases = ['_'.join(i.split()) for i in selected_high_quality_phrases]
selected_high_quality_phrases

In [None]:
# selected_high_quality_phrases = [
#      'stock_exchanges',
#      'public_service',
#      'supply_uncertainty'
#  ]

In [None]:
from scipy import spatial

def calc_similarity(phrase1, phrase2, model):
    """Return the cosine similarity between the embeddings of two phrases.

    Parameters
    ----------
    phrase1, phrase2 : str
        Tokens to look up in the embedding vocabulary.
    model : object
        Either an object exposing its vectors through a ``wv`` attribute
        (e.g. a gensim ``Word2Vec`` model) or anything directly indexable by
        token (e.g. ``KeyedVectors`` or a plain dict of token -> vector).

    Returns
    -------
    float or int
        ``1 - cosine_distance`` of the two vectors. Returns 0 when the two
        phrases are identical, or when either phrase has no embedding.
    """
    if phrase1 == phrase2:
        return 0
    # gensim >= 4 no longer supports `model[token]`; the vectors live on `model.wv`.
    # Falling back to `model` itself keeps KeyedVectors / dict-like inputs working.
    vectors = getattr(model, 'wv', model)
    try:
        vect_1 = vectors[phrase1]
        vect_2 = vectors[phrase2]
    except KeyError:
        # Out-of-vocabulary phrase: treat as "not similar" instead of failing.
        return 0
    return 1 - spatial.distance.cosine(vect_1, vect_2)

In [None]:
# Candidate pool: every multi-word phrase whose score is above 0.5.
df_high_score_phrases = all_scores.query('score > 0.5').reset_index(drop = True)

# Join words with '_' so candidates use the same token form as the query phrases.
df_high_score_phrases.phrase = df_high_score_phrases.phrase.map(
    lambda text: '_'.join(text.split())
)

# Placeholder column; it is overwritten for every query phrase in the loop.
df_high_score_phrases['similarity'] = 0

all_similar_phrases = []
for query_phrase in selected_high_quality_phrases:
    # Score every candidate against the current query phrase.
    similarity_to_query = df_high_score_phrases.phrase.apply(
        lambda candidate: calc_similarity(candidate, query_phrase, model)
    )
    df_high_score_phrases.similarity = similarity_to_query

    # Keep the five candidates with the highest similarity.
    ranked = df_high_score_phrases.sort_values('similarity', ascending = False)
    top_5 = ranked.phrase.values[:5]
    all_similar_phrases.append({
        'key': query_phrase,
        'similar_phrases': top_5
    })

In [None]:
# pd.DataFrame(all_similar_phrases)
# all_similar_phrases

In [None]:
# Build the column headers (`words`) and column values (`similars_lst`) for the
# results table, turning underscore-joined tokens back into readable phrases.
similars_lst = []
words = []

for entry in all_similar_phrases:
    words.append(entry['key'].replace('_', ' '))
    # Build a new list instead of assigning into entry['similar_phrases'], so the
    # arrays stored in all_similar_phrases are not silently modified by this cell.
    similars_lst.append([sim.replace('_', ' ') for sim in entry['similar_phrases']])

In [None]:
# One column per query phrase, one row per similarity rank (most similar first).
# zip(*...) transposes however many phrase lists exist instead of hard-coding three.
pd.DataFrame(list(zip(*similars_lst)), columns=words)

### Through manual checking, we think the top-5 similar multi-word phrases are correct: the similar phrases and the selected phrases are in the same domain and are related.