# Replication Results - Final Analysis
Let's take a closer look at AutoPhrase's results.

In [None]:
from os import listdir
import pandas as pd
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
import re
from gensim.models import Word2Vec
import pickle

# Suppress all warnings (mostly FutureWarnings caused by differing package versions in the environment)
import warnings
warnings.filterwarnings("ignore") 

In [None]:
# Root folder of the project data, and the sub-folder that holds the
# AutoPhrase output files read below.
data_dir = '../data/'
out_file_dir = data_dir + 'out/AutoPhrase_Result/'

In [None]:
# Load AutoPhrase's multi-word phrase list into a DataFrame with one row per
# phrase. Each non-empty line is parsed as "<score> <word> <word> ...": the
# first whitespace-separated token is the score, the rest is the phrase.
phrases = []
scores = []
for f in listdir(out_file_dir):
    if 'multi-words' not in f:
        continue
    # A context manager guarantees the file handle is closed after reading.
    with open(out_file_dir + f, 'r') as multi_words:
        for line in multi_words:
            line_txt = line.split()
            if not line_txt:
                # Skip blank lines rather than failing on line_txt[0].
                continue
            scores.append(float(line_txt[0]))
            phrases.append(' '.join(line_txt[1:]))
    # Only the first file whose name contains 'multi-words' is read.
    break
all_scores = pd.DataFrame({'phrase': phrases, 'score': scores})

## 1. Randomly draw 100 multi-word phrases and manually check if they are high-quality phrases
- show the result after manually checking. 
- If the phrase is high-quality, its label is 1; otherwise it is 0.

In [None]:
# Read the manually labelled sample of phrases; later cells use its
# `phrase`, `score` and `Label` columns.
sample_scores_path = '../data/report/report_files/sample_scores.csv'
selected_phrases = pd.read_csv(sample_scores_path)
selected_phrases

#### After manual checking, we found 80 high-quality phrases among the 100 randomly drawn phrases.
### The percentage of high-quality phrases is 80%.

## 2. Plot the Precision-recall Curve for 100 random multi-word phrases in task 1

In [None]:
# Manual labels (1 = high-quality, 0 = not) are the ground truth and the
# phrase scores are the ranking signal for the precision-recall curve.
y_true = selected_phrases['Label'].values
y_scores = selected_phrases['score'].values
precision, recall, thresholds = precision_recall_curve(
    y_true,
    y_scores,
)

In [None]:
# Draw precision against recall on a square figure.
# scalex/scaley=False keep the axes at their default limits instead of
# autoscaling them to the plotted data.
# NOTE(review): the line label says 'Logistic Regression', but the scores
# plotted here come from the sample score file, not from a logistic
# regression visible in this notebook. No legend is drawn, so the label is
# not displayed -- confirm before adding one.
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(
    recall,
    precision,
    label='Logistic Regression',
    scalex=False,
    scaley=False,
)
ax.set(title='Precision-recall Curve', xlabel='Recall', ylabel='Precision');

## 3. Use segmentation results & word2vec
- The segmentation results have `<phrase>` tags around phrases, so we perform data preprocessing before training our model
    - Example: `"... <phrase>support vector machine</phrase> ..."` --> `"... support_vector_machine ..."`
- We obtain phrase embedding by word2vec

In [None]:
# Load the previously saved word2vec model from disk rather than retraining it.
word2vec_model_path = "../data/report/report_files/word2vec.model"
model = Word2Vec.load(word2vec_model_path)

## 4. Find similar multi-word phrases
- Randomly pick 3 high-quality phrases (label is 1) from our previous annotations
- Run a similarity search among all multi-word phrases whose scores are greater than 0.5
- Report the top-5 results 

In [None]:
# Randomly pick 3 distinct high-quality phrases (Label == 1) that have an
# embedding in the word2vec model.
#
# The annotated phrases are space-separated, while the model vocabulary joins
# the words of a phrase with '_', so each candidate is converted first.
n_selected = 3

# dict.fromkeys de-duplicates while keeping the original row order.
candidate_phrases = list(dict.fromkeys(
    '_'.join(phrase.split())
    for phrase in selected_phrases.query('Label == 1').phrase.values
))

# Keep only phrases the model knows. An explicit vocabulary check replaces the
# old `try: model[ph] / except:` probe, which swallowed every exception and
# could loop forever when fewer than 3 phrases were available.
in_vocab_phrases = [ph for ph in candidate_phrases if ph in model.wv]

if len(in_vocab_phrases) < n_selected:
    raise ValueError(
        'Need at least %d high-quality phrases present in the word2vec '
        'vocabulary, but only found %d.' % (n_selected, len(in_vocab_phrases))
    )

# Sample without replacement so the selected phrases are distinct.
selected_high_quality_phrases = (
    pd.Series(in_vocab_phrases).sample(n=n_selected).tolist()
)
selected_high_quality_phrases

In [None]:
# For every selected phrase, collect up to 5 of its nearest neighbours in the
# embedding space, restricted to multi-word phrases whose score is above 0.5.
#
# Only the 100 most similar vocabulary entries are examined per phrase, so a
# phrase can end up with fewer than 5 results.
df_high_score_phrases = all_scores.query('score > 0.5').reset_index(drop = True)

# Build the lookup set once: membership tests are O(1) instead of scanning the
# whole phrase column for every candidate.
high_score_phrase_set = set(df_high_score_phrases.phrase.values)

all_similar_phrases = []
for ph in selected_high_quality_phrases:
    # Query through `model.wv`; calling most_similar on the model itself is
    # deprecated in gensim.
    similar_pairs = model.wv.most_similar(positive=[ph], topn=100)
    top_5 = []
    for similar_ph, _similarity in similar_pairs:
        # Vocabulary entries use '_' between words; scored phrases use spaces.
        candidate = similar_ph.replace('_', ' ')
        if candidate in high_score_phrase_set:
            top_5.append(candidate)
            if len(top_5) >= 5:
                break
    all_similar_phrases.append({
        'key': ph,
        'similar_phrases': top_5
    })

In [None]:
# Turn the search results into table-friendly pieces:
#   words        -- the query phrases, with '_' replaced by spaces (column names)
#   similars_lst -- for each query phrase, its similar phrases with '_'
#                   replaced by spaces (column values)
# New lists are built instead of editing the lists stored in
# all_similar_phrases, and no loop index is reused between nested loops.
words = []
similars_lst = []
for entry in all_similar_phrases:
    words.append(entry['key'].replace('_', ' '))
    similars_lst.append(
        [sim.replace('_', ' ') for sim in entry['similar_phrases']]
    )

In [None]:
# One column per query phrase, one row per rank among its similar phrases.
# zip(*...) handles any number of query phrases instead of hard-coding three;
# as before, rows are cut off at the length of the shortest result list.
pd.DataFrame(list(zip(*similars_lst)), columns=words)

### After manual checking, we think the top 5 similar multi-word phrases are correct. The similar phrases and the selected phrases are in the same domain and are related.