## In this notebook

- We construct a combined ranking based on the top 1000 terms and their ranks
    - For now, we select the top terms (or nearest terms) from across all keyterm lists, after deduplication of course

- We store the phrase2idx and phrase2vector mapping as a pickle

- We construct and save a nearest neighbor object to help find the closest phrases

In [24]:
# Development helper: hot-reload the local helper modules after editing them on disk.
# The modules are imported here first so the cell also works on a fresh kernel;
# importlib.reload() can only be applied to a module that is already bound to a name.
import importlib

import phrase_extraction
import phrase_filters

importlib.reload(phrase_filters)
importlib.reload(phrase_extraction)

<module 'phrase_extraction' from '/home/nino/GetGabby/notebooks/phrase_extraction.py'>

In [1]:
import pandas as pd

In [2]:
# Load the review spreadsheet; its first column becomes the DataFrame index.
df = pd.read_excel("../data/test_data.xlsx", index_col=0)

In [3]:
import phrase_extraction
import phrase_filters

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Rebinds `df` to the featurized frame, so the raw frame is no longer reachable
# under this name. NOTE(review): presumably this is where the `contents` and
# `review_tokens` columns shown by df.head() come from - confirm in phrase_extraction.
df = phrase_extraction.featurization(df)

In [5]:
df.head()

Unnamed: 0,title,review,review_date,will_recommend,contents,review_tokens
0,Simply the best,Outstanding picture color and brightness. I ch...,"April 8, 2022",Yes,Simply the best. Outstanding picture color and...,"[simply, the, best, ., outstanding, picture, c..."
1,"65"" Bravia XR A90J Smart TV & JBL 501 Soundbar","Excellent installation job of my 65"" Sony XR A...","October 30, 2021",Yes,"65"" Bravia XR A90J Smart TV & JBL 501 Soundbar...","[65, "", bravia, xr, a90j, smart, tv, &, jbl, 5..."
2,The Best of All,Best color of any that TV I have see. Even the...,"December 12, 2021",Yes,The Best of All. Best color of any that TV I h...,"[the, best, of, all, ., best, color, of, any, ..."
3,"A JAW DROPPING, STUNNING MASTERPIECE!",I've owned several oleds from LG and Sony and ...,"May 11, 2021",Yes,"A JAW DROPPING, STUNNING MASTERPIECE!. I've ow...","[a, jaw, dropping, ,, stunning, masterpiece, !..."
4,Absolutely AMAZING!,Ive been an Oled fan since they became availab...,"April 22, 2022",Yes,Absolutely AMAZING!. Ive been an Oled fan sinc...,"[absolutely, amazing, !, ., i, ve, been, an, o..."


## Let's get the keyword lists from each algorithm

In [6]:
# Run every key-term extractor on the same featurized frame, one result per algorithm.
# Later cells treat each result as a DataFrame with `phrase` and `score` columns
# (they filter on `phrase` and sort on `score`).
phrases_mi = phrase_extraction.keyterm_extraction_mutual_information(df)
phrases_tfidf = phrase_extraction.keyterm_extraction_tfidf(df)
phrases_freq = phrase_extraction.keyterm_extraction_frequency(df)
phrases_yake = phrase_extraction.keyterm_extraction_yake(df)
phrases_trank = phrase_extraction.keyterm_extraction_textrank(df)
phrases_scake = phrase_extraction.keyterm_extraction_scake(df)
phrases_ent_nc = phrase_extraction.keyterm_extraction_entities_and_noun_chunks(df)

### Let's run the filters selectively

In [7]:
# Brand / model tokens passed to the brand-model phrase filter in the cells below.
# NOTE(review): 'a09' looks like a transposition of the "A90" model that appears in
# the review titles (e.g. "Bravia XR A90J") - confirm before relying on this filter.
brand_model_terms = ['Sony', 'LG', 'Bravia', 'a09','xr']

In [8]:
# Mutual-information phrases: stop-word, punctuation and brand/model filters, in that order.
# In this notebook this is the only list that is also passed through the punctuation filter.
phrases_mi = phrase_filters.filter_phrases_containing_stopwords(phrases_mi)
phrases_mi = phrase_filters.filter_phrases_containing_punctuation(phrases_mi)
phrases_mi = phrase_filters.filter_phrases_containing_brand_model_terms(phrases_mi, brand_model_terms)
phrases_mi.shape

  return df[ ~df['phrase'].str.contains(pattern, case=False)]


(4388, 2)

In [9]:
# TF-IDF phrases: stop-word filter, then brand/model filter (no punctuation filter).
phrases_tfidf = phrase_filters.filter_phrases_containing_stopwords(phrases_tfidf)
phrases_tfidf = phrase_filters.filter_phrases_containing_brand_model_terms(phrases_tfidf, brand_model_terms)
phrases_tfidf.shape

(6209, 2)

In [10]:
# Frequency phrases: stop-word filter, then brand/model filter (no punctuation filter).
# NOTE(review): the recorded shape equals the TF-IDF one (6209, 2) - presumably both
# extractors rank the same candidate phrases; verify in phrase_extraction.
phrases_freq = phrase_filters.filter_phrases_containing_stopwords(phrases_freq)
phrases_freq = phrase_filters.filter_phrases_containing_brand_model_terms(phrases_freq, brand_model_terms)
phrases_freq.shape

  return df[ ~df['phrase'].str.contains(pattern, case=False)]


(6209, 2)

In [11]:
# Entity / noun-chunk phrases: stop-word filter, then brand/model filter (no punctuation filter).
phrases_ent_nc = phrase_filters.filter_phrases_containing_stopwords(phrases_ent_nc)
phrases_ent_nc = phrase_filters.filter_phrases_containing_brand_model_terms(phrases_ent_nc, brand_model_terms)
phrases_ent_nc.shape

  return df[ ~df['phrase'].str.contains(pattern, case=False)]


(1424, 2)

In [12]:
# sCAKE, YAKE and TextRank phrases only get the brand/model filter here;
# no stop-word or punctuation filtering is applied to these three lists.
phrases_scake = phrase_filters.filter_phrases_containing_brand_model_terms(phrases_scake, brand_model_terms)
phrases_yake = phrase_filters.filter_phrases_containing_brand_model_terms(phrases_yake, brand_model_terms)
phrases_trank = phrase_filters.filter_phrases_containing_brand_model_terms(phrases_trank, brand_model_terms)

  return df[ ~df['phrase'].str.contains(pattern, case=False)]


In [13]:
# Row/column counts left after filtering, in the order sCAKE, YAKE, TextRank.
[frame.shape for frame in (phrases_scake, phrases_yake, phrases_trank)]

[(884, 2), (958, 2), (881, 2)]

## Creating indexes and vector stores for phrases

In [15]:
import spacy
# `nlp` is the pipeline used below to embed each phrase via `nlp(phrase).vector`;
# the recorded query vector has shape (96,).
# NOTE(review): the small English model presumably ships without static word vectors,
# so these 96-dim vectors would be the pipeline's fallback representation - confirm
# against the spaCy model documentation before relying on similarity quality.
nlp = spacy.load('en_core_web_sm')

In [16]:
import pickle

In [54]:
def generate_top_keyterms_data_structures(df, save_file_prefix, topk=1000, vectorize=None):
    """Build and persist lookup structures for the ``topk`` highest-scoring phrases.

    Rows are ordered by ``score`` (descending), repeated phrases are collapsed to
    their best-scoring row, and the first ``topk`` survivors are kept.
    Deduplicating before indexing keeps the ``phrase2idx`` values contiguous
    (``0 .. n-1``) and aligned with the insertion order of ``phrase2vector``, so
    row ``i`` of ``np.vstack(list(phrase2vector.values()))`` belongs to the phrase
    whose index is ``i``. Without it, a repeated phrase would shrink both dicts
    and shift every later row relative to its stored index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``phrase`` column and a ``score`` column.
    save_file_prefix : str
        Path prefix for the three files written:
        ``<prefix>_phrase2idx.pkl``, ``<prefix>_phrase2vector.pkl`` and
        ``<prefix>_df.pkl`` (the selected rows, via ``DataFrame.to_pickle``).
    topk : int, default 1000
        Maximum number of distinct phrases to keep.
    vectorize : callable, optional
        Maps a phrase string to its vector. When omitted, the notebook-level
        spaCy pipeline ``nlp`` is used (``nlp(phrase).vector``).

    Returns
    -------
    tuple(dict, dict)
        ``(phrase2idx, phrase2vector)`` for the selected phrases.
    """
    if vectorize is None:
        # Resolved at call time so the function picks up whatever `nlp` the
        # notebook has loaded when it is invoked.
        def vectorize(phrase):
            return nlp(phrase).vector

    topkdf = (
        df.sort_values('score', ascending=False)
        .drop_duplicates(subset='phrase', keep='first')
        .head(topk)
    )
    phrases = topkdf['phrase'].tolist()

    phrase2idx = {phrase: position for position, phrase in enumerate(phrases)}
    phrase2vector = {phrase: vectorize(phrase) for phrase in phrases}

    with open(save_file_prefix + '_phrase2idx.pkl', 'wb') as outf:
        pickle.dump(phrase2idx, outf)

    with open(save_file_prefix + '_phrase2vector.pkl', 'wb') as outf:
        pickle.dump(phrase2vector, outf)

    topkdf.to_pickle(save_file_prefix + '_df.pkl')

    return phrase2idx, phrase2vector

In [23]:
# Build and save the top-k structures for the mutual-information list.
# NOTE(review): the file prefix here is 'phrase_mi' (singular) while the bulk cell
# further down uses 'phrases_*'; the two returned names also differ in style
# (phrase_mi2idx vs phrases_mi2vec). Keep this in mind when loading the saved files.
phrase_mi2idx, phrases_mi2vec = generate_top_keyterms_data_structures(phrases_mi, 'phrase_mi')

In [28]:
# Build and save the top-k structures for the TF-IDF list.
# NOTE(review): prefix is 'phrase_tfidf' (singular), unlike the 'phrases_*' prefixes
# used for the remaining lists.
phrase_tfidf2idx, phrases_tfidf2vec = generate_top_keyterms_data_structures(phrases_tfidf, 'phrase_tfidf')

In [53]:
# Persist the top-k structures for the remaining extractors. Only the files written
# to disk are needed, so the returned dicts are discarded. They are deliberately not
# bound to `_` / `__`, because assigning those names shadows IPython's
# output-history variables for the rest of the session.
for keyterm_df, file_prefix in [
    (phrases_freq, 'phrases_freq'),
    (phrases_yake, 'phrases_yake'),
    (phrases_scake, 'phrases_scake'),
    (phrases_trank, 'phrases_trank'),
    (phrases_ent_nc, 'phrases_ent_nc'),
]:
    generate_top_keyterms_data_structures(keyterm_df, file_prefix)

## Nearest Neighbor searches for keyterms

In [29]:
from sklearn.neighbors import NearestNeighbors

In [30]:
import numpy as np

In [31]:
# Fit a 5-neighbour index over the MI phrase vectors. Rows of the stacked matrix follow
# the insertion order of phrases_mi2vec, so neighbour indices refer to that order.
phrase_mi_NN = NearestNeighbors(n_neighbors=5).fit(np.vstack(list(phrases_mi2vec.values())))

In [32]:
# Fit a 5-neighbour index over the TF-IDF phrase vectors. Rows of the stacked matrix follow
# the insertion order of phrases_tfidf2vec, so neighbour indices refer to that order.
phrase_tfidf_NN = NearestNeighbors(n_neighbors=5).fit(np.vstack(list(phrases_tfidf2vec.values())))

In [34]:
phrases_mi.head()

Unnamed: 0,phrase,score
0,4 k,29.823529
2,dolby vision,15.810277
3,picture quality,13.313034
7,itv hub,7.363636
8,master series,7.062069


In [48]:
qvec = phrases_mi2vec['master series']

In [49]:
qvec.shape

(96,)

In [50]:
# Cross-list lookup: a vector taken from the MI list is used to query the TF-IDF index.
# kneighbors expects a 2-D array, hence the reshape of the single query vector to (1, -1).
dists, idxs = phrase_tfidf_NN.kneighbors(qvec.reshape((1, -1)))
idxs

array([[706, 568, 529, 638, 671]])

In [51]:
# Reverse lookup (index -> phrase) for turning neighbour indices back into TF-IDF phrases.
idx2phrases_tfidf = dict(zip(phrase_tfidf2idx.values(), phrase_tfidf2idx.keys()))

In [52]:
[idx2phrases_tfidf[i] for i in idxs.reshape(-1)]
    

['inch master series',
 'resolution netflix',
 'netflix picture',
 'panasonic plasma',
 'service award']

### Build main data struct

In [None]:
# TODO: load the saved data structures (the *_phrase2idx.pkl, *_phrase2vector.pkl and
# *_df.pkl files written earlier) instead of relying on in-memory variables.
# For now, collect the filtered per-algorithm phrase frames in one list.
algo_dfs = [phrases_mi, phrases_tfidf, phrases_freq, phrases_yake, phrases_scake, phrases_trank, phrases_ent_nc]