### Even after trimming the sentiment-aspect pairs, we are still left with about 39k pairs. Many of them are semantically near-identical, e.g. "good beer" and "fantastic product".

### We use our GloVe model (trained on the review corpus) to identify those semantically similar pairs, cluster them together, and add up their frequencies.

### This leaves us with 14080 semantically distinct sentiment-aspect pairs.

In [1]:
import pickle
# Load the candidate bigrams and the bigram -> value mapping from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("aspect_freq/bigram_unique_50_in_vocab.pickle", "rb") as unique_fh:
    bgrams = pickle.load(unique_fh)
with open("aspect_freq/bigram_kw_50.pickle", "rb") as kw_fh:
    bgrams_kw = pickle.load(kw_fh)
# Notebook display: number of bigrams and the smallest value stored in bgrams_kw.
len(bgrams), min(bgrams_kw.values())

(39553, 50)

In [2]:
# Sanity check: count the bigrams that have no key in bgrams_kw, then release
# the mapping (it is not referenced again in the cells that follow).
unmatched = set(bgrams).difference(bgrams_kw)
print(len(unmatched))
del bgrams_kw

0


In [3]:
from gensim.models import KeyedVectors

In [4]:
# Load the saved embedding model from disk.
# NOTE(review): the `.wv` accesses (here and in similar_words) imply the loaded
# object wraps a KeyedVectors instance rather than being a bare KeyedVectors --
# confirm against how the file was saved.
gl_model = KeyedVectors.load("finetuned/glove.840B.300D.ft.model")
# Set built from the model's key list, for membership checks.
model_vocab = set(gl_model.wv.index_to_key)

In [5]:
# Split every "<sentiment> <aspect>" bigram into its two words, keeping the
# two lists position-aligned with `bgrams`.
sentiments, aspects = [], []
for bigram in bgrams:
    # Unpacking raises ValueError unless the bigram has exactly two tokens.
    sentiment_word, aspect_word = bigram.split()
    sentiments.append(sentiment_word)
    aspects.append(aspect_word)

# Notebook display: both lists hold one entry per bigram.
len(sentiments), len(aspects)

(39553, 39553)

In [6]:
from collections import Counter
# Collapse the word lists into {word: count} dicts whose insertion order runs
# from the most to the least frequent word.
sentiment_counts = Counter(sentiments)
aspect_counts = Counter(aspects)
sentiments = dict(sentiment_counts.most_common())
aspects = dict(aspect_counts.most_common())

# Notebook display: number of distinct sentiment and aspect words.
len(sentiments), len(aspects)

(2361, 2484)

In [7]:
from collections import defaultdict
# Index the bigrams by aspect: aspect word -> list of sentiment words that
# appear with it, in `bgrams` order.
sent_asp_dict = defaultdict(list)
for sentiment_word, aspect_word in map(str.split, bgrams):
    sent_asp_dict[aspect_word].append(sentiment_word)

In [8]:
def similar_words(word, threshold):
    """Return the nearest neighbours of *word* that meet a similarity cut-off.

    Queries the 40 most similar entries from ``gl_model`` and keeps, in the
    order returned, those whose similarity score (second tuple element) is at
    least *threshold*.
    """
    neighbours = gl_model.wv.most_similar(word, topn=40)
    return [entry for entry in neighbours if entry[1] >= threshold]


def get_pooled(words: list[str], thres=0.30):
    """Greedily group *words* into clusters of embedding-similar words.

    Words are visited in the order given. A word that has not yet been
    assigned becomes the head of a new cluster; every still-unassigned word
    returned by ``similar_words(head, threshold=thres)`` joins that cluster.
    Progress is printed for every visited word and every addition.

    Returns a ``(clusters, remaining)`` tuple: ``clusters`` maps each head to
    its member list (head first), and ``remaining`` is the set of words left
    unassigned when the loop ends.
    """
    remaining = set(words)
    clusters = {}
    for position, head in enumerate(words):
        print(f"[*] Checking {position}: {head}")
        if not remaining:
            # Everything has been assigned; nothing left to cluster.
            break
        if head not in remaining:
            # Already absorbed into an earlier cluster.
            continue
        members = [head]
        clusters[head] = members
        print(f"\t[~] Adding: {head}")
        for neighbour in similar_words(head, threshold=thres):
            candidate = neighbour[0]
            if candidate in remaining:
                print(f"\t[~] Adding: {neighbour}")
                members.append(candidate)
                remaining.remove(candidate)
        remaining.remove(head)
    return clusters, remaining


def get_pooled_aspect(aspects: list[str], thres=0.30):
    """Cluster aspect words by delegating to ``get_pooled``.

    *thres* is forwarded as the similarity cut-off. Its default of 0.30 equals
    ``get_pooled``'s own default, so calls that omit it behave as before.

    Returns whatever ``get_pooled`` returns: a ``(clusters, remaining)`` tuple.
    """
    return get_pooled(aspects, thres=thres)

In [9]:
# Cluster the aspect words with a 0.60 similarity cut-off, visiting them in
# the insertion order of the `aspects` dict.
aspect_cluster, wrd_set = get_pooled(list(aspects), thres=0.60)
# Notebook display: number of aspect clusters and number of unassigned words.
len(aspect_cluster), len(wrd_set)

[*] Checking 0: beer
	[~] Adding: beer
	[~] Adding: ('brew', 0.8390510082244873)
	[~] Adding: ('offering', 0.6275783777236938)
	[~] Adding: ('product', 0.6044486165046692)
[*] Checking 1: head
	[~] Adding: head
	[~] Adding: ('foam', 0.7261395454406738)
	[~] Adding: ('froth', 0.6203668117523193)
[*] Checking 2: flavor
	[~] Adding: flavor
	[~] Adding: ('taste', 0.8560091853141785)
	[~] Adding: ('flavour', 0.7245732545852661)
	[~] Adding: ('character', 0.6585783362388611)
	[~] Adding: ('flavoring', 0.6328192949295044)
	[~] Adding: ('undertone', 0.6086916923522949)
	[~] Adding: ('presence', 0.6015477776527405)
[*] Checking 3: aroma
	[~] Adding: aroma
	[~] Adding: ('nose', 0.8760867714881897)
	[~] Adding: ('aromas', 0.7983217239379883)
	[~] Adding: ('smell', 0.6830050349235535)
	[~] Adding: ('scent', 0.6718092560768127)
[*] Checking 4: taste
[*] Checking 5: malt
	[~] Adding: malt
	[~] Adding: ('maltiness', 0.7825539708137512)
	[~] Adding: ('malty', 0.7434696555137634)
	[~] Adding: ('grain',

(1763, 0)

In [10]:
aspect_cluster

{'beer': ['beer', 'brew', 'offering', 'product'],
 'head': ['head', 'foam', 'froth'],
 'flavor': ['flavor',
  'taste',
  'flavour',
  'character',
  'flavoring',
  'undertone',
  'presence'],
 'aroma': ['aroma', 'nose', 'aromas', 'smell', 'scent'],
 'malt': ['malt', 'maltiness', 'malty', 'grain'],
 'hop': ['hop', 'hoppiness', 'hopping'],
 'note': ['note',
  'component',
  'element',
  'twang',
  'tang',
  'touch',
  'undercurrent'],
 'finish': ['finish', 'aftertaste', 'ending', 'finishing', 'finsih', 'end'],
 'body': ['body', 'bodied'],
 'bitterness': ['bitterness',
  'bittering',
  'bite',
  'bitter',
  'dryness',
  'astringency',
  'biterness',
  'profile',
  'acidity',
  'sweetness'],
 'ale': ['ale', 'ale-'],
 'color': ['color', 'hue', 'colour'],
 'carbonation': ['carbonation',
  'carb',
  'effervescence',
  'co2',
  'carbo',
  'carbination'],
 'bottle': ['bottle', 'bomber'],
 'fruit': ['fruit', 'fruitiness', 'fruity'],
 'mouthfeel': ['mouthfeel', 'feel', 'palate', 'texture', 'mouth

In [11]:
def get_pooled_sentiments(aspect_cluster: dict[str, list[str]]):
    """Cluster the sentiment words attached to each aspect cluster.

    For every ``head -> member aspects`` entry, the sentiment words recorded in
    ``sent_asp_dict`` for all member aspects are concatenated and clustered
    with ``get_pooled`` at a 0.50 similarity cut-off. Progress is printed per
    aspect cluster.

    Returns a dict mapping each aspect-cluster head to the sentiment clusters
    produced for it (the first element of ``get_pooled``'s result).
    """
    clusters_by_aspect = {}
    for head_aspect, member_aspects in aspect_cluster.items():
        print(f"[*] Examining Cluster: {head_aspect}")
        candidate_sentiments = [
            sentiment
            for member in member_aspects
            for sentiment in sent_asp_dict[member]
        ]
        sentiment_clusters, _ = get_pooled(candidate_sentiments, thres=0.50)
        clusters_by_aspect[head_aspect] = sentiment_clusters
        print("===*===", f"Processed Cluster {head_aspect}".center(40), "===*===")
    return clusters_by_aspect

In [12]:
# For each aspect cluster, cluster the sentiment words that occur with any of
# its member aspects; result maps aspect head -> {sentiment head -> members}.
sent_sub_cluster = get_pooled_sentiments(aspect_cluster)

[*] Examining Cluster: beer
[*] Checking 0: good
	[~] Adding: good
	[~] Adding: ('decent', 0.7850549221038818)
	[~] Adding: ('great', 0.7837615609169006)
	[~] Adding: ('excellent', 0.7233175039291382)
	[~] Adding: ('solid', 0.7029665112495422)
	[~] Adding: ('nice', 0.6893203258514404)
	[~] Adding: ('terrific', 0.5866356492042542)
	[~] Adding: ('ok', 0.5792714357376099)
	[~] Adding: ('okay', 0.5772217512130737)
	[~] Adding: ('exceptional', 0.570579469203949)
	[~] Adding: ('outstanding', 0.569678783416748)
	[~] Adding: ('fantastic', 0.5661064386367798)
	[~] Adding: ('impressive', 0.5593836307525635)
	[~] Adding: ('reasonable', 0.5559771656990051)
	[~] Adding: ('superb', 0.5555644631385803)
	[~] Adding: ('alright', 0.54973304271698)
	[~] Adding: ('fair', 0.5407900810241699)
	[~] Adding: ('perfect', 0.5390875935554504)
	[~] Adding: ('awesome', 0.5378066301345825)
	[~] Adding: ('fine', 0.5371630191802979)
	[~] Adding: ('average', 0.5285423398017883)
	[~] Adding: ('respectable', 0.5168969035

In [13]:
# Expand every (sentiment cluster, aspect cluster) combination into concrete
# bigrams and keep only those present in `bgrams`. Each group is keyed by
# "<sentiment head> <aspect head>".
bgrams_ = set(bgrams)
final_pool = {}
for key_asp, sentiment_clusters in sent_sub_cluster.items():
    print(f"[*] Examining key aspect: {key_asp}")
    member_aspects = aspect_cluster[key_asp]
    for key_sent, member_sentiments in sentiment_clusters.items():
        label = f"{key_sent} {key_asp}"
        print(f"\t[*] Examining key pair: '{label}'")
        for sentiment_word in member_sentiments:
            for aspect_word in member_aspects:
                candidate = f"{sentiment_word} {aspect_word}"
                if candidate not in bgrams_:
                    continue
                print(f"\t\t[~] Added {candidate}")
                # A key is created only once it has at least one real bigram.
                final_pool.setdefault(label, []).append(candidate)

[*] Examining key aspect: beer
	[*] Examining key pair: 'good beer'
		[~] Added good beer
		[~] Added good brew
		[~] Added good offering
		[~] Added good product
		[~] Added decent beer
		[~] Added decent brew
		[~] Added decent offering
		[~] Added decent product
		[~] Added great beer
		[~] Added great brew
		[~] Added great offering
		[~] Added great product
		[~] Added excellent beer
		[~] Added excellent brew
		[~] Added excellent offering
		[~] Added excellent product
		[~] Added solid beer
		[~] Added solid brew
		[~] Added solid offering
		[~] Added solid product
		[~] Added nice beer
		[~] Added nice brew
		[~] Added nice offering
		[~] Added nice product
		[~] Added terrific beer
		[~] Added terrific brew
		[~] Added terrific offering
		[~] Added ok beer
		[~] Added ok brew
		[~] Added ok offering
		[~] Added okay beer
		[~] Added okay brew
		[~] Added exceptional beer
		[~] Added exceptional brew
		[~] Added exceptional offering
		[~] Added outstanding beer
		[~] Added outs

In [14]:
# Notebook display: number of keys (pooled pair groups) in final_pool.
len(final_pool)

14080

In [15]:
# Verify the grouping is a partition of `bgrams`: removing each grouped pair
# raises KeyError if a pair is duplicated or unknown, and the final assert
# fails if any bigram was never grouped.
sx = set(bgrams)
for grouped_pairs in final_pool.values():
    for grouped_pair in grouped_pairs:
        sx.remove(grouped_pair)
assert not sx

In [16]:
# Persist the "<sentiment head> <aspect head>" -> member bigrams mapping.
with open("aspect_freq/pair_clusters_50.pickle", "wb") as out_file:
    pickle.dump(final_pool, out_file)

In [17]:
from pprint import pprint
# Pretty-print the groups; sort_dicts=False keeps the dict's insertion order
# instead of sorting the keys.
pprint(final_pool, sort_dicts=False)

{'good beer': ['good beer',
               'good brew',
               'good offering',
               'good product',
               'decent beer',
               'decent brew',
               'decent offering',
               'decent product',
               'great beer',
               'great brew',
               'great offering',
               'great product',
               'excellent beer',
               'excellent brew',
               'excellent offering',
               'excellent product',
               'solid beer',
               'solid brew',
               'solid offering',
               'solid product',
               'nice beer',
               'nice brew',
               'nice offering',
               'nice product',
               'terrific beer',
               'terrific brew',
               'terrific offering',
               'ok beer',
               'ok brew',
               'ok offering',
               'okay beer',
               'okay brew',
            