<a href="https://colab.research.google.com/github/arjasc5231/cs470-team21/blob/main/summarize_relevance_1129.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so this Colab runtime can access its files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
### Libraries added for analysing the Amazon dataset
import json
import pickle
import numpy as np

In [None]:
def combine_subtokens(tokens, relevance_score):
    '''
    Merge WordPiece sub-tokens ("play", "##ing") back into whole words.

    Each output word appears exactly once, in its original position, and
    carries the mean relevance of the pieces it was built from.  Special
    tokens ([CLS], [SEP], [MASK], [UNK], [PAD]) are passed through unchanged
    and end the word that precedes them.  The input sequences are not
    modified.

    Parameters
    -------------------------------------------
    tokens: sequence of str
        tokenizer output for one text

    relevance_score: sequence of numbers
        one relevance value per token; entries beyond len(tokens) are ignored

    Returns
    -------------------------------------------
    word: list of str
        the merged words (special tokens included)

    score: numpy array
        relevance of each entry of ``word`` (mean over its sub-tokens)

    Raises
    -------------------------------------------
    ValueError
        if there are fewer relevance values than tokens
    '''
    special_tokens = {'[CLS]', '[SEP]', '[MASK]', '[UNK]', '[PAD]'}

    if len(relevance_score) < len(tokens):
        raise ValueError(
            f"expected at least {len(tokens)} relevance scores, got {len(relevance_score)}"
        )

    words, totals, counts = [], [], []
    # True while the last entry of `words` is a normal word that a following
    # "##" piece may still extend.
    word_open = False

    for piece, relevance in zip(tokens, relevance_score):
        if piece in special_tokens:
            words.append(piece)
            totals.append(relevance)
            counts.append(1)
            word_open = False
        elif piece.startswith('##') and len(piece) > 2:
            # Continuation piece.  A bare '#' (or '##') is a punctuation token,
            # not a continuation, so it falls through to the branch below.
            if word_open:
                words[-1] += piece[2:]
                totals[-1] += relevance
                counts[-1] += 1
            # A continuation with no word to attach to (e.g. right after a
            # special token) is dropped.
        else:
            words.append(piece)
            totals.append(relevance)
            counts.append(1)
            word_open = True

    # average the relevance scores of the sub-tokens of each word
    score = [total / count for total, count in zip(totals, counts)]

    return words, np.array(score)


In [None]:
from operator import pos
from tqdm import tqdm
def _min_max_normalize(values):
    '''Scale a 1-D numpy array linearly into [0, 1]; an all-equal array maps to zeros.'''
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Every entry is identical: dividing would produce 0/0 = NaN and
        # poison every total the values are later added to.
        return np.zeros(len(values))
    return (values - lowest) / span


def _average_frequent_words(score_map, occur_map, review_count, occurence_rate):
    '''Drop words seen in too few reviews from score_map and average the rest, in place.'''
    threshold = review_count * occurence_rate
    for token, occurence in occur_map.items():
        if occurence < threshold:
            del score_map[token]
        else:
            score_map[token] /= occurence


def sum_relevance_score(records, occurence_rate=1, rating_weight=1, near=1, rationale = 0.99):
    '''
    Aggregate per-word relevance scores over many reviews.

    Reviews are split by rating: ``(rating - 3) * rating_weight >= 0`` goes to
    the positive group, everything else to the negative group.  Within a
    review, sub-tokens are merged into words, both explanation vectors are
    min-max normalised to [0, 1], and each word's score is weighted by the
    (signed) rating term before being accumulated into its group.

    Parameters
    -------------------------------------------
    records: numpy
        [tokens, rating, product, output(0,1), neg_expl, pos_expl,
         true_class(0,1), pred_class(0,1)]
        Only indices 0, 1, 4 and 5 are read here.  Records whose tokens do
        not start with '[CLS]' and end with '[SEP]' are skipped, but they are
        still counted in the positive/negative review totals.

    occurence_rate: float
        a word is kept only if it occurs in at least
        ``occurence_rate * (number of reviews in its group)`` reviews

    rating_weight: float
        weight for rating

    near: int
        if negative, every word accumulates its own score (averaged over its
        occurrences inside the review).  If >= 0, a word's score is instead
        handed to the other words at indices ``[i - near, i + near)`` and the
        word itself receives nothing from its own position.

    rationale:
        unused; kept for backward compatibility

    Returns
    -------------------------------------------
    pos_score_map, neg_score_map: dictionary {'word': score}
        mean score per word over the reviews of the group containing it,
        restricted to words that pass the occurrence threshold

    pos_occur_map, neg_occur_map: dictionary {'word': count}
        number of reviews of the group in which the word occurs (unfiltered)

    pos_word, neg_word: dictionary {'word': count}
        how often the word was the highest-scoring entry of a review
    '''
    pos_score_map, neg_score_map = {}, {}
    pos_occur_map, neg_occur_map = {}, {}
    pos_word, neg_word = {}, {}

    special_tokens = ('[CLS]', '[SEP]', '[MASK]', '[UNK]', '[PAD]')

    pos_num, neg_num = 0, 0

    for record in tqdm(records):

        rate = (record[1] - 3) * rating_weight
        is_positive = rate >= 0
        if is_positive:
            pos_num += 1
            score_map, occur_map, highest_map = pos_score_map, pos_occur_map, pos_word
        else:
            neg_num += 1
            score_map, occur_map, highest_map = neg_score_map, neg_occur_map, neg_word

        raw_tokens = record[0]
        if len(raw_tokens) == 0 or raw_tokens[0] != '[CLS]' or raw_tokens[-1] != '[SEP]':
            continue

        # combine_subtokens gets fresh lists so the record itself is never altered
        word, negative_score = combine_subtokens(list(raw_tokens), list(record[4]))
        word, positive_score = combine_subtokens(list(raw_tokens), list(record[5]))

        # normalize to range 0~1
        positive_score = _min_max_normalize(positive_score)
        negative_score = _min_max_normalize(negative_score)

        assert len(word) == len(positive_score) == len(negative_score)

        # The explanation matching the review's polarity drives the score;
        # the opposite one is only used (subtracted) in the neighbour mode.
        if is_positive:
            own_score, other_score = positive_score, negative_score
        else:
            own_score, other_score = negative_score, positive_score

        # Computed once per review instead of once per word.
        top_index = np.argmax(own_score)

        # cache: occurrences of each word inside this review
        # score: this review's accumulated score per word (near < 0 mode only)
        cache, score = {}, {}
        for i, current in enumerate(word):

            if current in special_tokens:
                continue

            if current not in occur_map:
                occur_map[current] = 0
                score_map[current] = 0

            if current not in cache:
                # first time this word shows up in the current review
                occur_map[current] += 1
                cache[current] = 0
                score[current] = 0

            cache[current] += 1

            # count the word with the highest relevance score of the review
            if i == top_index:
                highest_map[current] = highest_map.get(current, 0) + 1

            if near >= 0:
                # distribute the relevance score to the surrounding words
                # NOTE(review): the upper bound is exclusive, so the window
                # reaches `near` words to the left but only `near - 1` to the
                # right, and the divisor also counts position i itself.
                # Behaviour kept as-is; confirm whether a symmetric window
                # was intended.
                min_index = max(0, i - near)
                max_index = min(len(word), i + near)
                span = max_index - min_index
                for j in range(min_index, max_index):
                    neighbour = word[j]

                    # skip the word itself and built-in tokens
                    if j == i or neighbour in special_tokens:
                        continue

                    if neighbour not in occur_map:
                        occur_map[neighbour] = 0
                        score_map[neighbour] = 0

                    score_map[neighbour] += rate * own_score[i] / span
                    score_map[neighbour] -= rate * other_score[i] / span
            else:
                score[current] += rate * own_score[i]

        # add this review's per-word average to the running totals
        for tok, num in cache.items():
            score_map[tok] += score[tok] / num

    print(f"[sum_relevance_score] positive reviews: {pos_num}, negative reviews: {neg_num}")

    # delete words with less occurence, average the remaining ones
    _average_frequent_words(pos_score_map, pos_occur_map, pos_num, occurence_rate)
    _average_frequent_words(neg_score_map, neg_occur_map, neg_num, occurence_rate)

    return pos_score_map, neg_score_map, pos_occur_map, neg_occur_map, pos_word, neg_word

In [None]:
# Load the explanation records analysed by the following cells.
# NOTE: allow_pickle=True makes np.load unpickle object arrays, which can execute
# arbitrary code -- only load files from a trusted source.
book_records_binary = np.load('/content/drive/MyDrive/CS470_team_2in1/colab/explanation/balanced_only1000model/amazon_book_expl-transformer_attribution-balanced_regression_only5000.npy', allow_pickle=True)

In [None]:
# Aggregate word relevance over every loaded record; near=-1 makes each word keep its own score.
(pos_map, neg_map,
 pos_occur_map, neg_occur_map,
 pos_word, neg_word) = sum_relevance_score(
    book_records_binary, occurence_rate=0.01, rating_weight=1, near=-1)

# Save the 20 highest-scoring entries of the positive score map.
top_positive = sorted(pos_map.items(), key=lambda pair: pair[1], reverse=True)[:20]
np.save(f"/content/drive/MyDrive/CS470_team_2in1/colab/summarize/record_all", top_positive)

# Print each positive/negative pair of maps, largest value first,
# with a blank separator in front of every pair.
for limit, (positive, negative) in (
    (20, (pos_map, neg_map)),
    (50, (pos_occur_map, neg_occur_map)),
    (50, (pos_word, neg_word)),
):
    print("\n")
    for mapping in (positive, negative):
        print(sorted(mapping.items(), key=lambda pair: pair[1], reverse=True)[:limit])

100%|██████████| 5000/5000 [00:14<00:00, 352.66it/s]


[sum_relevance_score] positive reviews: 4398, negative reviews: 602


[('appalachian', 1.4964379239567358), ('bill', 0.9154671192555716), ('island', 0.7592160584541027), ('australian', 0.7546424165673082), ('local', 0.7391679271369712), ('ms', 0.7348293989712344), ('southern', 0.7330291703108752), ('rollins', 0.7197133592259561), ('americans', 0.7042976901605044), ('continent', 0.7006952267642037), ('country', 0.6914773223636017), ('!', 0.6885477217888775), ('this', 0.6749706002282332), ('trail', 0.6722960539221633), ('thank', 0.6603914788019147), ('south', 0.6603898556550765), ('sal', 0.6511935246525873), ('our', 0.6491117750878516), ('bones', 0.6459678574310012), ('sisters', 0.6435838545478827)]
[('held', -0.047813241707357575), ('ago', -0.06049271221234306), ('eventually', -0.06359609385412059), ('fascinating', -0.06572562809976543), ('bring', -0.06804536871536565), ('able', -0.07149980057307953), ('fully', -0.07607188244945719), ('until', -0.0777788713637395), ('hoped', -0.07975481

In [None]:
# Count how many records exist for each product id (field 2 of a record).
product = {}
for record in book_records_binary:
    product_id = record[2]
    product[product_id] = product.get(product_id, 0) + 1

print(len(product))
print(product)

96
{'0060755334': 256, '0060751967': 100, '0060740221': 867, '0060761288': 116, '0060760060': 1, '0060758716': 7, '006073731X': 26, '0060747293': 12, '006076032X': 1, '0060763426': 42, '0060763760': 9, '0060758317': 2, '0060763868': 2, '0060756632': 13, '0060762489': 1, '0060753854': 5, '0060759836': 296, '0060745053': 67, '0060755504': 12, '0060761334': 48, '0060748125': 88, '0060760958': 64, '0060763515': 13, '0060763620': 63, '0060760885': 223, '0060760257': 48, '0060757612': 29, '0060761857': 46, '0060763116': 24, '006076287X': 43, '0060765410': 12, '0060760281': 10, '006076208X': 34, '0060762519': 1, '0060745258': 7, '0060755342': 15, '0060754001': 10, '0060758465': 1, '0060761881': 2, '0060760664': 9, '0060753455': 1, '0060761784': 27, '0060751002': 25, '0060739428': 113, '0060763876': 452, '0060765127': 7, '0060759682': 7, '0060765712': 116, '0060765402': 50, '0060759704': 5, '0060756047': 6, '0060765364': 37, '0060765704': 6, '0060764961': 11, '0060763272': 47, '0060755229': 24

In [None]:
product_name = '0002051850'

# Keep only the records whose product id (field 2) matches product_name.
record_0001384198 = []
for record in book_records_binary:
    if record[2] != product_name:
        continue
    record_0001384198.append(record)

(pos_map, neg_map,
 pos_occur_map, neg_occur_map,
 pos_word, neg_word) = sum_relevance_score(
    record_0001384198, occurence_rate=0.01, rating_weight=1, near=-1)

# Save the 20 highest-scoring entries of the positive score map for this product.
top_positive = sorted(pos_map.items(), key=lambda pair: pair[1], reverse=True)[:20]
np.save(f"/content/drive/MyDrive/CS470_team_2in1/colab/summarize/record_{product_name}", top_positive)

# Print the top 20 of each positive/negative pair of maps, largest value first,
# with a blank separator in front of every pair.
for positive, negative in (
    (pos_map, neg_map),
    (pos_occur_map, neg_occur_map),
    (pos_word, neg_word),
):
    print("\n")
    for mapping in (positive, negative):
        print(sorted(mapping.items(), key=lambda pair: pair[1], reverse=True)[:20])

0it [00:00, ?it/s]

[sum_relevance_score] positive reviews: 0, negative reviews: 0







[]
[]


[]
[]


[]
[]


In [None]:
# Load a second set of explanation records (a regression model's, judging by the file name).
# NOTE: allow_pickle=True makes np.load unpickle object arrays, which can execute
# arbitrary code -- only load files from a trusted source.
book_records_regression = np.load('/content/drive/MyDrive/CS470_team_2in1/colab/explanation/amazon_book_expl-transformer_attribution-regression_only1000.npy', allow_pickle=True)

FileNotFoundError: ignored

In [None]:
# Aggregate word relevance over the regression records, using the neighbour mode (near=3).
(pos_map, neg_map,
 pos_occur_map, neg_occur_map,
 pos_word, neg_word) = sum_relevance_score(
    book_records_regression, occurence_rate=0.01, rating_weight=1, near=3)

# Print the top 20 of each positive/negative pair of maps, largest value first,
# with a blank separator in front of every pair.
for positive, negative in (
    (pos_map, neg_map),
    (pos_occur_map, neg_occur_map),
    (pos_word, neg_word),
):
    print("\n")
    for mapping in (positive, negative):
        print(sorted(mapping.items(), key=lambda pair: pair[1], reverse=True)[:20])