In [15]:
# Setup
%matplotlib inline
%load_ext autoreload
%autoreload 2
import warnings
import spacy
import pickle
import myUtils
import os
import csv
import matplotlib.pyplot as plt
from myUtils import *
from models.utils import *

# Seed PyTorch's random number generator with a fixed value.
# NOTE(review): `torch` is not imported by name in this cell — presumably it is
# re-exported by one of the star imports; confirm.
SEED = 84
torch.manual_seed(SEED)
# Suppress every warning for the rest of the session.
warnings.simplefilter("ignore")
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
# Experiment configuration.
# dataset_name can be one of: sentiment / offensive / corona
dataset_name = 'corona'
model_type = 'tinybert'
model_name = 'huawei-noah/TinyBERT_General_4L_312D'
sorting = 'polarity'
# Directory that holds the pickled inputs and receives the outputs of this notebook.
folder_name = f'results/{dataset_name}/{sorting}'

In [17]:
# Load the TorchScript model stored for this model type / dataset and move it to `device`.
model = torch.jit.load(f'models/{model_type}/{dataset_name}/traced.pt').to(device)
# Switch the model to evaluation mode.
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Expose both objects as attributes of myUtils — presumably so that its helpers
# (e.g. predict_sentences) can reach them; confirm against myUtils.
myUtils.model = model
myUtils.tokenizer = tokenizer

In [19]:
# Re-create the tokenizer with use_fast=False.
# NOTE(review): this cell only rebinds the notebook-level name `tokenizer`; it does not
# assign `myUtils.tokenizer`, so that attribute keeps whatever object it held before — confirm
# this is intended.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast = False)

In [4]:
# spaCy English pipeline; its tokenizer is what get_normal_occurences uses to split sentences.
nlp = spacy.load('en_core_web_sm')

# Loading Results

In [5]:
# Load the pickled explanations and the sentences they were computed for.
# Context managers make sure the file handles are closed right after reading.
# NOTE: pickle.load can execute arbitrary code — only load files produced by a trusted run.
with open(f"{folder_name}/exps_list.pickle", "rb") as f:
    explanations = pickle.load(f)
with open(f"{folder_name}/anchor_examples.pickle", "rb") as f:
    anchor_examples = pickle.load(f)

In [6]:
# Sanity check: report how many sentences and how many explanations were loaded.
print(f'anchor examples len: {len(anchor_examples)}')
print(f'explanations len: {len(explanations)}')

anchor examples len: 3744
explanations len: 68928


In [7]:
# May be needed with newer versions of transformers (left disabled):
#torch._C._jit_set_texpr_fuser_enabled(False)

# Model predictions for every anchor example are cached on disk: they are computed
# only when the cache file does not exist yet, and always read back from the file.
predictions_path = f"{folder_name}/predictions.pickle"
if not os.path.exists(predictions_path):
    predictions = [predict_sentences([tokenizer.tokenize(anchor_example)])[0] for anchor_example in anchor_examples]
    # Close the handle explicitly so the data is flushed before it is read again below.
    with open(predictions_path, "wb") as f:
        pickle.dump(predictions, f)

# NOTE: pickle.load can execute arbitrary code — only load files produced by a trusted run.
with open(predictions_path, "rb") as f:
    labels = pickle.load(f)

In [8]:
from functools import reduce
# Keep every anchor whose precision is at least 0.95. A sentence may contribute several anchors, but each word is counted at most once per sentence.
def get_best(explanations, min_precision=0.95):
    """Return the high-precision explanations, de-duplicated per sentence and word.

    An explanation is kept when its ``precision`` is at least ``min_precision`` and no
    earlier kept explanation has the same ``index`` together with the same first anchor
    name (``names[0]``).

    Args:
        explanations: iterable of objects exposing ``precision``, ``index`` and ``names``.
        min_precision: minimum precision an explanation needs to be kept.

    Returns:
        A new list of the kept explanations, grouped by ``index`` in order of first
        appearance. An empty list is returned when nothing qualifies.

    Side effects:
        Prints the number of distinct ``index`` values that kept at least one explanation.
    """
    best_exps = dict()
    # (index, word) pairs that were already kept, for constant-time duplicate checks.
    seen = set()
    for exp in explanations:
        if exp.precision < min_precision:
            continue
        key = (exp.index, exp.names[0])
        # The word already appeared as an anchor for this sentence.
        if key in seen:
            continue
        seen.add(key)
        best_exps.setdefault(exp.index, []).append(exp)
    print(len(best_exps))
    # Flatten in a single pass; unlike reduce() without an initial value this also
    # works when no explanation passed the threshold.
    return [exp for group in best_exps.values() for exp in group]

In [9]:
from collections import Counter

def get_anchor_occurences(explanations):
    """Count how often each word is the first anchor name of an explanation.

    Args:
        explanations: iterable of objects exposing ``names``; only ``names[0]`` is read.

    Returns:
        A ``Counter`` mapping ``names[0]`` to the number of explanations carrying it.
    """
    return Counter(exp.names[0] for exp in explanations)

def get_normal_occurences(sentences, anchor_occurences):
    """Count token occurrences in ``sentences`` that are not accounted for as anchors.

    Every sentence is split with the notebook-level ``nlp.tokenizer`` and the token
    texts are tallied. The anchor count of every word in ``anchor_occurences`` is then
    subtracted, so a word can end up with a zero or negative count.

    Args:
        sentences: iterable of sentences accepted by ``nlp.tokenizer``.
        anchor_occurences: mapping from word to the number of times it was an anchor.

    Returns:
        A ``Counter`` mapping each word to its remaining occurrence count.
    """
    counts = Counter(
        token.text
        for sentence in sentences
        for token in nlp.tokenizer(sentence)
    )

    # Remove the occurrences in which the word served as an anchor.
    for word, times_as_anchor in anchor_occurences.items():
        counts[word] -= times_as_anchor

    return counts

def smooth_before(normal_occurences, anchor_occurences_list):
    """Apply add-one smoothing, in place, over the vocabulary of ``normal_occurences``.

    Each word present in ``normal_occurences`` gets its count increased by one there
    and in every counter of ``anchor_occurences_list``.

    Args:
        normal_occurences: counter whose keys define the vocabulary; modified in place.
        anchor_occurences_list: counters to smooth over the same vocabulary; each is
            modified in place.
    """
    for word in normal_occurences:
        normal_occurences[word] += 1

    for counts in anchor_occurences_list:
        for word in normal_occurences:
            counts[word] += 1

def smooth_after(teta1, type_occurences):
    """Clean up the scores in ``teta1`` in place after add-one smoothing.

    Words whose count in ``type_occurences`` is at most 1 are removed. If the smallest
    remaining score is negative, every score is shifted by that minimum and the
    shifted scores are divided by their sum.

    Nothing further is done when no word remains. When all shifted scores are zero they
    are left at zero, because dividing by their sum is impossible.

    Args:
        teta1: dict mapping word to score; modified in place.
        type_occurences: mapping from word to its (smoothed) occurrence count.
    """
    # Drop the words that only exist because of the +1 added before scoring.
    for word in list(teta1.keys()):
        if type_occurences[word] <= 1:
            del teta1[word]

    # min() of an empty sequence raises ValueError, so stop here.
    if not teta1:
        return

    min_val = min(teta1.values())
    if min_val < 0:
        for w in teta1:
            teta1[w] -= min_val
        sum_val = sum(teta1.values())
        # All scores equalled the minimum: they are now 0 and cannot be normalised.
        if sum_val == 0:
            return
        for w in teta1:
            teta1[w] = teta1[w] / sum_val

In [10]:
def calculate_teta0(normal_occurences):
    """Turn occurrence counts into relative frequencies.

    Args:
        normal_occurences: mapping from word to occurrence count.

    Returns:
        A dict mapping each word to its count divided by the sum of all counts.
    """
    total = sum(normal_occurences.values())
    return {word: count / total for word, count in normal_occurences.items()}

def calculate_teta1(anchor_occurences, teta0, alpha):
    """Compute a per-word score from anchor frequencies and a baseline distribution.

    For each word the score is
    ``(count / total_count - (1 - alpha) * teta0[word]) / alpha``.

    Args:
        anchor_occurences: mapping from word to the number of times it was an anchor.
        teta0: mapping from word to its baseline frequency; must contain every word of
            ``anchor_occurences``.
        alpha: weight used in the formula above.

    Returns:
        A dict mapping each word of ``anchor_occurences`` to its score.
    """
    total = sum(anchor_occurences.values())
    return {
        word: (count / total - (1 - alpha) * teta0[word]) / alpha
        for word, count in anchor_occurences.items()
    }

In [11]:
def calculate_scores():
    """Score anchor words for several alpha values and export the tables to Excel.

    Uses the notebook-level ``explanations``, ``labels``, ``anchor_examples`` and
    ``folder_name``. The explanations kept by ``get_best`` are split by predicted label
    (label 0 goes to the ``pos`` group, label 1 to the ``neg`` group). For each alpha, a
    score table is built for each group, sorted by descending score, and all tables are
    written side by side into ``Sheet1`` of ``{folder_name}/scores.xlsx``.

    The workbook is written through a context manager, so it is saved and closed even
    if an error occurs while writing. ``ExcelWriter.save()`` is not used because it is
    no longer available in pandas 2.x.
    """
    alphas = [0.95, 0.8, 0.65, 0.5]
    dfs = []
    columns = ['name', 'anchor score', 'type occurences', 'total occurences','+%', '-%', 'both', 'normal']

    exps = get_best(explanations)
    pos_exps = [exp for exp in exps if labels[exp.index]==0]
    neg_exps = [exp for exp in exps if labels[exp.index]==1]

    anchor_occurences = get_anchor_occurences(exps)
    pos_occurences = get_anchor_occurences(pos_exps)
    neg_occurences = get_anchor_occurences(neg_exps)

    normal_occurences = get_normal_occurences(anchor_examples, anchor_occurences)
    # Add-one smoothing; the "- 1" terms below undo it for the reported counts.
    smooth_before(normal_occurences, [pos_occurences, neg_occurences])

    teta0 = calculate_teta0(normal_occurences)

    def build_frame(teta, type_occurences):
        """Build the score table of one group, sorted by descending score."""
        rows = []
        for anchor, score in teta.items():
            pos_percent = round((pos_occurences[anchor]-1)/anchor_occurences[anchor], 2)
            neg_percent = 1-pos_percent
            both = (pos_occurences[anchor]-1)>0 and (neg_occurences[anchor]-1)>0
            rows.append([anchor, score, type_occurences[anchor]-1, anchor_occurences[anchor],
                         pos_percent, neg_percent, both, normal_occurences[anchor]-1])
        rows.sort(key=lambda row: -row[1])
        return pd.DataFrame(data = rows, columns = columns).set_index('name')

    for alpha in alphas:
        teta_pos = calculate_teta1(pos_occurences, teta0, alpha)
        smooth_after(teta_pos, pos_occurences)

        teta_neg = calculate_teta1(neg_occurences, teta0, alpha)
        smooth_after(teta_neg, neg_occurences)

        dfs.extend([build_frame(teta_pos, pos_occurences), build_frame(teta_neg, neg_occurences)])

    with pd.ExcelWriter(f'{folder_name}/scores.xlsx', engine='xlsxwriter') as writer:
        # Create the sheet up front so the caption cells can be written next to the tables.
        workbook = writer.book
        worksheet = workbook.add_worksheet('Sheet1')
        writer.sheets['Sheet1'] = worksheet

        cur_col = 0
        # NOTE(review): the tables alternate [pos, neg] per alpha, but the captions start
        # with 'negative', so the label-0 table gets the '<alpha>-negative' caption.
        # Presumably this is deliberate (label 0 being the negative class) — confirm
        # before renaming anything; compare_loss and present_alpha_monitor read these captions.
        is_positive = False

        # Every alpha produced two tables, hence each alpha is repeated twice.
        for df, alpha in zip(dfs, np.repeat(alphas, 2)):
            cur_type = 'positive' if is_positive else 'negative'
            is_positive = not is_positive
            worksheet.write(0, cur_col, f'{alpha}-{cur_type}')
            df.to_excel(writer, sheet_name='Sheet1', startrow=1, startcol=cur_col)
            # Leave one empty column between consecutive tables.
            cur_col += len(columns) + 1

In [12]:
# Compute the scores and write {folder_name}/scores.xlsx; the printed number comes from get_best.
calculate_scores()

2156


In [13]:
def compare_loss(path1, path2):
    """Measure how much the top-ranked entries of two score workbooks overlap.

    Both workbooks are read with ``pd.read_excel`` and their row labelled 0 is dropped
    (presumably the per-table column-name row written below the captions — confirm).
    For each alpha and each of the ``'<alpha>-positive'`` / ``'<alpha>-negative'``
    columns, the first 25 values of both workbooks are compared as sets.

    Args:
        path1: path of the first workbook.
        path2: path of the second workbook.

    Returns:
        A DataFrame indexed by ``alpha`` with columns ``pos`` and ``neg``, each holding
        the size of the intersection divided by 25.
    """
    top = 25
    alphas = [0.95, 0.8, 0.65, 0.5]

    def overlap(first, second):
        # Share of the `top` leading values that both columns have in common.
        leading_first = set(first.head(top).tolist())
        leading_second = set(second.head(top).tolist())
        return len(leading_first & leading_second) / top

    scores_a = pd.read_excel(path1).drop(0)
    scores_b = pd.read_excel(path2).drop(0)

    rows = [
        [
            alpha,
            overlap(scores_a[f'{alpha}-positive'], scores_b[f'{alpha}-positive']),
            overlap(scores_a[f'{alpha}-negative'], scores_b[f'{alpha}-negative']),
        ]
        for alpha in alphas
    ]

    return pd.DataFrame(data = rows, columns = ['alpha', 'pos', 'neg']).set_index('alpha')

In [14]:
# NOTE(review): if folder_name is still f'results/{dataset_name}/{sorting}', both arguments
# resolve to the same file, so this compares scores.xlsx with itself — confirm which
# second workbook was meant.
compare_loss(f'{folder_name}/scores.xlsx', f'results/{dataset_name}/{sorting}/scores.xlsx')

FileNotFoundError: [Errno 2] No such file or directory: 'results/corona/polarity/scores.xlsx'

In [None]:
def present_alpha_monitor():
    """Plot, per alpha, how much of the final top-25 each monitor snapshot contains.

    Reads ``scores.xlsx`` plus ``pos_monitor.csv``, ``neg_monitor.csv`` and
    ``time_monitor.csv`` from the notebook-level ``folder_name``. For every alpha, the
    first 25 entries of the ``'<alpha>-positive'`` / ``'<alpha>-negative'`` columns are
    compared with each row of the matching monitor file, and the shared fraction is
    plotted against the first value of the corresponding ``time_monitor.csv`` row.
    One figure is shown per alpha.
    """
    top = 25
    df = pd.read_excel(f'{folder_name}/scores.xlsx').drop(0).head(top)

    with open(f'{folder_name}/pos_monitor.csv', "r") as f:
        pos_lines = list(csv.reader(f))

    with open(f'{folder_name}/neg_monitor.csv', "r") as f:
        neg_lines = list(csv.reader(f))

    with open(f'{folder_name}/time_monitor.csv', "r") as f:
        time_lines = [float(line[0]) for line in csv.reader(f)]

    alphas = [0.95, 0.8, 0.65, 0.5]
    # One independent dict per alpha; dict.fromkeys would make every alpha share a
    # single dict instance.
    results = {alpha: {'pos': [], 'neg': []} for alpha in alphas}

    for alpha in alphas:
        top_pos = set(df[f'{alpha}-positive'].to_list())
        top_neg = set(df[f'{alpha}-negative'].to_list())

        results[alpha]['pos'] = [len(top_pos.intersection(set(line)))/top for line in pos_lines]
        results[alpha]['neg'] = [len(top_neg.intersection(set(line)))/top for line in neg_lines]

        plt.plot(time_lines, results[alpha]['pos'], label = 'positive')
        plt.plot(time_lines, results[alpha]['neg'], label = 'negative')
        plt.xlabel('time (minutes)')
        plt.ylabel('percent')

        plt.title(alpha)
        plt.legend()
        plt.show()
    

In [None]:
# Show one overlap-over-time figure per alpha.
present_alpha_monitor()

In [None]:
def present_group_monitor():
    """Plot how close each monitor snapshot is to the last snapshot of the same file.

    Reads ``pos_monitor.csv``, ``neg_monitor.csv`` and ``time_monitor.csv`` from the
    notebook-level ``folder_name``. The last row of each monitor file is the reference
    set; for every row, the number of shared entries divided by 25 is plotted against
    the first value of the corresponding ``time_monitor.csv`` row.
    """
    top = 25

    def read_rows(path):
        # All rows of a CSV file, each as a list of strings.
        with open(path, "r") as handle:
            return list(csv.reader(handle))

    pos_lines = read_rows(f'{folder_name}/pos_monitor.csv')
    neg_lines = read_rows(f'{folder_name}/neg_monitor.csv')
    time_lines = [float(row[0]) for row in read_rows(f'{folder_name}/time_monitor.csv')]

    # The last snapshot of each file serves as the reference set.
    final_pos = set(pos_lines[-1])
    final_neg = set(neg_lines[-1])

    results = {
        'pos': [len(final_pos & set(row)) / top for row in pos_lines],
        'neg': [len(final_neg & set(row)) / top for row in neg_lines],
    }

    for key, label in (('pos', 'positive'), ('neg', 'negative')):
        plt.plot(time_lines, results[key], label = label)

    plt.xlabel('time (minutes)')
    plt.ylabel('percent')
    plt.legend()
    plt.show()
    

In [None]:
# Show the overlap of every snapshot with the last snapshot.
present_group_monitor()

In [None]:
from collections import defaultdict
def get_lines(path):
    """Read a CSV file and return all of its rows.

    Args:
        path: path of the CSV file.

    Returns:
        A list with one list of strings per row.
    """
    with open(path, "r") as handle:
        return list(csv.reader(handle))
    
def present_deltas_monitor(deltas):
    """Compare the monitor snapshots of several deltas with the default delta (0.1).

    The reference sets are the last rows of ``pos_monitor.csv`` / ``neg_monitor.csv``
    in the ``0.1`` sub-folder of the notebook-level ``folder_name``. For the default
    run and for every value in ``deltas``, the number of entries each snapshot shares
    with the reference set, divided by 25, is plotted against the first value of the
    corresponding ``time_monitor.csv`` row. Positive and negative results go to two
    stacked subplots.

    Args:
        deltas: iterable of delta values; each must name a sub-folder of ``folder_name``
            holding the three monitor files. May be empty, in which case only the
            default curves are drawn.
    """
    top = 25

    default_pos_lines = get_lines(f'{folder_name}/{0.1}/pos_monitor.csv')
    default_neg_lines = get_lines(f'{folder_name}/{0.1}/neg_monitor.csv')
    default_time_lines = [float(line[0]) for line in get_lines(f'{folder_name}/{0.1}/time_monitor.csv')]
    default_results = dict()

    # Final top-k of the default run: the reference every run is compared against.
    top_pos = set(default_pos_lines[-1])
    top_neg = set(default_neg_lines[-1])

    default_results['pos'] = [len(top_pos.intersection(set(line)))/top for line in default_pos_lines]
    default_results['neg'] = [len(top_neg.intersection(set(line)))/top for line in default_neg_lines]

    results = defaultdict(dict)

    for delta in deltas:
        pos_lines = get_lines(f'{folder_name}/{delta}/pos_monitor.csv')
        neg_lines = get_lines(f'{folder_name}/{delta}/neg_monitor.csv')
        results['time'][delta] = [float(line[0]) for line in get_lines(f'{folder_name}/{delta}/time_monitor.csv')]
        results['pos'][delta] = [len(top_pos.intersection(set(line)))/top for line in pos_lines]
        results['neg'][delta] = [len(top_neg.intersection(set(line)))/top for line in neg_lines]

    fig, axs = plt.subplots(2, 1, figsize=(15, 18))
    axs[0].plot(default_time_lines, default_results['pos'], label = 'default (0.1)')
    # Each subplot shows every delta (see the legend), so the title names only the
    # class instead of whichever delta the loop above happened to end on.
    axs[0].set_title('positive')
    axs[1].plot(default_time_lines, default_results['neg'], label = 'default (0.1)')
    axs[1].set_title('negative')

    for delta in deltas:
        axs[0].plot(results['time'][delta] , results['pos'][delta], label = str(delta))
        axs[1].plot(results['time'][delta] , results['neg'][delta], label = str(delta))

    for ax in axs.flat:
        ax.set(xlabel='time (minutes)', ylabel='percent')
        ax.legend()

In [None]:
# Deltas to compare against the default run; each needs its own sub-folder under folder_name.
deltas = [0.20, 0.35, 0.5]
present_deltas_monitor(deltas)