In [1]:
# Setup
%matplotlib inline
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import warnings
import spacy
from modified_anchor import anchor_text
import pickle
from myUtils import *
from transformer.utils import *
from dataset.dataset_loader import *
import datetime
import os

# Seed PyTorch's random number generator so model-side randomness is repeatable.
# NOTE(review): `torch` is not imported explicitly in this cell; presumably it
# arrives through one of the wildcard imports above - confirm.
SEED = 84
torch.manual_seed(SEED)
# Suppress every warning for the rest of the session.
warnings.simplefilter("ignore")

In [2]:
# Use a larger default font size for all matplotlib figures.
plt.rcParams['font.size'] = 20
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
# can be sentiment/spam/offensive
# `dataset_name` is the directory prefix used below for the model checkpoint
# and for every pickled result file.
# NOTE(review): the loader call is hard-coded to the sentiment dataset, so
# changing `dataset_name` alone presumably also requires a different loader - confirm.
dataset_name = 'sentiment'
review_parser, label_parser, ds_train, ds_val, _ = create_sentiment_dataset()

Number of tokens in training samples: 3307
Number of tokens in training labels: 2


In [4]:
# Load the GRU checkpoint stored under the selected dataset's directory,
# then compile the module with TorchScript.
model = load_model('gru' , f'transformer/{dataset_name}/gru.pt', review_parser)
model = torch.jit.script(model)

{'embedding_dim': 100, 'batch_size': 32, 'hidden_dim': 256, 'num_layers': 2, 'dropout': 0.3, 'lr': 5e-05, 'early_stopping': 5, 'output_classes': 2}
VanillaGRU(
  (embedding_layer): Embedding(3307, 100)
  (GRU_layer): GRU(100, 256, num_layers=2, dropout=0.3)
  (dropout_layer): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=2, bias=True)
  (log_softmax): LogSoftmax(dim=1)
)


In [5]:
# spaCy English pipeline; its tokenizer is used by `tokenize` below.
spacy_tokenizer = spacy.load("en_core_web_sm")

In [6]:
# Special token ids: 1 = pad, 2 = sos, 3 = eos
def tokenize(text, max_len):
    """Convert ``text`` into a padded list of vocabulary indices.

    The text is split with the spaCy tokenizer, every token is looked up in
    ``review_parser.vocab.stoi``, and the ids are wrapped as
    ``[sos] + ids + [eos]`` followed by ``max_len - <token count>`` pad ids
    (no padding is added when that difference is not positive).

    Args:
        text: Anything convertible with ``str()``.
        max_len: Padding target, compared against the number of tokens.

    Returns:
        list[int]: The token ids including start, end and pad markers.
    """
    doc = spacy_tokenizer.tokenizer(str(text))

    token_ids = [2]
    token_ids.extend(review_parser.vocab.stoi[token.text] for token in doc)
    token_ids.append(3)
    token_ids.extend([1] * (max_len - len(doc)))

    return token_ids

In [7]:
def predict_sentences(sentences):
    """Return the model's predicted class index for each sentence.

    Inputs with ``len(sentences) // 2 > 100`` are split in half recursively
    and the partial results concatenated, which bounds the size of a single
    batch. Inference runs under ``torch.no_grad()`` so no autograd graph is
    built.

    Args:
        sentences: Sequence of strings (sliceable, supports ``len``).

    Returns:
        numpy.ndarray: One argmax class index per input sentence; an empty
        int64 array when ``sentences`` is empty.
    """
    if len(sentences) == 0:
        # max() below would raise on an empty batch; there is nothing to classify.
        return np.array([], dtype=np.int64)

    half_length = len(sentences) // 2
    if half_length > 100:
        return np.concatenate([
            predict_sentences(sentences[:half_length]),
            predict_sentences(sentences[half_length:]),
        ])

    # The padding target is the *character* length of the longest sentence,
    # not its token count. Kept unchanged on purpose.
    # NOTE(review): changing the padding width could presumably change the
    # model's outputs (and the cached pickles) - confirm before tightening it.
    max_len = max(len(sentence) for sentence in sentences)
    batch = torch.tensor([tokenize(sentence, max_len) for sentence in sentences], device=device)
    # The batch is built as (sentence, position); the model is fed the transpose.
    input_tokens = torch.transpose(batch, 0, 1)

    with torch.no_grad():
        output = model(input_tokens)

    return torch.argmax(output, dim=1).cpu().numpy()

# Anchor Part

In [8]:
# spaCy pipeline handed to the Anchor explainer below.
# NOTE(review): this loads the same "en_core_web_sm" model as `spacy_tokenizer`
# above; the two could presumably share a single instance - confirm.
nlp = spacy.load('en_core_web_sm')

In [9]:
# Build the text explainer from the project's modified Anchor module, with the
# two class names and `use_unk_distribution` disabled.
explainer = anchor_text.AnchorText(nlp, ['positive', 'negative'], use_unk_distribution=False)

# Loading Results

In [10]:
# Load the previously pickled test set, its labels, the raw explanations and
# the sentences that were explained.
# NOTE: pickle.load can execute arbitrary code - only load files produced by
# this project. Context managers make sure every file handle is closed.
with open(f"{dataset_name}/test.pickle", "rb") as pickle_file:
    test = np.array(pickle.load(pickle_file))
with open(f"{dataset_name}/test_labels.pickle", "rb") as pickle_file:
    test_labels = np.array(pickle.load(pickle_file))
with open(f"{dataset_name}/exps_list.pickle", "rb") as pickle_file:
    explanations = pickle.load(pickle_file)
with open(f"{dataset_name}/anchor_examples.pickle", "rb") as pickle_file:
    anchor_examples = pickle.load(pickle_file)

In [11]:
# Sanity check: number of loaded anchor examples.
len(anchor_examples)

2274

In [12]:
# Sanity check: number of loaded explanations.
len(explanations)

27424

In [13]:
# Cache the model's predicted class for every anchor example; the work is
# skipped when the cache file already exists.
if not os.path.exists(f"{dataset_name}/predictions.pickle"):
    # Each example is classified as its own single-sentence batch.
    predictions = [predict_sentences([str(anchor_example)])[0] for anchor_example in anchor_examples]
    # Close the file deterministically so the data is flushed before it is read back.
    with open(f"{dataset_name}/predictions.pickle", "wb") as cache_file:
        pickle.dump(predictions, cache_file)

In [14]:
# Build and cache the ExtendedExplanation wrappers; skipped when the cache
# file already exists. Explanations without any fit examples are dropped.
if not os.path.exists(f"{dataset_name}/extended_exps.pickle"):
    # Each test text is classified as its own single-sentence batch.
    test_predictions = np.array([predict_sentences([text])[0] for text in test])
    explanations = [
        ExtendedExplanation(exp, anchor_examples, test, test_labels, test_predictions, predict_sentences, explainer)
        for exp in explanations
        if len(exp.fit_examples) > 0
    ]
    # Close the file deterministically so the data is flushed before it is read back.
    with open(f"{dataset_name}/extended_exps.pickle", "wb") as cache_file:
        pickle.dump(explanations, cache_file)

In [15]:
# Reload the cached extended explanations and per-example predictions.
# `labels` holds the model's predicted class for each anchor example.
# NOTE: pickle.load can execute arbitrary code - only load files produced by
# this project.
with open(f"{dataset_name}/extended_exps.pickle", "rb") as cache_file:
    explanations = pickle.load(cache_file)
with open(f"{dataset_name}/predictions.pickle", "rb") as cache_file:
    labels = pickle.load(cache_file)

In [16]:
from functools import reduce
def get_best(explanations, min_precision=0.95):
    """Keep the high-precision explanations, one per (sentence, first word).

    Explanations whose ``precision`` is below ``min_precision`` are dropped.
    The rest are grouped by ``exp.index``; within a group only the first
    explanation for a given ``exp.names[0]`` is kept, so a sentence may
    contribute several anchors but each word counts once per sentence.

    The number of distinct ``exp.index`` values that survived is printed.

    Args:
        explanations: Iterable of objects exposing ``precision``, ``index``
            and ``names``.
        min_precision: Minimum precision an explanation needs to be kept.

    Returns:
        list: The kept explanations, grouped by index in first-seen order.
        Empty when nothing passes the threshold.
    """
    best_exps = dict()
    for exp in explanations:
        if exp.precision < min_precision:
            continue
        kept = best_exps.setdefault(exp.index, [])
        # Skip if this word already appeared as an anchor in the same sentence.
        if any(cur_exp.names[0] == exp.names[0] for cur_exp in kept):
            continue
        kept.append(exp)
    print(len(best_exps))
    # Flatten in one pass; unlike reduce() without an initial value this also
    # works when no explanation passed the threshold.
    return [exp for kept in best_exps.values() for exp in kept]

In [17]:
from collections import Counter

def get_anchor_occurences(explanations):
    """Count how often each word is the first anchor name of an explanation.

    Args:
        explanations: Iterable of objects exposing a ``names`` sequence.

    Returns:
        Counter: Maps ``exp.names[0]`` to its number of occurrences.
    """
    return Counter(exp.names[0] for exp in explanations)

def get_normal_occurences(sentences, anchor_occurences):
    """Count token occurrences in ``sentences`` that are not anchor occurrences.

    Every sentence is tokenized with ``review_parser.tokenize`` and the tokens
    are tallied; the anchor count of each word is then subtracted. Counts may
    end up zero or negative and are kept as they are.

    Args:
        sentences: Iterable of sentences to tokenize.
        anchor_occurences: Mapping of word -> number of times it was an anchor.

    Returns:
        Counter: Token count minus anchor count, per word.
    """
    counts = Counter()
    for sentence in sentences:
        counts.update(review_parser.tokenize(sentence))

    # Remove the occurrences where the word acted as an anchor.
    counts.subtract(anchor_occurences)

    return counts

def smooth_before(normal_occurences, anchor_occurences_list):
    """Add-one smoothing, applied in place.

    For every word present in ``normal_occurences`` the count is increased by
    one in ``normal_occurences`` and in each counter of
    ``anchor_occurences_list``.

    Args:
        normal_occurences: Counter whose keys define the vocabulary.
        anchor_occurences_list: List of Counters to smooth over that vocabulary.

    Returns:
        None. All counters are modified in place.
    """
    vocabulary = list(normal_occurences)
    for counts in (normal_occurences, *anchor_occurences_list):
        for word in vocabulary:
            counts[word] += 1

def smooth_after(teta1, type_occurences):
    """Drop smoothing-only words from ``teta1`` and make its scores non-negative.

    Works in place:

    1. Words whose count in ``type_occurences`` is at most 1 are removed; a
       count of 1 can come purely from the add-one smoothing.
    2. If the smallest remaining score is negative, every score is shifted by
       that minimum and the result is normalized to sum to 1. If the shifted
       scores are all zero they are left as zeros.

    An empty ``teta1`` (before or after filtering) is left untouched instead
    of raising.

    Args:
        teta1: Dict word -> score; modified in place.
        type_occurences: Counter word -> smoothed occurrence count.

    Returns:
        None.
    """
    # Remove the words that only exist because of the add-one smoothing.
    for word in list(teta1):
        if type_occurences[word] <= 1:
            del teta1[word]

    if not teta1:
        # Nothing left to rescale; min() would raise on an empty dict.
        return

    min_val = min(teta1.values())
    if min_val < 0:
        for word in teta1:
            teta1[word] -= min_val
        sum_val = sum(teta1.values())
        if sum_val == 0:
            # All scores were equal to the minimum: they are now all zero and
            # cannot be normalized.
            return
        for word in teta1:
            teta1[word] = teta1[word] / sum_val

In [18]:
def calculate_teta0(normal_occurences):
    """Turn occurrence counts into relative frequencies.

    Args:
        normal_occurences: Mapping word -> count.

    Returns:
        dict: word -> count divided by the sum of all counts.
    """
    total = sum(normal_occurences.values())
    return {word: count / total for word, count in normal_occurences.items()}

def calculate_teta1(anchor_occurences, teta0, alpha):
    """Compute the anchor score of every word for a given ``alpha``.

    For each word ``w`` with relative anchor frequency ``p(w)``:

        teta1[w] = (p(w) - (1 - alpha) * teta0[w]) / alpha

    Args:
        anchor_occurences: Mapping word -> anchor count.
        teta0: Mapping word -> background frequency; must contain every word
            of ``anchor_occurences``.
        alpha: Weight used in the formula above.

    Returns:
        dict: word -> score (may be negative).
    """
    total = sum(anchor_occurences.values())
    background_weight = 1 - alpha
    return {
        word: (count / total - background_weight * teta0[word]) / alpha
        for word, count in anchor_occurences.items()
    }

In [19]:
def calculate_scores(alphas=(0.95, 0.85, 0.65, 0.5)):
    """Score every high-precision anchor word and export one table per alpha.

    Uses the notebook-level ``explanations``, ``anchor_examples`` and
    ``dataset_name``. For each alpha a table is built with the rounded anchor
    score, the anchor count, the rounded background frequency and the
    background count of every surviving word, sorted by descending anchor
    score. The tables are written side by side on ``Sheet1`` of
    ``{dataset_name}/formalized_scores.xlsx`` with the alpha value above each.

    The two occurrence columns hold the counts *after* ``smooth_before`` added
    one to them.

    Args:
        alphas: Alpha values to evaluate, one table each.
            NOTE(review): ``calculate_scores_double`` uses 0.8 where this
            default uses 0.85 - confirm which value is intended.

    Returns:
        None. The result is the Excel file.
    """
    alphas = list(alphas)
    columns = ['name', 'anchor score',  'anchor occurences', 'normal score', 'normal occurences']

    exps = get_best(explanations)
    anchor_occurences = get_anchor_occurences(exps)
    normal_occurences = get_normal_occurences(anchor_examples, anchor_occurences)
    smooth_before(normal_occurences, [anchor_occurences])

    teta0 = calculate_teta0(normal_occurences)

    dfs = []
    for alpha in alphas:
        teta1 = calculate_teta1(anchor_occurences, teta0, alpha)
        smooth_after(teta1, anchor_occurences)

        df_list = [
            [anchor, round(score, 5), anchor_occurences[anchor], round(teta0[anchor], 5), normal_occurences[anchor]]
            for anchor, score in teta1.items()
        ]
        # Highest anchor score first.
        df_list.sort(key=lambda row: -row[1])
        dfs.append(pd.DataFrame(data=df_list, columns=columns).set_index('name'))

    # The context manager writes and closes the workbook on exit; the explicit
    # ExcelWriter.save() call is not available in recent pandas versions.
    with pd.ExcelWriter(f'{dataset_name}/formalized_scores.xlsx', engine='xlsxwriter') as writer:
        # Create the sheet up front so the alpha headers can be written next
        # to the tables, and register it for pandas versions that look the
        # sheet up in writer.sheets.
        workbook = writer.book
        worksheet = workbook.add_worksheet('Sheet1')
        writer.sheets['Sheet1'] = worksheet

        cur_col = 0
        for df, alpha in zip(dfs, alphas):
            worksheet.write(0, cur_col, alpha)
            df.to_excel(writer, sheet_name='Sheet1', startrow=1, startcol=cur_col)
            # Table width plus one empty spacer column.
            cur_col += len(columns) + 1

In [20]:
def calculate_scores_double(alphas=(0.95, 0.8, 0.65, 0.5)):
    """Score anchor words separately per predicted class and export to Excel.

    Uses the notebook-level ``explanations``, ``labels``, ``anchor_examples``
    and ``dataset_name``. High-precision explanations are split by the
    predicted label of their sentence (``0`` -> '+', ``1`` -> '-'). For each
    alpha, one row is produced per surviving (word, class) pair, sorted by
    descending score, and the tables are written side by side on ``Sheet1`` of
    ``{dataset_name}/formalized_scores_double.xlsx`` with the alpha value
    above each.

    Per-class and background counts are reported with the add-one smoothing
    removed; the total anchor count is never smoothed.

    Args:
        alphas: Alpha values to evaluate, one table each.

    Returns:
        None. The result is the Excel file.
    """
    alphas = list(alphas)
    columns = ['name', 'anchor score',  'type', 'type occurences', 'total occurences','+%', '-%', 'both', 'normal']

    exps = get_best(explanations)
    pos_exps = [exp for exp in exps if labels[exp.index] == 0]
    neg_exps = [exp for exp in exps if labels[exp.index] == 1]

    anchor_occurences = get_anchor_occurences(exps)
    pos_occurences = get_anchor_occurences(pos_exps)
    neg_occurences = get_anchor_occurences(neg_exps)

    normal_occurences = get_normal_occurences(anchor_examples, anchor_occurences)
    smooth_before(normal_occurences, [pos_occurences, neg_occurences])

    teta0 = calculate_teta0(normal_occurences)

    def build_rows(teta, sign, type_occurences):
        """Build the table rows for one class; 1 is subtracted to undo the smoothing."""
        rows = []
        for anchor, score in teta.items():
            pos_count = pos_occurences[anchor] - 1
            neg_count = neg_occurences[anchor] - 1
            pos_percent = round(pos_count / anchor_occurences[anchor], 2)
            # Rounded so the complement carries no float noise (e.g. 0.30000000000000004).
            neg_percent = round(1 - pos_percent, 2)
            both = pos_count > 0 and neg_count > 0
            rows.append([
                anchor, score, sign, type_occurences[anchor] - 1, anchor_occurences[anchor],
                pos_percent, neg_percent, both, normal_occurences[anchor] - 1,
            ])
        return rows

    dfs = []
    for alpha in alphas:
        teta_pos = calculate_teta1(pos_occurences, teta0, alpha)
        smooth_after(teta_pos, pos_occurences)

        teta_neg = calculate_teta1(neg_occurences, teta0, alpha)
        smooth_after(teta_neg, neg_occurences)

        df_list = build_rows(teta_pos, '+', pos_occurences) + build_rows(teta_neg, '-', neg_occurences)
        # Highest score first.
        df_list.sort(key=lambda row: -row[1])
        dfs.append(pd.DataFrame(data=df_list, columns=columns).set_index('name'))

    # The context manager writes and closes the workbook on exit; the explicit
    # ExcelWriter.save() call is not available in recent pandas versions.
    with pd.ExcelWriter(f'{dataset_name}/formalized_scores_double.xlsx', engine='xlsxwriter') as writer:
        # Create the sheet up front so the alpha headers can be written next
        # to the tables, and register it for pandas versions that look the
        # sheet up in writer.sheets.
        workbook = writer.book
        worksheet = workbook.add_worksheet('Sheet1')
        writer.sheets['Sheet1'] = worksheet

        cur_col = 0
        for df, alpha in zip(dfs, alphas):
            worksheet.write(0, cur_col, alpha)
            df.to_excel(writer, sheet_name='Sheet1', startrow=1, startcol=cur_col)
            # Table width plus one empty spacer column.
            cur_col += len(columns) + 1

In [21]:
# Write the per-class score workbook. The number printed below comes from
# get_best: the count of distinct sentence indices with a kept explanation.
calculate_scores_double()

1068


In [23]:
# NOTE(review): scratch check of the space character's code point. Nothing in
# the visible notebook uses it and the execution count skips from 21 to 23 -
# consider removing this cell.
ord(' ')

32