In [None]:
import numpy as np
import random
import json
import pandas as pd
import csv
from nltk.tokenize import sent_tokenize, word_tokenize
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pickle

In [None]:
# Clear PyTorch's CUDA cache up front to lower the chance of an out-of-memory
# error (if that is not enough, try reducing the batch size).
torch.cuda.empty_cache()
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [21]:
from bert_score import BERTScorer
# from evaluate import load

In [None]:
class FeatureExtractor:
    """Stateless helpers that score and featurize candidate sentences."""

    @staticmethod
    def gold_score(cand, target, bertscore):
        """Combine a similarity score with a length-ratio factor.

        Args:
            cand: candidate sentence (string).
            target: reference sentence (string); must contain at least one token.
            bertscore: object exposing ``score(cands=[...], refs=[...])`` that
                returns a 3-tuple; the last element is used as the similarity
                (presumably the BERTScore F1 -- confirm against the scorer used).

        Returns:
            ``exp(|len(cand tokens) / len(target tokens) - 1|) * similarity``.
            The result has whatever type the multiplication with the
            similarity yields (e.g. a tensor if the scorer returns tensors).

        Raises:
            ValueError: if ``target`` tokenizes to zero tokens (the length
                ratio would otherwise divide by zero).
        """
        # Validate before scoring so an unusable target does not cost a scorer call.
        len_target = len(word_tokenize(target))
        if len_target == 0:
            raise ValueError("target must contain at least one token")

        _, _, bs = bertscore.score(cands=[cand], refs=[target])

        phi_v = len(word_tokenize(cand)) * 1.0 / len_target
        # The target's ratio against itself is 1 by construction.
        phi_y = 1.0
        # NOTE(review): this factor grows as the candidate length departs from
        # the target length. If a length *penalty* was intended, the exponent
        # probably needs a negative sign -- confirm against the scoring
        # definition being followed before changing it.
        gs = np.exp(abs(phi_v - phi_y)) * bs
        return gs

    @staticmethod
    def is_valid_candidate(cand, src):
        """Return True when ``len(cand) / len(src)`` lies within [0.5, 2.0].

        ``len`` is applied to the arguments as given (character counts for
        strings). An empty ``src`` has no defined ratio and yields False.
        """
        if len(src) == 0:
            return False
        compression_ratio = len(cand) / len(src)
        return 0.5 <= compression_ratio <= 2.0

    @staticmethod
    def jaccard_similarity(cand, label) :
        """Jaccard overlap ``|A & B| / |A | B|`` of the unique items of two iterables.

        Returns 0.0 when both inputs are empty (empty union).
        """
        cand_set, label_set = set(cand), set(label)
        union = cand_set | label_set
        if not union:
            return 0.0
        return len(cand_set & label_set) / len(union)

    @staticmethod
    def get_feature_vector(cand, src):
        """Build the 5-element feature list for a candidate/source pair.

        Features, in order:
            0. number of sentences in the candidate
            1. number of word tokens in the source
            2. Jaccard similarity of the lower-cased token sets
            3. candidate/source word-count ratio (0.0 if the source is empty)
            4. candidate words per sentence (0.0 if the candidate has no sentence)
        """
        word_tok_src = word_tokenize(src.lower())
        word_tok_cand = word_tokenize(cand.lower())
        wc_src, wc_cand = len(word_tok_src), len(word_tok_cand)
        sc_cand = len(sent_tokenize(cand))

        # Ratios fall back to 0.0 instead of raising ZeroDivisionError on empty text.
        length_ratio = wc_cand * 1.0 / wc_src if wc_src else 0.0
        words_per_sentence = wc_cand * 1.0 / sc_cand if sc_cand else 0.0

        return [
            sc_cand * 1.0,
            wc_src * 1.0,
            FeatureExtractor.jaccard_similarity(word_tok_cand, word_tok_src),
            length_ratio,
            words_per_sentence,
        ]

In [None]:
class GaussinaBinner :
    """Soft-bins each feature with Gaussian kernels centred on equal-width bins.

    ``create_bins`` learns, per feature, ``bin_count`` bin centres spanning the
    observed [min, max] range and a kernel width ``gamma * bin_width``.
    ``generate_vectors`` then maps every sample to ``bin_count * n_features``
    kernel activations.
    """

    def __init__(self, bin_count, gamma) -> None:
        self.bin_count = bin_count
        self.gamma = gamma
        # Filled by create_bins(): means is (n_features, bin_count), sigmas is (n_features,).
        self.means = None
        self.sigmas = None

    @staticmethod
    def gaussian_distance(val, mean, sigma) :
        """Unnormalised Gaussian kernel ``exp(-(val - mean)^2 / (2 * sigma^2))``."""
        return np.exp(-np.power(val - mean, 2.) / (2 * sigma * sigma))

    def create_bins(self, x) :
        """Fit bin centres and kernel widths from a (n_samples, n_features) array.

        Raises:
            ValueError: if ``x`` is not a non-empty 2-D array.
        """
        x = np.asarray(x, dtype=float)
        if x.ndim != 2 or x.shape[0] == 0 :
            raise ValueError("x must be a non-empty 2-D array of shape (n_samples, n_features)")

        feature_min = x.min(axis=0)
        feature_max = x.max(axis=0)
        bin_width = (feature_max - feature_min) / self.bin_count

        # Left edge of every bin, shape (n_features, bin_count); the centre
        # sits half a bin width to the right of it.
        left_edges = np.arange(self.bin_count)[None, :] * bin_width[:, None] + feature_min[:, None]
        self.means = left_edges + bin_width[:, None] / 2

        sigmas = bin_width * self.gamma
        # A constant feature (or gamma == 0) gives sigma == 0, which would turn
        # the kernel into 0/0 = NaN; fall back to a unit width in that case.
        sigmas[sigmas == 0] = 1.0
        self.sigmas = sigmas

    def generate_vectors(self, x) :
        """Return the Gaussian bin activations for each sample of ``x``.

        Args:
            x: array-like of shape (n_samples, n_features).

        Returns:
            Array of shape (n_samples, bin_count * n_features). Row ``i`` only
            depends on sample ``i``; columns are laid out bin-major, i.e. all
            features for bin 0, then all features for bin 1, and so on.

        Raises:
            RuntimeError: if ``create_bins`` has not been called yet.
        """
        if self.means is None or self.sigmas is None :
            raise RuntimeError("create_bins() must be called before generate_vectors()")

        x = np.asarray(x, dtype=float)
        feature_count = self.means.shape[0]
        if x.size == 0 :
            return np.zeros((0, self.bin_count * feature_count))

        # Broadcast samples (n, 1, f) against bin centres (1, bins, f). The
        # sample axis must come first *before* flattening, otherwise the
        # reshape mixes values from different samples into one row.
        gaussian = self.gaussian_distance(x[:, None, :], self.means.T[None, :, :], self.sigmas)
        return gaussian.reshape(x.shape[0], -1)
            

In [None]:
class Ranking_Dataset(Dataset) :
    """Pairwise ranking dataset over the candidates of each source entry.

    Every item is one ordered pair (i, j) of candidates belonging to the same
    source entry: the two binned feature vectors plus a label telling which
    candidate has the higher gold score.
    """

    def __init__(self, data, binner, x_path=None, y_path=None) :
        """Build (or load) per-candidate features and gold scores.

        Args:
            data: mapping from source text to a dict with keys ``'cand'``
                (list of candidate strings) and ``'label'`` (reference string).
                Only read when no cache paths are given.
            binner: object with ``generate_vectors(features)`` returning one
                row per candidate.
            x_path: pickle file holding cached raw feature lists, or None.
            y_path: pickle file holding cached gold scores, or None.

        When both paths are None the features and gold scores are computed
        and written to ``x_stored.pkl`` / ``y_stored.pkl`` in the working
        directory.

        Raises:
            ValueError: if only one of ``x_path`` / ``y_path`` is given, or if
                features and gold scores disagree in size.
        """
        if (x_path is None) != (y_path is None) :
            raise ValueError("x_path and y_path must be provided together")

        if x_path is None :
            x = []
            y = []

            self.bert_score = BERTScorer(lang="en", rescale_with_baseline=True, device=device)

            for x_i, v_x_i in data.items() :
                x.append([])
                y.append([])

                for v_x_i_j in v_x_i['cand'] :
                    x[-1].append(FeatureExtractor.get_feature_vector(v_x_i_j, x_i))
                    y[-1].append(FeatureExtractor.gold_score(v_x_i_j, v_x_i['label'], self.bert_score))

            # Cache the expensive results; context managers make sure the
            # files are flushed and closed.
            with open('x_stored.pkl', "wb") as x_file :
                pickle.dump(x, x_file)
            with open('y_stored.pkl', "wb") as y_file :
                pickle.dump(y, y_file)

        else :
            # NOTE: unpickling can execute arbitrary code; only load cache
            # files that come from a trusted source.
            with open(x_path, "rb") as x_file :
                x = pickle.load(x_file)
            with open(y_path, "rb") as y_file :
                y = pickle.load(y_file)

        self.y = y
        self.V = [binner.generate_vectors(v_i) for v_i in x]

        if len(self.V) != len(self.y) :
            raise ValueError("feature and gold-score caches hold a different number of entries")

        candidate_counts = []
        for vectors, scores in zip(self.V, self.y) :
            if len(vectors) != len(scores) :
                raise ValueError("an entry has a different number of feature rows and gold scores")
            candidate_counts.append(len(scores))

        self.n_samples = len(self.V)
        # Kept for backward compatibility: the largest number of candidates
        # per entry (this used to be a hard-coded 10).
        self.n_features = max(candidate_counts, default=0)
        # _pair_ends[k] = total number of ordered pairs in entries 0..k, so a
        # flat index can be mapped back to (entry, i, j) even when entries
        # have different numbers of candidates.
        self._pair_ends = np.cumsum([n * n for n in candidate_counts], dtype=np.int64)

    def __len__(self) :
        """Total number of ordered candidate pairs across all entries."""
        return int(self._pair_ends[-1]) if len(self._pair_ends) else 0

    def __getitem__(self, idx) :
        """Return ``(features_i, features_j, label)`` for the flat pair index.

        ``label`` is a 1-element float tensor: 1.0 if candidate i has the
        higher gold score, -1.0 if candidate j does, and 0.0 for a tie (which
        includes i == j). This is the target convention of
        ``torch.nn.MarginRankingLoss``.

        Raises:
            IndexError: if ``idx`` is out of range.
        """
        total = len(self)
        idx = int(idx)
        if idx < 0 :
            idx += total
        if not 0 <= idx < total :
            raise IndexError(f"pair index {idx} out of range for {total} pairs")

        # First entry whose cumulative pair count exceeds idx.
        x_idx = int(np.searchsorted(self._pair_ends, idx, side="right"))
        first_pair = int(self._pair_ends[x_idx - 1]) if x_idx else 0
        f_i, f_j = divmod(idx - first_pair, len(self.y[x_idx]))

        # float() accepts plain numbers as well as 1-element tensors/arrays.
        diff = float(self.y[x_idx][f_i] - self.y[x_idx][f_j])
        label = float((diff > 0) - (diff < 0))

        return torch.Tensor(self.V[x_idx][f_i]), torch.Tensor(self.V[x_idx][f_j]), torch.Tensor([label])

In [None]:
class Ranker:
    """Trains a small feed-forward scorer with a pairwise margin ranking loss."""

    def __init__(self, binner=None, model=None) -> None:
        """Store the binner and model, building the default model if none is given.

        The default network takes 50 inputs (presumably 5 features x 10 bins,
        matching ``make_binner``'s defaults -- call ``make_model`` again if
        those change).
        """
        self.binner = binner
        self.model = model

        # Only build the default network when the caller did not supply one;
        # a supplied model must not be silently replaced.
        if self.model is None :
            self.make_model(50, 50, 1)

    def make_model(self, in_dim, hidden_dim, out_dim, dropout=0.2) :
        """Replace ``self.model`` with a 4-layer Tanh MLP with dropout."""
        self.model = nn.Sequential(
            torch.nn.Linear(in_dim, hidden_dim),
            torch.nn.Tanh(),
            torch.nn.Dropout(p=dropout),
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.Tanh(),
            torch.nn.Dropout(p=dropout),
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.Tanh(),
            torch.nn.Dropout(p=dropout),
            torch.nn.Linear(hidden_dim, out_dim)
        )

    def load_model(self, model_path):
        """Load a model saved by ``save_model`` into ``self.model`` and return it.

        NOTE: ``torch.load`` unpickles the file; only load trusted checkpoints.
        """
        self.model = torch.load(model_path)
        return self.model

    def save_model(self, model_path):
        """Serialise the whole model object to ``model_path``."""
        torch.save(self.model, model_path)

    def make_binner(self, data, bin_count=10, gamma=1.0) :
        """Fit a ``GaussinaBinner`` on the features of every candidate in ``data``.

        Args:
            data: mapping from source text to a dict with a ``'cand'`` list.
            bin_count: number of bins per feature.
            gamma: kernel width as a multiple of the bin width.
        """
        x = np.array([
            FeatureExtractor.get_feature_vector(v_x_i_j, x_i)
            for x_i, v_x_i in data.items()
            for v_x_i_j in v_x_i['cand']
        ])
        binner = GaussinaBinner(bin_count, gamma)
        binner.create_bins(x)

        self.binner = binner

    def load_data(self, data, batch_size=32, shuffle=False):
        """Build the pairwise dataset for ``data`` and wrap it in ``self.dl``."""
        ds = Ranking_Dataset(data, self.binner)
        self.dl = DataLoader(ds, batch_size=batch_size, shuffle=shuffle)

    def train_model(self, n_epochs, lr) :
        """Train ``self.model`` on ``self.dl`` with Adam and a margin ranking loss.

        Prints the loss every 1000 iterations and the last batch loss of each
        epoch. Stops early with a notice if the loader yields no batches.

        Raises:
            RuntimeError: if ``load_data`` has not been called.
        """
        if getattr(self, 'dl', None) is None :
            raise RuntimeError("load_data() must be called before train_model()")

        loss_fn = torch.nn.MarginRankingLoss(margin=1.0)
        optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

        # Feed batches on whatever device the model lives on.
        first_param = next(iter(self.model.parameters()), None)
        model_device = first_param.device if first_param is not None else torch.device('cpu')

        self.model.train()

        for epoch in range(n_epochs) :
            loss = None
            for itr_count, (x_i, x_j, y) in enumerate(self.dl, start=1) :
                x_i = x_i.to(model_device)
                x_j = x_j.to(model_device)
                y = y.to(model_device)

                optimizer.zero_grad()

                y_i = self.model(x_i)
                y_j = self.model(x_j)

                loss = loss_fn(y_i, y_j, y)
                loss.backward()
                optimizer.step()

                if itr_count % 1000 == 0:
                    print(f"Epoch: {epoch} ; Itr: {itr_count // 1000} ; Loss: {loss.item()}")

            if loss is None :
                # Nothing was iterated, so there is no loss to report.
                print(f"Epoch {epoch} : no batches to train on")
                break

            print(f"Epoch {epoch} : Loss = {loss.item()}")

In [None]:
# Load the three pickled dictionaries and merge them into `data`.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# Context managers close each file handle as soon as it has been read.
with open("dict1", "rb") as fh:
    data = pickle.load(fh)
with open("dict2", "rb") as fh:
    dict2 = pickle.load(fh)
with open("dict3", "rb") as fh:
    dict3 = pickle.load(fh)

# Later dictionaries overwrite entries with the same key.
data.update(dict2)
data.update(dict3)

len(data)

In [None]:
# Create the ranker; with no arguments it starts without a binner and builds its default network.
ranker = Ranker()

In [None]:
# Fit the Gaussian bins on the feature vectors of every candidate in `data`.
ranker.make_binner(data)

In [None]:
# Build the pairwise dataset and its DataLoader. No cache paths are passed, so this
# computes features and gold scores for every candidate and writes them to
# x_stored.pkl / y_stored.pkl.
ranker.load_data(data)

In [None]:
# Train for 5 epochs with a learning rate of 0.001.
ranker.train_model(5, 0.001)