In [None]:
import _pickle as pickle
import gc
import json
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from transformers import RobertaTokenizerFast, TFRobertaModel

# Widen pandas' display limits so long text columns and tall frames are shown in full.
pd.options.display.max_colwidth = 300
pd.options.display.max_rows = 300


def pmap(func, x, n_jobs = 32):
    """Apply ``func`` to every element of ``x`` in parallel worker processes.

    Args:
        func: Callable taking a single argument.
        x: Iterable of arguments; it is wrapped in a tqdm progress bar.
        n_jobs: Number of joblib workers to use.

    Returns:
        Whatever ``joblib.Parallel`` returns for the submitted tasks
        (one result per element of ``x``).
    """
    pool = joblib.Parallel(n_jobs = n_jobs, prefer = 'processes', verbose = 0)
    # The progress bar advances as items are pulled from ``x`` for submission.
    tasks = (joblib.delayed(func)(item) for item in tqdm(x))
    return pool(tasks)

# Ask TensorFlow which physical devices it can see; as the last expression of the
# cell, the returned list is displayed.
tf.config.list_physical_devices()


In [None]:
def get_metadata(path = 'arxiv-metadata-oai-snapshot.json'):
    """Lazily yield the raw lines of a JSON-lines metadata file.

    The file is read one line at a time, so the whole snapshot never has to
    fit in memory. Lines are yielded exactly as read (including the trailing
    newline); parsing is left to the caller.

    Args:
        path: Location of the JSON-lines file. Defaults to the arXiv
            metadata snapshot in the working directory.

    Yields:
        str: One line of the file per iteration.

    Raises:
        OSError: If the file cannot be opened (raised on first iteration,
            because this is a generator).
    """
    # Explicit UTF-8 so decoding does not depend on the platform's default locale.
    with open(path, 'r', encoding = 'utf-8') as f:
        yield from f

In [None]:
# Create the line generator. Nothing is read from disk until it is iterated,
# and it can only be consumed once (re-run this cell to start over).
metadata = get_metadata()

In [None]:
# Fields copied from every metadata record.
keys = ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'categories', 'license', 'abstract', 'versions', 'update_date', 'authors_parsed']

# Column-oriented accumulator: one list per field, all aligned by record order.
titles_tags_dict = {k:[] for k in keys}
# `total` is a hard-coded expected record count; it only sizes the progress bar.
for paper in tqdm(metadata, total = 1700000):
    # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
    if not paper.strip():
        continue
    parsed = json.loads(paper)
    for k in keys:
        # A record missing one of the expected fields raises KeyError here.
        titles_tags_dict[k].append(parsed[k])

In [None]:
# def load_news(start_year, end_year):
#     df = []
#     for i in tqdm(range(start_year, end_year)):
#         for j in tqdm(range(1,13), leave = False):
#             try:
#                 df.append(pd.read_parquet(f'./news/{i}_{j}_news.parquet'))
#             except:
#                 pass
#     return pd.concat(df).dropna(subset = ["article", "title"])

# df_news = load_news(2016,2021)

# print('Loaded')
# df_news['len'] = pmap(lambda x : len(str(x).split()), df_news['article'].values, n_jobs = 32)

# print('Len computed')

# print(df_news.shape)
# df_news.to_parquet('news_with_len.parquet')

In [None]:

from sklearn.cluster import KMeans
import tensorflow as tf

# Sequence length passed to the model and to the tokenizer (as max_length) in the cells below.
max_len = 64
# Pretrained RoBERTa tokenizer; DataGen.__getitem__ reads this module-level name.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

def random_crop(x, max_len = 256, min_len = 32):
    """Return a random contiguous run of whitespace-separated words from ``x``.

    ``x`` is converted with ``str`` and split on whitespace. Both bounds are
    first clamped to the text: the upper bound to the number of words, the
    lower bound to one less than the upper bound. A start offset is then
    drawn from ``[0, upper - lower)`` and a word count from ``[lower, upper)``.

    Args:
        x: Text to crop (any object; ``str(x)`` is used).
        max_len: Exclusive upper bound on the number of words kept.
        min_len: Lower bound on the number of words kept.

    Returns:
        str: The selected words joined by single spaces. Texts with zero or
        one word yield an empty string; texts with at most ``min_len`` words
        always yield every word but the last.
    """
    words = str(x).split()
    upper = min(max_len, len(words))
    lower = min(min_len, upper - 1)

    # NOTE(review): the offset range depends on the bounds, not on len(words),
    # so crops of long texts always start within the first `upper - lower` words.
    # Draw order (offset, then length) is kept so seeded runs are reproducible.
    start = np.random.randint(0, upper - lower)
    length = np.random.randint(lower, upper)
    return " ".join(words[start:start + length])

class BertEmbedding(tf.keras.Model):
    """Text encoder: pretrained ``roberta-base`` followed by two dense layers.

    The input is a dict with ``'input_ids'`` and ``'attention_mask'`` arrays.
    The vector at sequence position 0 of the encoder's first output is passed
    through a 512-unit ReLU layer and a 512-unit linear layer.

    Args:
        max_len: Sequence length used for the build-time dummy batch.
        encoder_weights: Optional path to weights previously written with
            ``save_weights`` on a ``BertEmbedding``; loaded once all variables
            exist. ``None`` keeps the pretrained/randomly initialised weights.
    """
    def __init__(self, max_len = 128, encoder_weights = None):
        super(BertEmbedding, self).__init__()
        self.max_len = max_len
        self.encoder = TFRobertaModel.from_pretrained('roberta-base')

        self.dense1 = tf.keras.layers.Dense(512, activation = 'relu')
        self.dense2 = tf.keras.layers.Dense(512, activation = 'linear')

        # One throw-away forward pass on random data so every variable is
        # created before any weights are loaded or saved.
        dummy_input = {
            'input_ids' : np.random.randint(0,100,size = (8, self.max_len)).astype('int32'),
            'attention_mask' : np.random.randint(0,2,size = (8, self.max_len)).astype('int32'),
        }
        self(dummy_input)

        # Previously this argument was accepted but silently ignored.
        if encoder_weights:
            self.load_weights(encoder_weights)

    def call(self, x, training = True):
        """Embed a tokenized batch.

        Args:
            x: Dict with ``'input_ids'`` and ``'attention_mask'``.
            training: Forwarded to the RoBERTa encoder. NOTE: the default is
                ``True``, so a direct call without this argument runs the
                encoder in training mode; pass ``training=False`` for inference.

        Returns:
            The output of the second dense layer (512 units per example).
        """
        encoder_output = self.encoder(input_ids = x['input_ids'],
                                      training = training,
                                      attention_mask = x['attention_mask'],
                                     )
        # First output, sequence position 0 (presumably the <s> token's hidden state).
        encoded = encoder_output[0][:,0,:]

        embedding = self.dense1(encoded)
        embedding = self.dense2(embedding)
        return embedding
            
class SiameseBert(tf.keras.Model):
    """Siamese model scoring the similarity of two tokenized texts.

    Both inputs go through the same ``BertEmbedding`` instance; the output is
    the cosine similarity of the two embeddings, rescaled to lie strictly
    between 0 and 1.

    Args:
        max_len: Sequence length used for the encoder and the dummy batch.
        encoder_weights: Optional path to saved ``BertEmbedding`` weights to
            load into the shared encoder.
    """
    def __init__(self, max_len = 128, encoder_weights = None):
        super(SiameseBert, self).__init__()
        self.max_len = max_len
        self.encoder = BertEmbedding(max_len)
        self.cosine = tf.keras.layers.Dot(axes = -1, normalize=True)

        if encoder_weights:
            self.encoder.load_weights(encoder_weights)

        # Two random tokenized batches, used once to build this model's layers.
        dummy_input = [
            {
                'input_ids' : np.random.randint(0,100,size = (8, self.max_len)).astype('int32'),
                'attention_mask' : np.random.randint(0,2,size = (8, self.max_len)).astype('int32'),
            }
            for _ in range(2)
        ]
        self(dummy_input)

    def call(self, x, training = True):
        """Score a pair of tokenized batches.

        Args:
            x: Sequence of two dicts, each with ``'input_ids'`` and
                ``'attention_mask'``.
            training: Training-mode flag, forwarded explicitly to the shared
                encoder so both branches always run in the requested mode.

        Returns:
            Rescaled cosine similarity, one value per pair.
        """
        encoded_a = self.encoder(x[0], training = training)
        encoded_b = self.encoder(x[1], training = training)

        cosine_sim = self.cosine([encoded_a, encoded_b])
        # Map [-1, 1] to roughly [0.0005, 0.9995] so the score never hits exactly 0 or 1.
        cosine_sim = (cosine_sim + 1.001)/2.002

        return cosine_sim

class DataGen(tf.keras.utils.Sequence):
    """Random batch generator of text pairs for the siamese model.

    Every batch is drawn at random from the corpus, so the requested batch
    index is ignored. Before the final shuffle, the first half of a batch is
    labelled 0 (a crop of one article paired with a crop of an independently
    drawn article) and the second half is labelled 1 (two random crops of the
    same article).

    Args:
        articles: Array of texts; must support indexing with an integer array.
        batch_size: Number of pairs per batch.
        max_len: Passed to ``random_crop`` as ``max_len`` and to the tokenizer
            as ``max_length``.

    Relies on the module-level ``tokenizer`` and ``random_crop``.
    """
    def __init__(self, articles, batch_size = 32, max_len = 256):
        super().__init__()
        self.corpus = articles
        self.batch_size = batch_size
        self.max_len = max_len
        self.n_sample = len(self.corpus)

    def __len__(self):
        # Nominal number of batches; batches are random, so this is not tied to corpus size.
        return 1000000

    def __getitem__(self, index):
        """Build one random batch; ``index`` is ignored.

        Returns:
            tuple: ``((inputs_a, inputs_b), y)`` where each ``inputs_*`` is a
            dict with int32 ``'input_ids'`` and ``'attention_mask'`` arrays and
            ``y`` holds the 0/1 labels.
        """
        half = self.batch_size // 2

        ## Sampling data
        # randint's upper bound is exclusive, so use n_sample (not n_sample - 1)
        # to make the last article reachable and to support a one-article corpus.
        idx = np.random.randint(0, self.n_sample, size = self.batch_size)
        sample = self.corpus[idx]

        ## Creating target array: first half 0 (negatives), second half 1 (positives)
        y = np.zeros(self.batch_size)
        y[half:] += 1

        # Independently drawn articles for the negatives. A draw can coincide
        # with the paired article by chance.
        idx_dum = np.random.randint(0, self.n_sample, size = half)
        sample_dum = self.corpus[idx_dum]

        ## Creating two random augmentations of each sampled article.
        # dtype=object keeps full Python strings: a fixed-width NumPy string
        # array would silently truncate longer strings assigned into it below.
        p1 = np.array([random_crop(elt, max_len = self.max_len) for elt in sample], dtype = object)
        p2 = np.array([random_crop(elt, max_len = self.max_len) for elt in sample], dtype = object)

        ## Replacing the first half of the second augmentation with crops of other articles
        p2[:half] = [random_crop(elt, max_len = self.max_len) for elt in sample_dum]

        ## Shuffling the whole batch
        shuffle_id = np.arange(self.batch_size)
        np.random.shuffle(shuffle_id)
        p1 = p1[shuffle_id]
        p2 = p2[shuffle_id]
        y = y[shuffle_id]

        ## Tokenization
        p1 = tokenizer.batch_encode_plus(list(p1),add_special_tokens = True, max_length = self.max_len, truncation = True, return_tensors = 'np', padding  = 'max_length')
        p2 = tokenizer.batch_encode_plus(list(p2),add_special_tokens = True, max_length = self.max_len, truncation = True, return_tensors = 'np', padding  = 'max_length')
        p1 = {elt:p1[elt].astype('int32') for elt in p1}
        p2 = {elt:p2[elt].astype('int32') for elt in p2}

        ## Model inputs creation
        inputs_a = {
            'input_ids': p1['input_ids'],
            'attention_mask': p1['attention_mask']}
        inputs_b = {
            'input_ids':  p2['input_ids'],
            'attention_mask': p2['attention_mask']
        }
        inputs = inputs_a, inputs_b
        return inputs, y

In [None]:
%%time
# NOTE(security): pickle.load can execute arbitrary code while loading; only use a
# data.pickle file that you produced yourself or otherwise trust.
with open('data.pickle', 'rb') as f:
    df_news = pickle.load(f)
    
# Text used for training: title and abstract joined by the ' </s> ' marker.
df_news['article'] = df_news['title'] + ' </s> ' + df_news['abstract'] 
# Whitespace word count scaled by 1.2 — presumably a rough words-to-tokens estimate (TODO confirm).
df_news['len'] = df_news['article'].apply(lambda x : len(str(x).split())*1.2)

In [None]:
# Keep only articles whose estimated length is at least twice the model sequence length.
df_news = df_news.loc[df_news['len'] >= 2 * max_len]

In [None]:
# Renumber rows 0..n-1 so positional sampling below is straightforward.
df_news = df_news.reset_index(drop = True)
# Draw at most 100000 distinct rows. Sampling WITHOUT replacement matters: with
# duplicates, the same article could end up in both the train and the test split.
idx = np.random.choice(df_news.shape[0], size = min(df_news.shape[0], 100000), replace = False)
X = df_news['article'].iloc[idx].values
print(X.shape)
# Only the texts are split; there are no labels at this stage.
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# del df_news, df_news_bis
# gc.collect()

In [None]:
# Display the (rows, columns) of the filtered frame.
df_news.shape

In [None]:
# Build the siamese model; pass encoder_weights to resume from a saved encoder.
model = SiameseBert(max_len = max_len)#, encoder_weights = './models/siamese_encoder_256.tf')

model.compile(
    loss = 'binary_crossentropy',
    # epsilon must be passed by keyword: Adam's second positional parameter is
    # beta_1, so Adam(3e-5, 1e-8) set the first-moment decay to 1e-8 instead.
    optimizer = tf.keras.optimizers.Adam(3e-5, epsilon = 1e-8),
    # The 'AUC' metric is reported as 'auc' / 'val_auc', which the callbacks monitor.
    metrics = ['accuracy', 'AUC']
)

In [None]:
# Training batch size (also used below to derive steps_per_epoch and the validation batch size).
batch_size = 64
train_gen = DataGen(X_train, batch_size = batch_size, max_len = max_len)
# One large generator batch serves as a fixed validation set.
test_gen = DataGen(X_test, batch_size = 4096, max_len = max_len)
# Drawn once here, so every epoch is validated on the same 4096 pairs.
x_test,  y_test = test_gen[0]

In [None]:
%%timeit
# Time how long it takes to build one training batch (sampling, cropping, tokenizing).
x,y = train_gen[0]

In [None]:
# Training schedule: an "epoch" is a fixed number of random batches (about 50000 pairs).
n_epochs = 10
steps_per_epoch = 50000 // batch_size + 1
validation_batch_size = 4 * batch_size

# Both callbacks watch validation AUC (higher is better): the learning rate is
# cut tenfold after 3 stale epochs, and training stops after 7, restoring the best weights.
early = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc', mode='max', min_delta=0.0001, patience=7,
    restore_best_weights=True, verbose=1,
)
reduce = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_auc', mode='max', factor=0.1, patience=3,
    min_delta=0.0001, cooldown=0, min_lr=0, verbose=1,
)

model.fit(
    train_gen,
    validation_data=(x_test,  y_test),
    validation_batch_size=validation_batch_size,
    epochs=n_epochs,
    steps_per_epoch=steps_per_epoch,
    callbacks=[early, reduce],
)

In [None]:
# Persist only the shared BertEmbedding encoder (not the siamese wrapper); this is
# the kind of file SiameseBert's encoder_weights argument loads back.
model.encoder.save_weights('./models/sci-siamese_encoder_64.tf')

In [None]:
# model.encoder.load_weights('./models/siamese_encoder_128.tf')