In [None]:
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from transformers import *
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler

from tqdm import tqdm, trange

from transformers import AutoTokenizer, AutoModel

import time
import os

import unicodedata
import re
def unicodeToAscii(s):
    """Return ``s`` NFKC-normalized with all nonspacing marks removed.

    After normalization, every character whose Unicode category is ``Mn``
    is dropped and everything else is kept. Despite the name, the result is
    not restricted to ASCII.
    """
    normalized = unicodedata.normalize('NFKC', s)
    kept_chars = [ch for ch in normalized if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    """Lowercase, trim and clean ``s`` for tokenization.

    The text is lowercased, stripped, passed through ``unicodeToAscii`` and
    then rewritten by four regex substitutions applied in order.
    """
    text = unicodeToAscii(s.lower().strip())

    # (pattern, replacement) pairs; order matters because each step works on
    # the output of the previous one.
    substitutions = (
        # Surround punctuation with spaces so it becomes its own token.
        (r"([?.!,¿])", r" \1 "),
        # Collapse runs of spaces and double quotes into a single space.
        (r'[" "]+', " "),
        # Replace anything that is not Latin, in the Arabic block, or one of
        # the kept punctuation marks with a single space.
        (r"[^a-zA-Z؀-ۿ?.!,¿]+", " "),
        # Put one more space in front of sentence-final punctuation.
        (r"([.!?])", r" \1"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# encode the training data for each domain
def convert_data_to_features(data, tokenizer,
                                      max_length=200,
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
    """Tokenize the ``target_lang`` column into fixed-length features.

    Each sentence is cleaned with ``normalizeString``, encoded with
    ``tokenizer.encode_plus`` (special tokens added, truncated to
    ``max_length``) and right-padded to exactly ``max_length`` ids.

    Rows are visited positionally, so the index of ``data`` may be of any
    type and does not need to be unique or contiguous.

    Args:
        data: mapping/DataFrame whose ``"target_lang"`` entry is an iterable
            of sentence strings.
        tokenizer: object exposing ``encode_plus`` that returns a mapping
            with an ``"input_ids"`` list.
        max_length: length every feature is truncated/padded to.
        pad_token: id appended to ``input_ids`` as padding.
        pad_token_segment_id: unused; kept for interface compatibility.
        mask_padding_with_zero: when True, real tokens get mask 1 and padding
            gets 0; when False the two values are swapped.

    Returns:
        list of ``InputFeatures`` carrying ``input_ids`` and
        ``attention_mask``, one per row, in row order.

    Raises:
        AssertionError: if an encoded row does not end up ``max_length`` long.
    """
    real_mask_value = 1 if mask_padding_with_zero else 0
    pad_mask_value = 0 if mask_padding_with_zero else 1
    features = []

    for position, raw_sentence in enumerate(data["target_lang"]):
        sent = normalizeString(raw_sentence)
        if position % 10000 == 0:
            print("Writing example %d" % (position))
        inputs = tokenizer.encode_plus(
            text=sent,
            add_special_tokens=True,
            truncation=True,
            max_length=max_length)
        input_ids = inputs["input_ids"]
        attention_mask = [real_mask_value] * len(input_ids)

        # Right-pad ids and mask up to max_length.
        padding_length = max_length - len(input_ids)
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + ([pad_mask_value] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              ))

    return features

def features_to_tensor_dataset(features):
    """Pack feature objects into a ``TensorDataset``.

    Reads ``input_ids`` and ``attention_mask`` from every feature and returns
    a dataset whose items are ``(input_ids, attention_mask)`` pairs of
    ``torch.long`` tensors.
    """
    id_rows = []
    mask_rows = []
    for feature in features:
        id_rows.append(feature.input_ids)
        mask_rows.append(feature.attention_mask)

    return TensorDataset(
        torch.tensor(id_rows, dtype=torch.long),
        torch.tensor(mask_rows, dtype=torch.long),
    )


def run_batched_inference(tensor_dataset, model,batch_size=100):
    """Embed every example by mean-pooling the model's first output.

    The dataset is read sequentially from index 0. For each batch the model
    is called with ``input_ids`` and ``attention_mask``; its first output is
    averaged over the positions whose attention mask is 1.

    Args:
        tensor_dataset: dataset yielding ``(input_ids, attention_mask)``.
        model: module accepting ``input_ids``/``attention_mask`` keywords
            whose first output has shape [batch_size, seq_len, state_size].
        batch_size: number of examples per forward pass.

    Returns:
        numpy array of shape [num_examples, state_size].

    Raises:
        ValueError: if ``tensor_dataset`` is empty (nothing to concatenate).
    """
    # setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Compare the device *type*: a torch.device never equals a plain string.
    if device.type == 'cpu':
        print('using cpu!!!')
    else:
        print('using gpu.')
    model.to(device)
    model.eval()

    # Start inference loop and sample data sequentially starting from index 0
    inference_sampler = SequentialSampler(tensor_dataset)
    inference_dataloader = DataLoader(tensor_dataset, sampler=inference_sampler, batch_size=batch_size)
    avg_pooled_all = []

    with torch.no_grad():
        for batch in tqdm(inference_dataloader, desc="Running inference..."):
            input_ids, attention_mask = (t.to(device) for t in batch[:2])
            # of shape [batch_size, seq_len, state_size]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)[0]

            # Masked mean over the sequence axis, done for the whole batch at
            # once instead of one device-to-host copy per sentence.
            mask = attention_mask.unsqueeze(-1).to(outputs.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)  # guard against /0
            avg_pooled_batch = (outputs * mask).sum(dim=1) / token_counts
            avg_pooled_all.append(avg_pooled_batch.cpu().numpy())

    avg_pooled_all = np.concatenate(avg_pooled_all)

    print(avg_pooled_all.shape)
    return avg_pooled_all



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def encode_data_and_save(data,tokenizer, model, batch_size =512, max_length=200):
    """Tokenize ``data`` and return its mean-pooled embeddings.

    Despite the name, nothing is written to disk here; the caller is
    responsible for saving the returned array.

    Args:
        data: passed to ``convert_data_to_features``.
        tokenizer: tokenizer passed to ``convert_data_to_features``.
        model: model passed to ``run_batched_inference``.
        batch_size: inference batch size.
        max_length: token length every sentence is truncated/padded to.

    Returns:
        whatever ``run_batched_inference`` returns for the encoded data.
    """
    input_features = convert_data_to_features(data, tokenizer,
                                              max_length=max_length)
    tensor_dataset = features_to_tensor_dataset(input_features)
    return run_batched_inference(tensor_dataset, model, batch_size=batch_size)

In [None]:
def run_batched_inference_sent(tensor_dataset, model,batch_size=100):
    """Collect the model's first output for every example, without pooling.

    The dataset is read sequentially from index 0 and each batch's first
    model output is copied to the host unchanged.

    Args:
        tensor_dataset: dataset yielding ``(input_ids, attention_mask)``.
        model: module accepting ``input_ids``/``attention_mask`` keywords.
            NOTE(review): assumes its first output is already one vector per
            sentence, i.e. [batch_size, state_size] - confirm for the model
            actually passed in.
        batch_size: number of examples per forward pass.

    Returns:
        numpy array with the per-batch first outputs concatenated along
        axis 0.

    Raises:
        ValueError: if ``tensor_dataset`` is empty (nothing to concatenate).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Compare the device *type*: a torch.device never equals a plain string.
    if device.type == 'cpu':
        print('using cpu!!!')
    else:
        print('using gpu.')
    model.to(device)
    model.eval()

    # Start inference loop and sample data sequentially starting from index 0
    inference_sampler = SequentialSampler(tensor_dataset)
    inference_dataloader = DataLoader(tensor_dataset, sampler=inference_sampler, batch_size=batch_size)
    avg_pooled_all = []

    with torch.no_grad():
        for batch in tqdm(inference_dataloader, desc="Running inference..."):
            input_ids, attention_mask = (t.to(device) for t in batch[:2])
            # of shape [batch_size, state_size]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)[0]
            # One transfer per batch; equivalent to stacking the rows one by one.
            avg_pooled_all.append(outputs.cpu().numpy())

    avg_pooled_all = np.concatenate(avg_pooled_all)

    print(avg_pooled_all.shape)
    return avg_pooled_all

def encode_data_and_save_sent(data,tokenizer, model, batch_size =512, max_length=200):
    """Tokenize ``data`` and return the model's sentence-level outputs.

    Counterpart of ``encode_data_and_save`` that runs
    ``run_batched_inference_sent`` (no mean pooling) instead of
    ``run_batched_inference``. Despite the name, nothing is written to disk;
    the caller is responsible for saving the returned array.

    Args:
        data: passed to ``convert_data_to_features``.
        tokenizer: tokenizer passed to ``convert_data_to_features``.
        model: model passed to ``run_batched_inference_sent``.
        batch_size: inference batch size.
        max_length: token length every sentence is truncated/padded to.

    Returns:
        whatever ``run_batched_inference_sent`` returns for the encoded data.
    """
    input_features = convert_data_to_features(data, tokenizer,
                                              max_length=max_length)
    tensor_dataset = features_to_tensor_dataset(input_features)
    # Use the *_sent inference routine; the pooled variant made this function
    # an exact duplicate of encode_data_and_save.
    return run_batched_inference_sent(tensor_dataset, model, batch_size=batch_size)

In [None]:
from transformers import AutoModel, AutoTokenizer
import torch
from peft import get_peft_model
from peft import LoraConfig, TaskType

# Load the base AraBERT encoder and its tokenizer from the Hugging Face Hub.
model_arabert = AutoModel.from_pretrained('aubmindlab/bert-large-arabertv02')
tokenizer_arabert = AutoTokenizer.from_pretrained('aubmindlab/bert-large-arabertv02')

# Wrap the encoder with LoRA adapters so the fine-tuned weights can be loaded.
# NOTE(review): these hyper-parameters presumably mirror the training run that
# produced the checkpoint below - confirm before changing them.
lora_config = LoraConfig(task_type=TaskType.FEATURE_EXTRACTION, r=64, lora_alpha=1, lora_dropout=0.01)
model_arabert_lora = get_peft_model(model_arabert, lora_config)

# map_location="cpu" lets a checkpoint saved from a GPU run be restored on a
# CPU-only machine; the inference helpers move the model to the active device.
checkpoint = torch.load("./checkpoint_sent_arabert_large.pt", map_location="cpu")
model_arabert_lora.load_state_dict(checkpoint["model_dict"])
# optimizer.load_state_dict(checkpoint["optimizer_dict"])
# loss = checkpoint["loss"]

In [None]:
import pandas as pd
# Read the full candidate corpus (tab-separated file).
data_all = pd.read_csv("./data_selected_translated_filtered_all", sep="\t")

# Embed the corpus with the LoRA-adapted AraBERT model.
avg_pooled_domain = encode_data_and_save(
    data_all,
    tokenizer_arabert,
    model_arabert_lora,
)

Writing example 0
Writing example 10000
Writing example 20000
Writing example 30000
Writing example 40000
Writing example 50000
using gpu.


Running inference...: 100%|██████████| 112/112 [36:40<00:00, 19.65s/it]


(57143, 1024)


In [None]:
# Persist the corpus embeddings so later cells can reload them from disk
# instead of re-running inference.
save_format = './corpora_sent_arabert_large_all.npy'
np.save(file=save_format, arr=avg_pooled_domain)

In [None]:
import pandas as pd
domain = ["Iraqi", "Levantine", "Gulf", "Nile_Basin", "North_Africa"]

# Read every domain's preprocessed training file (tab-separated) up front.
data_domain = [
    pd.read_csv("./data_{}_preprocessed_train".format(domain_name), sep="\t")
    for domain_name in domain
]

# Embed each domain in turn and write one .npy file per domain.
# NOTE: `avg_pooled_domain` and `save_format` are reused from the corpus
# cells, so their earlier values are overwritten here.
for domain_name, domain_frame in zip(domain, data_domain):
    avg_pooled_domain = encode_data_and_save(domain_frame, tokenizer_arabert, model_arabert_lora)
    save_format = './{}_sent_arabert_large_in_domain.npy'.format(domain_name)
    np.save(save_format, avg_pooled_domain)

Writing example 0
using gpu.


Running inference...: 100%|██████████| 6/6 [01:52<00:00, 18.77s/it]


(2922, 1024)
Writing example 0
Writing example 10000
using gpu.


Running inference...: 100%|██████████| 34/34 [10:54<00:00, 19.25s/it]


(17002, 1024)
Writing example 0
using gpu.


Running inference...: 100%|██████████| 19/19 [06:10<00:00, 19.51s/it]


(9659, 1024)
Writing example 0
using gpu.


Running inference...: 100%|██████████| 20/20 [06:22<00:00, 19.11s/it]


(9946, 1024)
Writing example 0
Writing example 10000
Writing example 20000
using gpu.


Running inference...:  45%|████▍     | 26/58 [08:32<10:30, 19.70s/it]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Reload the corpus embeddings written by the earlier save cell.
save_format_corpra = './corpora_sent_arabert_large_all.npy'
corpra_pooled = np.load(save_format_corpra)

# For every domain, score each corpus sentence by the cosine similarity
# between its embedding and the centroid of that domain's embeddings.
for i in range(len(domain)):
    save_format_domain = './{}_sent_arabert_large_in_domain.npy'.format(domain[i])
    in_domain_pooled = np.load(save_format_domain)
    dev_centroid = np.mean(in_domain_pooled, axis=0)

    # One vectorized call yields an (n_corpus, 1) matrix; flatten it and
    # assign the whole column at once. This avoids chained assignment
    # (data_all[col][j] = ...), which is not guaranteed to write through to
    # the frame, and avoids seeding the column with integer zeros.
    # Assignment is positional, so corpra_pooled must have exactly one row
    # per row of data_all (pandas raises ValueError otherwise).
    scores = cosine_similarity(corpra_pooled, dev_centroid.reshape(1, -1)).ravel()
    data_all["score_{}".format(domain[i])] = scores
