In [None]:
!pip install sentence_transformers
!apt install libomp-dev
!pip install faiss

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import faiss

from torch.utils.data import DataLoader
import math
from sentence_transformers import LoggingHandler, util, SentenceTransformer, losses, evaluation
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime, time
import os
import gzip
import csv

# Обучение Классификатора CrossEncoder

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Deep Learning on practice/NLP/train.csv', index_col = 'pair_id')

x_train, x_test, y_train, y_test = train_test_split(
    df[['name_1','name_2']].to_numpy(), 
    df['is_duplicate'].to_numpy(), 
    test_size=0.1, 
    random_state=69, 
    shuffle=True, 
    stratify=df['is_duplicate']
)

In [None]:
train_samples = []
# dev_samples = []
for i, row in enumerate(x_train[:100000]):
    train_samples.append(InputExample(texts=[row[0], row[1]], label=y_train[i]))

train_batch_size = 16
num_epochs = 1
model_save_path = '/content/drive/MyDrive/Deep Learning on practice/NLP/training_-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

model_name = 'cross-encoder/quora-distilroberta-base'
model = CrossEncoder(model_name, num_labels=1)


train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

#During training, we use CESoftmaxAccuracyEvaluator to measure the accuracy on the dev set.
# evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(dev_samples, name='ABOBA-dev')


warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.05) #5% of train data for warm-up

# Train the model
model.fit(train_dataloader=train_dataloader,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path,
          use_amp=True)

Downloading:   0%|          | 0.00/148 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6219 [00:00<?, ?it/s]



In [None]:
orig_df = pd.read_csv('/content/drive/MyDrive/Deep Learning on practice/NLP/train.csv', index_col = 'pair_id')

In [None]:
# orig_ones = orig_df[orig_df['is_duplicate'] == 1]
# orig_zeros = orig_df[orig_df['is_duplicate'] == 0].sample(3658)

synthetics_ones = pd.DataFrame([row for i, row in enumerate(x_test) if y_test[i] == 1], columns=['name_1', 'name_2'])
synthetics_zeros = pd.DataFrame([row for i, row in enumerate(x_test) if y_test[i] == 0], columns=['name_1', 'name_2'])

ones = synthetics_ones
zeros = synthetics_zeros

In [None]:
def compare(df, batch_size=32):
    return model.predict(df[['name_1','name_2']].to_numpy(), batch_size=batch_size)

In [None]:
scores = compare(ones)
scores_more_05 = (scores > 0.5) * 1

all_ones = len(scores_more_05)
TP = np.sum(scores_more_05)
FN = all_ones - TP

scores = compare(zeros)
scores_less_05 = (scores <= 0.5) * 1

all_zeros = len(scores_less_05)
TN = np.sum(scores_less_05)
FP = all_zeros - TN

In [None]:
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
Precision, Recall

(0.9995581442056638, 0.9997187738540034)

In [None]:
TP/all_ones

0.9997187738540034

In [None]:
TN/all_zeros

0.9995580731991482

In [None]:
F1_score = 2 * (Precision * Recall) / (Precision + Recall)
F1_score

0.9996384525770298

# Инференс модели Классификатора CrossEncoder

In [None]:
df = pd.read_csv("/content/drive/MyDrive/Deep Learning on practice/NLP/train.csv", index_col="pair_id")

df_companies_in_duplicates1 = df[df['is_duplicate'] == 1][['name_1', 'name_2']].to_numpy()
companies_in_duplicates1 = set(df_companies_in_duplicates1.flatten())

df_companies_in_duplicates0 = df[df['is_duplicate'] == 0][['name_1', 'name_2']].to_numpy()
companies_in_duplicates0 = set(df_companies_in_duplicates0.flatten())

companies_only_in_duplicates0 = companies_in_duplicates0.difference(companies_in_duplicates1)
companies_only_in_duplicates1 = companies_in_duplicates1.difference(companies_in_duplicates0)
companies_in_both_sets = companies_in_duplicates0.intersection(companies_in_duplicates1)

print(len(companies_only_in_duplicates0), len(companies_in_both_sets), len(companies_only_in_duplicates1))

all_unique_companies = companies_only_in_duplicates0.union(companies_only_in_duplicates1).union(companies_in_both_sets)
print("Vocalubary size: ", len(all_unique_companies))

16628 1020 374
Vocalubary size:  18022


In [52]:
# model = CrossEncoder("/content/drive/MyDrive/Deep Learning on practice/NLP/model_Nikita")
model = CrossEncoder("/content/drive/MyDrive/Deep Learning on practice/NLP/model_Zhenya")

corpus = list(all_unique_companies)[1:101]
query = "Джей Икс Ниппон Оил & Газ"

start_time = datetime.now()

sentences_combinations = [ [query, corpus_sentence] for corpus_sentence in corpus ]

similarity_scores = model.predict(sentences_combinations)
sorted_indexes = list(reversed(np.argsort(similarity_scores)))

print("Elapsed time: ", datetime.now() - start_time)

print("Query: ", query)
for index in sorted_indexes[:100]:
    print("{}\t{}".format(similarity_scores[index], corpus[index]))

Elapsed time:  0:00:00.292887
Query:  Джей Икс Ниппон Оил & Газ
0.0036430018953979015	IMPER
0.00330098788253963	DIH WEI
0.003111292375251651	Performance Brands
0.0029488657601177692	Distributionnow
0.002906346693634987	MAKYOL
0.0028811150696128607	COMERCIAL QUIMICA S.A.
0.0024721950758248568	PRASAD
0.002469714032486081	RK OVERSEA
0.0023324890062212944	ASHRON FREIGHT
0.0022913161665201187	AL ASHW
0.002259640721604228	CEVA FREIGHT
0.002258529420942068	SHANGHAI YAD
0.0022176860366016626	POLIURETANO COMERCIAL COSTA RICA S.A
0.0022008700761944056	AL MAITHA
0.002122299512848258	Warren Plant
0.002079807221889496	Amster S.A.
0.0020073633641004562	TRANSWORLD
0.0019084929954260588	ALHEBAN IMP
0.001788872992619872	Cms Shipping
0.0017574229277670383	Ocean Express International
0.0017500584945082664	Bank Of America
0.0016981085063889623	Euro Pack Inc.
0.0016010436229407787	Tri Union Seafoods
0.0015532684046775103	MEXICO C.V
0.0015470964135602117	INNOVATIVE BEDDING SOLUT
0.0015131032560020685	APX BR

In [50]:
corpus[0]

nan

# Обучение SBERT (Bi-Encoder) - создание эмбедингов предложений

In [None]:
df_train = pd.read_csv("/content/drive/MyDrive/Deep Learning on practice/NLP/companies_50_50.csv", index_col="pair_id")

df_dev = df_train.sample(256)
df_train.drop(df_dev.index, inplace=True)

train_samples = []
for idx, row, in df_train.iterrows():
    train_samples.append(InputExample(texts=[row["name_1"], row["name_2"]], label=float(row["is_duplicate"])))
    train_samples.append(InputExample(texts=[row["name_2"], row["name_1"]], label=float(row["is_duplicate"])))

In [None]:
# 1) distiluse-base-multilingual-cased-v1: 
#   Multilingual knowledge distilled version of multilingual Universal Sentence Encoder. 
#   Supports 15 languages: Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Portuguese, Russian, Spanish, Turkish.

# 2) distiluse-base-multilingual-cased-v2: 
#   Multilingual knowledge distilled version of multilingual Universal Sentence Encoder. 
#   This version supports 50+ languages, but performs a bit weaker than the v1 model.

# 3) paraphrase-multilingual-MiniLM-L12-v2:
#   Multilingual version of paraphrase-MiniLM-L12-v2, trained on parallel data for 50+ languages.

embedding_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")
model_save_path = '/content/drive/MyDrive/Deep Learning on practice/NLP/embedding_model_cased-v1-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

num_epochs = 1
batch_size = 32

optimizer = torch.optim.AdamW
optimizer_params = {'lr': 1e-4, 'eps': 1e-6}

train_dataloader = DataLoader(
    train_samples, 
    shuffle=True, 
    batch_size=batch_size
)
train_loss = losses.CosineSimilarityLoss(embedding_model)
evaluator = evaluation.EmbeddingSimilarityEvaluator(
    df_dev.name_1.tolist(),
    df_dev.name_2.tolist(),
    df_dev.is_duplicate.tolist(),
    batch_size=batch_size, 
    name='256-samples-dev'
)

evaluation_steps = int(len(train_dataloader) * 0.1)
warmup_steps = int(len(train_dataloader) * num_epochs * 0.05)

embedding_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    scheduler='WarmupLinear',
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    evaluation_steps=evaluation_steps,
    optimizer_class=optimizer,
    optimizer_params=optimizer_params,
    use_amp=True,
    save_best_model=True,
    output_path=model_save_path
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/31098 [00:00<?, ?it/s]

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv("/content/drive/MyDrive/Deep Learning on practice/NLP/train.csv", index_col="pair_id")

df_companies_in_duplicates1 = df[df['is_duplicate'] == 1][['name_1', 'name_2']].to_numpy()
companies_in_duplicates1 = set(df_companies_in_duplicates1.flatten())

df_companies_in_duplicates0 = df[df['is_duplicate'] == 0][['name_1', 'name_2']].to_numpy()
companies_in_duplicates0 = set(df_companies_in_duplicates0.flatten())

companies_only_in_duplicates0 = companies_in_duplicates0.difference(companies_in_duplicates1)
companies_only_in_duplicates1 = companies_in_duplicates1.difference(companies_in_duplicates0)
companies_in_both_sets = companies_in_duplicates0.intersection(companies_in_duplicates1)

print(len(companies_only_in_duplicates0), len(companies_in_both_sets), len(companies_only_in_duplicates1))

all_unique_companies = companies_only_in_duplicates0.union(companies_only_in_duplicates1).union(companies_in_both_sets)
print("Vocalubary size: ", len(all_unique_companies))

16628 1020 374
Vocalubary size:  18022


# Поиск по косинусному сходству semantic_search

In [None]:
embedder = embedding_model

In [None]:
corpus = list(all_unique_companies)

corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

In [None]:
queries = ['Nippon Oil', 'Nipon', 'Nipponyal', 'Nepon', 'JX Nippon', 'Michlen', 'Мишлен']

for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    print("\n\n=======================================")
    print("Query:", query)
    print("\nTop the most similar sentences in corpus:")

    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=50)
    hits = hits[0]      #Get the hits for the first query
    for hit in hits:
        print("(Score: {:.4f})\t".format(hit['score']), corpus[hit['corpus_id']])

# Оценка всего алгоритма поиска дубликатов на тестовой выборке "companies_test.csv" 

Bi-Encoder(top 50) -> Cross-Encoder(top 5)

Загрузка тестового датасета:

In [28]:
df = pd.read_csv("/content/drive/MyDrive/Deep Learning on practice/NLP/train.csv", index_col="pair_id")

all_unique_companies = list(set(df[["name_1", "name_2"]].to_numpy().flatten()))[1:]

Загрузка моделей и формирование базы данных (словаря) для поиска:

In [30]:
embedder = SentenceTransformer("/content/drive/MyDrive/Deep Learning on practice/NLP/embedding_model")
classificator = CrossEncoder("/content/drive/MyDrive/Deep Learning on practice/NLP/model_Zhenya")

corpus_sentences = all_unique_companies[:]
corpus_embeddings = embedder.encode(corpus_sentences)
db_SentenceToVector = { sentence: corpus_embeddings[i] for i, sentence in enumerate(corpus_sentences) }

# import pickle
# cached_corpuses = "/content/drive/MyDrive/Deep Learning on practice/NLP/train-embeddings"
# with open(cached_corpuses, "wb") as file:
#     cache = {"sentences": corpus_sentences, "embeddings": corpus_embeddings}
#     pickle.dump(cache, file)

Проходимся по всему тестовому набору и высчитаем F1 меру (TP, FP, TN, FN):

In [19]:
TP,FP,TN,FN = 0,0,0,0

def find_kNN_sentences_by_vectors(vector_query, corpus_embeddings, k=50):
    hits = util.semantic_search(vector_query, corpus_embeddings, top_k=k)[0]

    return [ corpus_sentences[hit['corpus_id']] for hit in hits ]

def find_k_most_similar_sentences(sentence_query, sentence_subset, model, k=5):
    sentences_combinations = [ [sentence_query, s] for s in sentence_subset ]

    similarity_scores = model.predict(sentences_combinations)
    sorted_indexes = list(reversed(np.argsort(similarity_scores)))

    return [ sentence_subset[index_max_score] for index_max_score in sorted_indexes[:k] ]

def is_duplicates_pred(name_1, name_2, classificator=classificator, top_k_vectors = 25, top_k_setnences = 14):
    # find 100 nearest sentences through cosine similarity of vectors
    name_1_vector = db_SentenceToVector[name_1]
    predicted_nearest_sentences = find_kNN_sentences_by_vectors(
        name_1_vector, 
        corpus_embeddings, 
        k=top_k_vectors
    )

    # find 5 most similar sentences using classification
    pred_nearest_sentences = find_k_most_similar_sentences(
        name_1,
        predicted_nearest_sentences,
        classificator,
        k=top_k_setnences
    )

    return name_2 in pred_nearest_sentences

start = datetime.now()
count = 0
for index_label, row in df.iterrows():
    if count > 0:
        break
    else:
        count += 1
        print(f"Row: {index_label}, Elapsed: {datetime.now() - start}")
    name_1 = row["name_1"]
    name_2 = row["name_2"]
    is_duplicate_actual = True if row["is_duplicate"] == 1 else False


    if is_duplicates_pred(name_1, name_2):
        if is_duplicate_actual:
            TP += 1
        else:
            FP += 1
    else:
        if is_duplicate_actual:
            FN += 1
        else:
            TN += 1


Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
F1_score = 2 * Precision * Recall / (Precision + Recall)

Precision, Recall, F1_score

(0.9942738985207571, 0.1786, 0.3028071790151864)

In [20]:
TP,FP,TN,FN

(6251, 36, 34964, 28749)

In [None]:
import pickle

embeddings_cache_path = "/content/drive/MyDrive/Deep Learning on practice/NLP/train-embeddings"
with open(embeddings_cache_path, "rb") as file:
    cached_data = pickle.load(file)
    corpus_sentences = cached_data["sentences"]
    corpus_embeddings = cached_data["embeddings"]

# embedder = SentenceTransformer("/content/drive/MyDrive/Deep Learning on practice/NLP/embedding_model")

embedding_size = 512
#Number of clusters used for faiss. Select a value 4*sqrt(N) to 16*sqrt(N) - https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
n_clusters = 1024
#We use Inner Product (dot-product) as Index. We will normalize our vectors to unit length, then is Inner Product equal to cosine similarity
quantizer = faiss.IndexFlatIP(embedding_size)
index = faiss.IndexIVFFlat(quantizer, embedding_size, n_clusters, faiss.METRIC_INNER_PRODUCT)

# First, we need to normalize vectors to unit length
normalized_corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1)[:, None]

# Then we train the index to find a suitable clustering
index.train(normalized_corpus_embeddings)
# Finally we add all embeddings to the index
index.add(normalized_corpus_embeddings)

faiss.write_index(index, "index")
index = faiss.read_index("index")

######### Search in the index ###########
print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))
top_k_hits = 10
while True:
    input_question = input("Please enter a question: ")

    start_time = datetime.now()
    question_embedding = embedder.encode(input_question)

    #FAISS works with inner product (dot product). When we normalize vectors to unit length, inner product is equal to cosine similarity
    question_embedding = question_embedding / np.linalg.norm(question_embedding)
    question_embedding = np.expand_dims(question_embedding, axis=0)

    # Search in FAISS. It returns a matrix with distances and corpus ids.
    distances, corpus_ids = index.search(question_embedding, top_k_hits)

    # We extract corpus ids and scores for the first query
    hits = [{'corpus_id': id, 'score': score} for id, score in zip(corpus_ids[0], distances[0])]
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    end_time = datetime.now()

    print("Input question:", input_question)
    print("Results (after {} seconds):".format(end_time-start_time))
    for hit in hits[0:top_k_hits]:
        print("\t{:.3f}\t{}".format(hit['score'], corpus_sentences[hit['corpus_id']]))

    # Approximate Nearest Neighbor (ANN) is not exact, it might miss entries with high cosine similarity
    # Here, we compute the recall of ANN compared to the exact results
    correct_hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k_hits)[0]
    correct_hits_ids = set([hit['corpus_id'] for hit in correct_hits])

    ann_corpus_ids = set([hit['corpus_id'] for hit in hits])
    if len(ann_corpus_ids) != len(correct_hits_ids):
        print("Approximate Nearest Neighbor returned a different number of results than expected")

    recall = len(ann_corpus_ids.intersection(correct_hits_ids)) / len(correct_hits_ids)
    print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(top_k_hits, recall * 100))

    if recall < 1:
        print("Missing results:")
        for hit in correct_hits[0:top_k_hits]:
            if hit['corpus_id'] not in ann_corpus_ids:
                print("\t{:.3f}\t{}".format(hit['score'], corpus_sentences[hit['corpus_id']]))
