Model Name: distilbert-base-uncased

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the prepared company data and the candidate labels.
# The paths are relative (same '../CSV/' folder the final export is written to)
# so the notebook does not depend on one user's absolute Windows directory.
# NOTE(review): assumes the notebook is run from its own folder, next to CSV/ — confirm.
data = pd.read_csv('../CSV/preped_data.csv')
labels = pd.read_csv('../CSV/labels.csv')

In [4]:
# Single source of truth for the checkpoint so the tokenizer and the model
# are always loaded from the same pretrained weights.
MODEL_NAME = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [5]:
def generate_embeddings(text):
    """Embed ``text`` by mean-pooling the model's last hidden state.

    The text is tokenized with the module-level ``tokenizer`` (padded,
    truncated to 512 tokens) and passed through the module-level ``model``
    with gradients disabled.

    Pooling is weighted by the attention mask, so padding positions do not
    contribute to the average. For a single string (no padding) this is the
    same as a plain mean over the token axis; for a padded batch it keeps
    padding from diluting the shorter sequences.

    Args:
        text: Input accepted by ``tokenizer`` (the notebook passes one string).

    Returns:
        numpy.ndarray: the pooled embedding(s) with size-1 dimensions
        squeezed out (a 1-D vector when a single string is given).
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Broadcast the (batch, seq) mask over the hidden dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # clamp avoids a division by zero if a row had no attended tokens.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (summed / token_counts).squeeze().numpy()

In [None]:
# Takes about 7 minutes to run.

def _embed_if_text(value):
    """Return the embedding of ``value`` when it is a string, otherwise None."""
    if isinstance(value, str):
        return generate_embeddings(value)
    return None

# One embedding column per text column; non-string cells (e.g. missing values) become None.
data['d_embedding'] = data['description'].apply(_embed_if_text)
data['b_embedding'] = data['business_tags'].apply(_embed_if_text)
labels['l_embedding'] = labels['label'].apply(_embed_if_text)

In [7]:
# Accumulators for the per-row similarity scores against every label:
# one list for the description embeddings, one for the business-tag embeddings.
description_similarity, business_tags_similarity = [], []

In [8]:
# Start from empty accumulators so that re-running this cell rebuilds the lists
# instead of appending a second copy of every row.
description_similarity = []
business_tags_similarity = []

# The label embeddings do not change inside the loop, so build the list once.
# NOTE(review): assumes every entry of labels['l_embedding'] is an embedding (no None) — confirm.
label_embeddings = labels['l_embedding'].tolist()
n_labels = len(labels)

# Iterate over the column values positionally; data[col][i] is a label-based
# lookup and only works while the index is exactly 0..n-1.
for d_embedding, b_embedding in zip(data['d_embedding'], data['b_embedding']):
    if d_embedding is not None:
        description_similarity.append(cosine_similarity([d_embedding], label_embeddings)[0])
    else:
        # No description embedding: contribute a zero score for every label.
        description_similarity.append([0] * n_labels)
    if b_embedding is not None:
        business_tags_similarity.append(cosine_similarity([b_embedding], label_embeddings)[0])
    else:
        # No business-tags embedding: contribute a zero score for every label.
        business_tags_similarity.append([0] * n_labels)

In [9]:
# Combined score per (row, label): description similarity plus business-tags similarity.
description_scores = np.array(description_similarity)
business_tags_scores = np.array(business_tags_similarity)
similarities = description_scores + business_tags_scores

In [10]:
def get_top_labels(similarities, labels, top_n=3):
    """Pick the ``top_n`` highest-scoring labels for every row.

    Args:
        similarities: 2-D array-like of scores, one row per item and one
            column per label (nested lists are accepted and converted).
        labels: Sequence of label names, indexed by column position.
        top_n: Number of labels to keep per row. If it exceeds the number of
            columns, all labels are returned. Values <= 0 yield an empty list
            for every row.

    Returns:
        list[list[tuple]]: for each row, ``(label, score)`` pairs ordered from
        highest to lowest score.
    """
    similarities = np.asarray(similarities)

    # Guard needed because the slice [:, -0:] would select *every* column
    # instead of none.
    if top_n <= 0:
        return [[] for _ in range(similarities.shape[0])]

    # argsort is ascending: keep the last top_n columns, then reverse them.
    top_indices = np.argsort(similarities, axis=1)[:, -top_n:][:, ::-1]
    return [
        [(labels[i], similarities[row_idx, i]) for i in indices]
        for row_idx, indices in enumerate(top_indices)
    ]

In [None]:
# Keep only as many score rows as there are rows in `data`, stacked into one 2-D array.
row_count = len(data)
similarities_array = np.vstack(similarities[:row_count])


In [12]:
# Attach the three best (label, score) pairs to every row of `data`.
label_names = labels['label'].tolist()
data['top_labels'] = get_top_labels(similarities_array, label_names, top_n=3)

In [13]:
# Export only the source text columns and the predicted labels (no index column).
output_columns = ['description', 'business_tags', 'top_labels']
labelled = data[output_columns]
labelled.to_csv('../CSV/labelled_data_distilbert.csv', index=False)