In [9]:
import os

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

In [5]:
# Location of the locally stored all-mpnet-base-v2 checkpoint.
# Set the MPNET_MODEL_PATH environment variable to point at a different copy;
# when it is unset, the original machine-specific path is used unchanged.
model_path = os.environ.get(
    'MPNET_MODEL_PATH',
    '/Users/apple/Documents/Priyesh/Pretrained-Models/all-mpnet-base-v2',
)

In [6]:
# Load the tokenizer and the encoder from the local checkpoint directory;
# local_files_only=True restricts loading to files already on disk.
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
# eval() puts the module in evaluation mode and returns the module itself,
# so the result can be bound directly.
model = AutoModel.from_pretrained(model_path, local_files_only=True).eval()
model

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [8]:
def get_embedding(text, tokenizer, model):
    """
    Generate embedding for a single text input.

    The text is tokenized (truncated to at most 512 tokens), run through
    ``model`` with gradient tracking disabled, and the per-token vectors in
    ``last_hidden_state`` are averaged into one vector. Positions that the
    tokenizer marks as padding in ``attention_mask`` are excluded from the
    average, so the result stays correct if padding is ever present (for
    example when a list of texts is passed instead of one string).

    Args:
        text: The string to embed.
        tokenizer: Callable that, given ``text`` and ``return_tensors="pt"``,
            returns a mapping of PyTorch tensors (e.g. a Hugging Face tokenizer).
        model: Callable that accepts those tensors as keyword arguments and
            returns an object exposing a ``last_hidden_state`` tensor.

    Returns:
        numpy.ndarray: The pooled embedding on CPU with size-1 dimensions
        squeezed away (a 1-D vector for a single text).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    tensors = dict(inputs.items())

    # Run the forward pass on the device the model lives on; without this a
    # model moved to an accelerator would receive CPU tensors and fail.
    device = getattr(model, "device", None)
    if device is not None:
        tensors = {
            name: value.to(device) if isinstance(value, torch.Tensor) else value
            for name, value in tensors.items()
        }

    with torch.no_grad():
        outputs = model(**tensors)

    hidden = outputs.last_hidden_state
    mask = tensors.get("attention_mask")
    if mask is None:
        # No mask available: fall back to a plain mean over the token axis.
        pooled = hidden.mean(dim=1)
    else:
        # Masked mean: zero out padded positions, then divide by the number
        # of real tokens (clamped to avoid division by zero).
        weights = mask.to(hidden.device).unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1e-9)

    embedding = pooled.squeeze().cpu().numpy()
    return embedding

In [10]:
# Read the incident texts from the CSV file; the path is relative to the
# notebook's working directory.
reference_data = pd.read_csv(filepath_or_buffer='./datasets/augmented_incident_data.csv')
reference_data

Unnamed: 0,incident_text
0,"User's VDI session has become unresponsive, an..."
1,User is requesting help to terminate an unresp...
2,"User reports that their browser, Google Chrome..."
3,User's external monitor connected via the dock...
4,User reports a Google issue on Safari on macOS...
5,User is experiencing an issue with their macOS...
6,A user is unable to log into their macOS devic...
7,A user reports their VDI session has become co...
8,A user is encountering a 'no writable volume' ...
9,User reports that after resetting their passwo...


In [11]:
# Embed each incident text one row at a time and store the resulting vector
# next to its text in the 'embeddings' column.
reference_data['embeddings'] = reference_data['incident_text'].apply(
    lambda incident: get_embedding(incident, tokenizer=tokenizer, model=model)
)
reference_data

Unnamed: 0,incident_text,embeddings
0,"User's VDI session has become unresponsive, an...","[-0.059252072, -0.2790128, 0.075180456, -0.000..."
1,User is requesting help to terminate an unresp...,"[-0.007532268, -0.29740474, 0.10553131, 0.0040..."
2,"User reports that their browser, Google Chrome...","[-0.03658011, 0.013987021, 0.0051338603, -0.07..."
3,User's external monitor connected via the dock...,"[0.018151335, -0.2390149, -0.0037846114, -0.10..."
4,User reports a Google issue on Safari on macOS...,"[0.11392706, 0.03241244, 0.013849311, -0.03235..."
5,User is experiencing an issue with their macOS...,"[-0.004155284, -0.18536225, -0.0073767602, -0...."
6,A user is unable to log into their macOS devic...,"[-0.048428815, -0.12064064, 0.035280805, -0.13..."
7,A user reports their VDI session has become co...,"[-0.10446989, -0.29269668, 0.07327446, -0.0091..."
8,A user is encountering a 'no writable volume' ...,"[0.08880931, 0.0034038313, 0.07514673, -0.1042..."
9,User reports that after resetting their passwo...,"[0.011912092, -0.06828265, 0.001154553, -0.011..."


In [14]:
# Pull the per-row embedding arrays out of the frame as a plain Python list.
embeddings = reference_data['embeddings'].tolist()
embeddings

[array([-5.92520721e-02, -2.79012799e-01,  7.51804560e-02, -3.73130606e-04,
         5.56291193e-02,  3.34991775e-02, -4.41120230e-02,  6.21415712e-02,
        -6.37820885e-02,  1.61965434e-02, -7.15186670e-02,  1.37459943e-02,
        -4.92231622e-02, -1.01025686e-01, -1.45735174e-01, -8.14500451e-02,
        -6.25173971e-02, -4.96006869e-02, -1.02393351e-01, -4.60491842e-03,
         1.40802875e-01,  2.74712760e-02, -4.89227138e-02,  2.67334133e-01,
        -1.08365752e-02,  3.64459492e-02,  1.31617621e-01, -3.22450660e-02,
         2.21423842e-02,  1.03575125e-01,  1.86395831e-02,  2.37057917e-02,
        -9.53625813e-02, -1.76020395e-02,  4.06104073e-06,  1.29554287e-01,
         1.10680662e-01,  1.22104608e-01, -1.33222034e-02,  1.54063702e-01,
        -4.37358804e-02, -4.06119302e-02,  4.57593892e-03, -1.48796774e-02,
        -5.97807206e-02, -1.56794250e-01,  5.81792258e-02,  1.15329949e-02,
         9.40622017e-02, -4.97972183e-02, -2.34955195e-02,  9.31907296e-02,
         1.0

In [15]:
# Expand the list of embedding arrays into a frame: one row per embedding,
# one integer-labelled column per vector component.
embeddings_df = pd.DataFrame(data=embeddings)
embeddings_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.059252,-0.279013,0.07518,-0.000373,0.055629,0.033499,-0.044112,0.062142,-0.063782,0.016197,...,-0.053832,0.052499,0.177803,-0.022856,0.0427,-0.017478,0.164186,0.033388,0.050657,0.146379
1,-0.007532,-0.297405,0.105531,0.004084,0.000323,0.019611,0.063162,-0.069703,-0.069789,-0.036812,...,-0.004625,0.114148,0.154704,-0.031659,0.027644,-0.032609,0.131081,-0.026677,0.087517,0.131096
2,-0.03658,0.013987,0.005134,-0.074834,0.035548,-0.002209,0.073642,0.036667,0.153281,-0.189148,...,0.085165,-0.125823,-0.018907,-0.026068,-0.070231,-0.1144,-0.039524,0.046524,-0.005687,0.123416
3,0.018151,-0.239015,-0.003785,-0.108068,0.087052,-0.061889,-0.030857,0.109313,-3.3e-05,-0.009569,...,-0.00794,-0.036364,0.0355,0.075341,-0.158183,-0.05525,-0.005231,0.010982,-0.038457,-0.030223
4,0.113927,0.032412,0.013849,-0.03236,-0.005377,-0.002299,0.154173,0.086788,0.066329,-0.111388,...,-0.014064,-0.009165,0.048901,0.029025,-0.007329,-0.119896,-0.10025,-0.003131,0.046407,0.086626
5,-0.004155,-0.185362,-0.007377,-0.106579,-0.01037,0.001617,-0.008806,0.135518,-0.017343,-0.074144,...,-0.06301,-0.01587,0.016011,0.041255,-0.12557,-0.038138,0.036522,0.006232,-0.090696,-0.041563
6,-0.048429,-0.120641,0.035281,-0.132428,-0.005946,-0.008156,0.017249,-0.079992,-0.040859,-0.039111,...,-0.104552,-0.15567,0.138165,-0.001775,-0.054584,-0.079338,0.134726,-0.013171,0.004735,0.080551
7,-0.10447,-0.292697,0.073274,-0.009176,0.039959,-0.014601,-0.023192,0.002253,-0.090588,-0.005003,...,-0.030538,0.049849,0.195242,-0.030649,0.015784,-0.057407,0.120863,-0.007596,0.081558,0.096304
8,0.088809,0.003404,0.075147,-0.104244,-0.035137,0.005064,0.017443,0.080627,-0.1254,-0.096691,...,-0.045811,0.030481,0.113729,0.038502,-0.108601,-0.001677,0.011144,-0.025241,0.115787,0.070103
9,0.011912,-0.068283,0.001155,-0.011395,-0.003807,-0.061979,-0.091799,0.001125,0.064057,-0.055079,...,-0.093476,-0.185227,0.152223,0.014739,-0.053811,-0.049843,0.123491,-0.072434,0.009443,0.064802


In [16]:
# Give every component column a string name of the form 'col_<position>'.
# The names are derived from the column position rather than from the current
# labels, so re-running this cell yields the same names instead of stacking
# prefixes ('col_col_0', ...).
embeddings_df.columns = [f'col_{position}' for position in range(len(embeddings_df.columns))]
embeddings_df

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_758,col_759,col_760,col_761,col_762,col_763,col_764,col_765,col_766,col_767
0,-0.059252,-0.279013,0.07518,-0.000373,0.055629,0.033499,-0.044112,0.062142,-0.063782,0.016197,...,-0.053832,0.052499,0.177803,-0.022856,0.0427,-0.017478,0.164186,0.033388,0.050657,0.146379
1,-0.007532,-0.297405,0.105531,0.004084,0.000323,0.019611,0.063162,-0.069703,-0.069789,-0.036812,...,-0.004625,0.114148,0.154704,-0.031659,0.027644,-0.032609,0.131081,-0.026677,0.087517,0.131096
2,-0.03658,0.013987,0.005134,-0.074834,0.035548,-0.002209,0.073642,0.036667,0.153281,-0.189148,...,0.085165,-0.125823,-0.018907,-0.026068,-0.070231,-0.1144,-0.039524,0.046524,-0.005687,0.123416
3,0.018151,-0.239015,-0.003785,-0.108068,0.087052,-0.061889,-0.030857,0.109313,-3.3e-05,-0.009569,...,-0.00794,-0.036364,0.0355,0.075341,-0.158183,-0.05525,-0.005231,0.010982,-0.038457,-0.030223
4,0.113927,0.032412,0.013849,-0.03236,-0.005377,-0.002299,0.154173,0.086788,0.066329,-0.111388,...,-0.014064,-0.009165,0.048901,0.029025,-0.007329,-0.119896,-0.10025,-0.003131,0.046407,0.086626
5,-0.004155,-0.185362,-0.007377,-0.106579,-0.01037,0.001617,-0.008806,0.135518,-0.017343,-0.074144,...,-0.06301,-0.01587,0.016011,0.041255,-0.12557,-0.038138,0.036522,0.006232,-0.090696,-0.041563
6,-0.048429,-0.120641,0.035281,-0.132428,-0.005946,-0.008156,0.017249,-0.079992,-0.040859,-0.039111,...,-0.104552,-0.15567,0.138165,-0.001775,-0.054584,-0.079338,0.134726,-0.013171,0.004735,0.080551
7,-0.10447,-0.292697,0.073274,-0.009176,0.039959,-0.014601,-0.023192,0.002253,-0.090588,-0.005003,...,-0.030538,0.049849,0.195242,-0.030649,0.015784,-0.057407,0.120863,-0.007596,0.081558,0.096304
8,0.088809,0.003404,0.075147,-0.104244,-0.035137,0.005064,0.017443,0.080627,-0.1254,-0.096691,...,-0.045811,0.030481,0.113729,0.038502,-0.108601,-0.001677,0.011144,-0.025241,0.115787,0.070103
9,0.011912,-0.068283,0.001155,-0.011395,-0.003807,-0.061979,-0.091799,0.001125,0.064057,-0.055079,...,-0.093476,-0.185227,0.152223,0.014739,-0.053811,-0.049843,0.123491,-0.072434,0.009443,0.064802
