In [48]:
# Load dictionaries for companies
import pickle
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

def load_pickle_file(file_path):
    """Deserialize and return the object stored in the pickle file at ``file_path``.

    The file is opened in binary mode and is closed again before the function
    returns.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    from a trusted source.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

# Both lookup tables are read from the glanos-data/ folder, resolved relative to
# the working directory. The matching '.hr' files (company.kw.vec.hr and
# company_short_definitions.kw.vec.hr) are not loaded here.
company_kw_vec, company_short_definitions_kw_vec = map(
    load_pickle_file,
    (
        'glanos-data/company.kw.vec',
        'glanos-data/company_short_definitions.kw.vec',
    ),
)

In [49]:
# The descriptor table is tab-separated despite its .csv extension. The column
# named 'Unnamed: 6' is discarded (presumably an empty artifact of a trailing
# tab in the file - TODO confirm).
cn_descriptors_df = (
    pd.read_csv('cn_descriptors_top.csv', sep='\t')
    .drop('Unnamed: 6', axis='columns')
)
cn_descriptors_df

Unnamed: 0,occurances,company,country,definition,additional definitions,keywords
0,1175619,Technavio,GB,technology research company,consultancy|technology research company,406:confident strategic decisions|403:healthca...
1,1111946,Cable News Network,US,,WarnerMedia company,1111917:warnermedia|13:warner media|6:states|6...
2,336949,Iiroc,CA,self-regulatory company,self-regulatory company,336866:investment dealers|330014:equity market...
3,242464,Rosen,US,law company,investor rights law company,241104:global investor rights|200641:purchaser...
4,196238,Schall,US,,shareholder rights litigation company,195736:national shareholder rights|179758:viol...
...,...,...,...,...,...,...
187934,50,Cdp North America,,research company,disclosure platform research company|investmen...,136:major corporations|136:financial markets|4...
187935,50,Money Carer Foundation,,social company,social company,33:own financial affairs|33:vulnerable adults|...
187936,50,Emerging Markets Private Equity,,trade company,capital company|trade company|trade private in...,40:emerging markets|36:changes|6:private inves...
187937,50,Bryght Ai,,intelligence company,conversational intelligence company|scoring pl...,2:research services


In [50]:
import torch
from tqdm import tqdm
# Register the tqdm-backed `progress_apply` method on pandas objects.
tqdm.pandas()

# Sentence encoder; it is instantiated regardless of the `model` switch below
# and is placed on the GPU when one is available.
sbert_model = SentenceTransformer(
    'all-MiniLM-L12-v2',
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

# Embedding source switch: "sbert" encodes text with `sbert_model`; any other
# value makes the embedding functions read precomputed vectors from a lookup table.
model = "glanos"

def aggregate_embeddings(keys, embeddings_dict):
    """Return the element-wise mean of the embeddings belonging to ``keys``.

    When the module-level ``model`` switch equals "sbert", every key is encoded
    with ``sbert_model``; for any other value the vector is taken from
    ``embeddings_dict[key]``, so a key missing from the table propagates the
    table's lookup error.

    Args:
        keys: iterable of phrases / lookup keys.
        embeddings_dict: mapping from key to its precomputed vector (only read
            when ``model`` is not "sbert").

    Returns:
        The mean over all collected vectors, computed along axis 0.
    """
    vectors = [
        sbert_model.encode(phrase) if model == "sbert" else embeddings_dict[phrase]
        for phrase in keys
    ]
    return np.mean(vectors, axis=0)


def get_embedding(row, definition_weight=0.6):
    """Add embedding columns to one row of the company descriptor table.

    Reads ``row['definition']`` (a single phrase) and
    ``row['additional definitions']`` (phrases separated by ``|``) and writes:

    * ``embedding_add_def`` - mean vector of the additional definitions,
    * ``embedding_def`` - vector of the main definition,
    * ``embedding`` - ``definition_weight * embedding_def +
      (1 - definition_weight) * embedding_add_def`` when both parts exist,
      otherwise whichever single part is available.

    With the module-level ``model`` switch set to "sbert", phrases are encoded
    with ``sbert_model``. For any other value, spaces are replaced by hyphens
    and the phrases are looked up in ``company_kw_vec``.

    Args:
        row: mapping-like row (a pandas Series when called through
            ``DataFrame.apply(..., axis=1)``). It is modified in place.
        definition_weight: share of the main definition in the combined
            embedding; the additional definitions get the remainder.

    Returns:
        The updated ``row``, or the scalar ``np.nan`` when both definition
        fields are missing.
    """
    definition = row['definition']
    additional_definitions = row['additional definitions']
    has_definition = not pd.isna(definition)
    has_additional = not pd.isna(additional_definitions)

    if not (has_definition or has_additional):
        # Nothing to embed. Kept as a scalar NaN for backward compatibility
        # with existing callers.
        return np.nan

    use_lookup = model != "sbert"

    if has_additional:
        if use_lookup:
            # Lookup keys are hyphenated, e.g. 'law company' -> 'law-company'.
            additional_definitions = additional_definitions.replace(' ', '-')
        row['embedding_add_def'] = aggregate_embeddings(
            additional_definitions.split('|'), company_kw_vec
        )

    if has_definition:
        if use_lookup:
            definition = definition.replace(' ', '-')
            row['embedding_def'] = np.array(company_kw_vec[definition])
        else:
            row['embedding_def'] = sbert_model.encode(definition)

    if has_definition and has_additional:
        row['embedding'] = (
            (1 - definition_weight) * row['embedding_add_def']
            + definition_weight * row['embedding_def']
        )
    elif has_definition:
        # Guard: a missing 'additional definitions' value is NaN (a float), so
        # string methods must not be called on it; use the definition alone.
        row['embedding'] = row['embedding_def']
    else:
        row['embedding'] = row['embedding_add_def']
    return row


# Compute the embedding columns row by row (axis=1) with a tqdm progress bar.
cn_descriptors_df = cn_descriptors_df.progress_apply(get_embedding, axis=1)
cn_descriptors_df

100%|█| 187939/187939 [01:28<00:00, 2129.42it/s]


Unnamed: 0,additional definitions,company,country,definition,embedding,embedding_add_def,embedding_def,keywords,occurances
0,consultancy|technology research company,Technavio,GB,technology research company,"[0.5584306798875331, -0.5956031799316407, 0.03...","[1.408119176980108, -1.4246260821819305, 0.239...","[-0.008028318174183369, -0.042921245098114014,...",406:confident strategic decisions|403:healthca...,1175619.0
1,WarnerMedia company,Cable News Network,US,,"[0.14874106645584106, -0.11565917730331421, -0...","[0.14874106645584106, -0.11565917730331421, -0...",,1111917:warnermedia|13:warner media|6:states|6...,1111946.0
2,self-regulatory company,Iiroc,CA,self-regulatory company,"[-0.8277637884020805, 0.4841603636741638, 0.75...","[-0.8277637884020805, 0.4841603636741638, 0.75...","[-0.8277637884020805, 0.4841603636741638, 0.75...",336866:investment dealers|330014:equity market...,336949.0
3,investor rights law company,Rosen,US,law company,"[-0.1707757626970609, 0.5263834818421552, -1.1...","[-0.5855022271474203, 1.3137756983439128, -2.5...","[0.105708546936512, 0.0014553375076502562, -0....",241104:global investor rights|200641:purchaser...,242464.0
4,shareholder rights litigation company,Schall,US,,"[-2.242476304372152, 4.226632833480835, -2.286...","[-2.242476304372152, 4.226632833480835, -2.286...",,195736:national shareholder rights|179758:viol...,196238.0
...,...,...,...,...,...,...,...,...,...
187934,disclosure platform research company|investmen...,Cdp North America,,research company,"[2.5978962277372677, 0.040041192186375585, -0....","[-0.5069922568897406, 2.1112175777864954, 0.60...","[4.667821884155273, -1.340743064880371, -0.666...",136:major corporations|136:financial markets|4...,50.0
187935,social company,Money Carer Foundation,,social company,"[0.04136781767010689, -0.1316673457622528, -0....","[0.04136781767010689, -0.1316673457622528, -0....","[0.04136781767010689, -0.1316673457622528, -0....",33:own financial affairs|33:vulnerable adults|...,50.0
187936,capital company|trade company|trade private in...,Emerging Markets Private Equity,,trade company,"[-0.45865677379899555, -0.1878563361035453, -0...","[-0.8413029482795132, -0.23755003180768755, -0...","[-0.20355932414531708, -0.15472720563411713, -...",40:emerging markets|36:changes|6:private inves...,50.0
187937,conversational intelligence company|scoring pl...,Bryght Ai,,intelligence company,"[0.020933592319488492, -2.0467818196862937, 0....","[-0.6148187071084976, -1.023301704786718, 1.11...","[0.4447684586048126, -2.7291018962860107, -0.5...",2:research services,50.0


In [42]:
# cn_descriptors_df.to_csv('cn_descriptors_sbert.tsv', sep='\t', index=False)

In [51]:
# Map lower-cased company name -> combined embedding. Entries whose company
# name is missing (NaN) are skipped; if two names collide after lower-casing,
# the later one wins.
company_embedding_dict = {
    name.lower(): vector
    for name, vector in cn_descriptors_df.set_index('company')['embedding'].to_dict().items()
    if not pd.isna(name)
}

In [52]:
prefix = "glanos-data/"
# The file name is derived from the active embedding source (`model`) so that a
# run with model == "sbert" cannot overwrite the glanos dictionary. For
# model == "glanos" this resolves to
# glanos-data/company_embedding_dicts_glanos.pickle.
with open(f'{prefix}company_embedding_dicts_{model}.pickle', 'wb') as f:
    pickle.dump(company_embedding_dict, f)