In [5]:
import importlib
import classification_training_utils
importlib.reload(classification_training_utils)
import utils
importlib.reload(utils)

import sys
import dimensionality_reduction
import json 
import numpy as np
from numpy import dot
import os
from numpy.linalg import norm
import pandas as pd
import pickle
import plotly.offline as pyo
import plotly.express as px
import random
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
tqdm.pandas()

from utils import aggregate_embeddings, cosine_similarity, print_similarity_samples, load_model, country_code_map, split_into_list, replace_nan_with, collect_column_values, create_replace_no_tags_embeddings
from classification_training_utils import get_big_consulting_df, get_news_df, get_top_values, get_relevant_classifications

In [6]:
params = {}
params['USE_REPLACE_DATA'] = True
params['USE_ORIGINAL_DATA'] = False
params['ALLOW_PAIRS_OF_SAME_COMPANIES'] = True
params['PAIR_RANDOM_ROWS'] = False
params['READ_SAVED_DATASET'] = False

In [7]:
def merge_rows(df, params):
    df1 = df.iloc[::2].reset_index(drop=True)  # odd-indexed rows
    df2 = df.iloc[1::2].reset_index(drop=True)  # even-indexed rows
    df1.columns = [f"{col}1" for col in df1.columns]
    df2.columns = [f"{col}2" for col in df2.columns]
    merged_df = pd.DataFrame(columns=df1.columns.tolist() + df2.columns.tolist())

    def get_random_row(df, random_state):
        random.seed(random_state)
        sampled_row_index = random.choice(df2.index)
        sampled_row_df2 = df2.loc[sampled_row_index]
        return sampled_row_index, sampled_row_df2

    if not params['PAIR_RANDOM_ROWS']:
        merged_df = pd.concat([df1, df2], axis=1).reset_index(drop=True)
    else:
        for i in tqdm(range(0, len(df2))):
            row_df1 = df1.loc[i]
            random_state = 42
            row_df2_index, row_df2 = get_random_row(df2, random_state)

            count = 0
            if not params['ALLOW_PAIRS_OF_SAME_COMPANIES']:
                while row_df2['company2'] == row_df1['company1']:
                    random_state -= 1
                    row_df2_index, row_df2 = get_random_row(df2, random_state)
        #             print(sampled_row_df2['company2'])
                    count += 1
                    if count == 50:
                        break
            merged_row = pd.concat([row_df1, row_df2], axis=0)
            merged_df.loc[len(merged_df.index)] = merged_row
            df2 = df2.drop(row_df2_index)

    return merged_df

In [8]:
keyword_acronym_map = {'ai': 'artificial intelligence', 'llm':'large language models', 'nlp': 'natural language processing'}

def preprocess_df(df, params):
    df = df.drop_duplicates(subset=['replace'])
    
    def replace_keywords_with_acronyms(row, keyword_acronym_map):
        keywords = row['keywords']
        updated_keywords = set()
        for kw in keywords:
            updated_keywords.add(keyword_acronym_map[kw] if kw in keyword_acronym_map.keys() else kw)
        row['keywords'] = list(updated_keywords)
        return row

#     df = df.drop(columns=["index", "tooltip", "score", "id", "company", "relationEntity", "relationEntityType", "country", "similarity", "Unnamed: 4", "Unnamed: 5"])
    cols_with_lists = ['keywords', 'classification']
    df = split_into_list(df, cols_with_lists)
    df = df.apply(lambda row: replace_keywords_with_acronyms(row, keyword_acronym_map), axis=1)
    df['id'] = df.index
    df = df.sample(frac=1, random_state=42)
    df = merge_rows(df, params)
    return df

def merge_split_and_save_df(df, params, dataset_dir, val_test_size=100):
    # Preprocess and merge rows
    df = preprocess_df(df, params)
    # Split
    test_df = df[:val_test_size]
    val_df = df[val_test_size:val_test_size*2]
    train_df = df[val_test_size*2:]
    train_df.dropna(subset=['replace2'], inplace=True)
#     # Save - too big to save
#     os.makedirs(dataset_dir)
#     train_df.to_csv(f'{dataset_dir}train.tsv', sep='\t')
#     val_df.to_csv(f'{dataset_dir}val.tsv', sep='\t')
#     test_df.to_csv(f'{dataset_dir}test.tsv', sep='\t')
    return train_df, val_df, test_df

def read_split_df(dataset_dir):
    cols_with_lists = ['keywords1', 'classification1', 'keywords2', 'classification2']
    test_df = pd.read_csv(f'{dataset_dir}test.tsv', sep='\t', index_col=0)
    test_df = split_into_list(test_df, cols_with_lists)
    val_df = pd.read_csv(f'{dataset_dir}val.tsv', sep='\t', index_col=0)
    val_df = split_into_list(val_df, cols_with_lists)
    train_df = pd.read_csv(f'{dataset_dir}train.tsv', sep='\t', index_col=0)
    if 'snippet2.1' in train_df.columns:
        train_df = train_df.drop(columns=['snippet2.1'])
    train_df = split_into_list(train_df, cols_with_lists)
    return train_df, val_df, test_df

def get_split_df(params, df=None, dataset_dir='../dataset/'):
    train_df, val_df, test_df = read_split_df(dataset_dir) if os.path.exists(dataset_dir) else merge_split_and_save_df(df, params, dataset_dir, val_test_size=100)
    return train_df, val_df, test_df

# big_consulting_df = get_big_consulting_df(params)
# ai_news_df = get_news_df(params, 'ai_news')
# car_news_df = get_news_df(params, 'car_news')

# ai_news_dataset_dir = '../dataset/ai_news/'
# ai_news_train_df, ai_news_val_df, ai_news_test_df = get_split_df(params, df=ai_news_df, dataset_dir=ai_news_dataset_dir)

# car_news_dataset_dir = '../dataset/car_news/'
# car_news_train_df, car_news_val_df, car_news_test_df = get_split_df(params, df=car_news_df, dataset_dir=car_news_dataset_dir)

# big_consulting_dataset_dir = '../dataset/big_consulting/'
# big_consulting_train_df, big_consulting_val_df, big_consulting_test_df = get_split_df(params, df=big_consulting_df, dataset_dir=big_consulting_dataset_dir)

train_df = pd.concat([ai_news_train_df, car_news_train_df], axis=0).reset_index(drop=True)
val_df = pd.concat([ai_news_val_df, car_news_val_df], axis=0).reset_index(drop=True)
test_df = pd.concat([ai_news_test_df, car_news_test_df], axis=0).reset_index(drop=True)

params["COLUMNS"] = train_df.columns

NameError: name 'ai_news_train_df' is not defined

In [54]:

# from collections import Counter

# all_keywords = ai_news_df['keywords'].str.split('|').explode().tolist()
# word_count = Counter(all_keywords)

# # Print the word frequencies
# for word, count in word_count.items():
#     print(f'{word}: {count}')


train_df['similarity'] = train_df.progress_apply(lambda row: cosine_similarity(row['embedding1'], row['embedding2']), axis=1)
val_df['similarity'] = val_df.progress_apply(lambda row: cosine_similarity(row['embedding1'], row['embedding2']), axis=1)
test_df['similarity'] = test_df.progress_apply(lambda row: cosine_similarity(row['embedding1'], row['embedding2']), axis=1)

train_df

100%|██████████████████████████████████████████████████████████| 219815/219815 [00:01<00:00, 113222.19it/s]
100%|█████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 74598.56it/s]
100%|█████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 78244.64it/s]


Unnamed: 0,replace1,snippet1,classification1,keywords1,embedding1,id1,replace2,snippet2,classification2,keywords2,embedding2,id2,similarity
0,"#PERSON, #JOBTITLE of #COMPANY, the holding co...","Sundar Pichai, CEO of Alphabet, the holding co...",[entity],[artificial intelligence],"[-0.06915243, 0.01455253, 0.024989886, -0.0257...",216067,The WisdomTree #COMPANY Value Fund ETF uses a ...,The WisdomTree International Al Enhanced Value...,[entity],[artificial intelligence],"[0.022610608, -0.12946172, -0.10784928, -0.035...",182225.0,0.303250
1,Media and Internet holding company #COMPANY is...,IAC + 1: Media and Internet holding company IA...,[other],[artificial intelligence],"[-0.002212706, -0.042485017, -0.08343706, -0.0...",7495,"#LOC’s AA-rated IT services group #COMPANY,",And Japan’s AA-rated IT services group NEC Cor...,[other],[artificial intelligence],"[-0.016172213, -0.011744792, -0.04392088, 0.00...",149398.0,0.381786
2,"over the next few years, since the new technol...",Gen-1 is driving the next supercycle of cloud ...,[employ],[artificial intelligence],"[-0.067335255, -0.00187942, 0.0024513635, -0.0...",49247,"#COMPANY , a fast-growing #LOC-based startup w...","Hugging Face , a fast-growing New York-based s...",[entity],[artificial intelligence],"[-0.0582173, -0.05877296, 0.0024392686, 0.0074...",90130.0,0.498639
3,#COMPANY launches Skills-based Talent planning...,Eightfold 1 Launches Skills-Based Talent Plann...,[entity],[artificial intelligence],"[-0.056243468, -0.0053915293, 0.011274071, 0.0...",231148,As ChatGPT – developed by #COMPANY is not avai...,As 1 – developed by Microsoft-backed 1 – is no...,[other],"[openai, artificial intelligence, chatgpt]","[-0.047819942, -0.0714631, 0.047958616, -0.007...",214641.0,0.278887
4,The #COMPANY is designed to simplify video pro...,The 1 Video Generator is designed to simplify ...,[entity],[artificial intelligence],"[-0.049500536, 0.021719, 0.010959194, -0.05274...",61653,"chatbots like ChatGPT and #COMPANY Bard, may b...","the used in chatbots like 1 and 1, may be maki...",[entity],"[google bard, chatgpt]","[-0.021107346, -0.07309405, 0.0432339, -0.0501...",46199.0,0.337115
...,...,...,...,...,...,...,...,...,...,...,...,...,...
219810,Because #PERSON ’ s contract with #COMPANY inc...,Because Graham Ford ’ s contract with FCS inc...,[other],[ford],"[-0.08120356, 0.042070776, 0.094466455, 0.0682...",87539,#COMPANY and #COMPANY had agreed for a #ORG,2023 Toyota Motor Corporation and Suzuki Motor...,[agreement],[suzuki],"[-0.0480265, 0.023894105, 0.027636005, 0.00332...",168350.0,0.412295
219811,#COMPANY Get Free Report on #DATE,General Motors (GM) - Get Free Report on April...,[entity],[general motors],"[-0.059067983, 0.03613742, -0.06019407, 0.0033...",137416,The #COMPANY’s commitment to achieving full cl...,The BMW Group’s commitment to achieving full c...,[entity],[bmw],"[0.017645555, 0.110141344, 0.0147629045, 0.006...",54909.0,0.350029
219812,A #COMPANY expert will now assume a larger #JO...,A Volvo Bus expert will now assume a larger di...,[other],[volvo],"[-0.03856746, 0.023804769, 0.021807693, -0.019...",110326,Reports suggest #PERSON’s private plan landed ...,Reports suggest Musk’s private plan landed in ...,[other],[tesla],"[0.060868353, 0.018765314, -0.038085416, 0.013...",119948.0,0.505790
219813,Automotive manufacturer #COMPANY announced tod...,Automotive manufacturer Mercedes-Benz announce...,[market_loc],[mercedes],"[-0.08559719, 0.06645149, 0.031096734, 0.02838...",103746,"By #DATE, #COMPANY aims to have reduced the en...","By 2030, BMW aims to have reduced the entire c...",[entity],[bmw],"[0.01392833, 0.1668224, 0.042746324, 0.0332090...",132008.0,0.416531


In [57]:
if 'embedding1' in train_df.columns:
    train_df = train_df.drop(columns=['embedding1', 'embedding2'])
    val_df = val_df.drop(columns=['embedding1', 'embedding2'])
    test_df = test_df.drop(columns=['embedding1', 'embedding2'])
save_dir = '../similarity-training-data/replace/'
train_df.to_csv(f'{save_dir}train.tsv', sep='\t')
val_df.to_csv(f'{save_dir}val.tsv', sep='\t')
test_df.to_csv(f'{save_dir}test.tsv', sep='\t')

In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

nltk.download('averaged_perceptron_tagger')

def split_dataframe(df, value):
    df_1 = df.loc[:, df.columns.str.contains(value)]
    df_2 = df.loc[:, ~df.columns.str.contains(value)]
    return df_1, df_2

def remove_trailing_number(column):
    if column[-1] in ['1', '2']:
        return column[:-1]
    return column

df_1, df_2 = split_dataframe(train_df, '1')
# Remove trailing number from column names
df_1.columns = df_1.columns.map(remove_trailing_number)
df_2.columns = df_2.columns.map(remove_trailing_number)

# Append the two DataFrames together
combined_df = pd.concat([df_1, df_2], axis=0).reset_index()

def has_verb(sentence, to_print=False):
    tokens = word_tokenize(sentence)
    tags = pos_tag(tokens)
    
    for token, tag in zip(tokens,tags):
        if to_print:
            print(token, ':', tag)
        if tag[1].startswith('VB'):
            return True
    
    return False

def has_no_verb(sentence):
    return not has_verb(sentence)

def has_no_verb_2(sentence):
    return not has_verb(sentence, to_print=True)


# pd.set_option('display.max_colwidth', None)

# # short_snippets = combined_df[(combined_df['snippet'].apply(str).str.len() < 40)][['snippet']] #  | (combined_df['snippet'].apply(str.lower).apply(has_no_verb))
# # print(short_snippets)
# combined_df = combined_df[(combined_df['snippet'].apply(str).str.len() >= 40)] #  | (combined_df['snippet'].apply(str).apply(has_verb))
# len(combined_df)
combined_df

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bartekjezierski/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,index,snippet,tooltip,score,id,embedding,company,relationEntity,relationEntityType,classification,country,keywords,similarity,replace,Unnamed: 4,Unnamed: 5
0,0.0,at KPMG where he focused on complex financial ...,(2023-04-21) Strategy|Company Info\n\nat KPMG ...,1.0,ID0,[ 6.15465902e-02 8.24777856e-02 -5.93600161e-...,KPMG A/S,,,"[strategy, company info]",US,[he],,,,
1,1.0,"In his last role, Prashant was playing the rol...","(2023-04-21) \n\nIn his last role, Prashant wa...",0.8,ID2,[ 7.53979683e-02 -2.51883809e-02 -9.49191768e-...,Infosys Limited,Prashant Ramanujan,person,[],JO,"[head, leader, role, key leader, regional head]",,,,
2,2.0,Wipro consolidates presence in foods with acqu...,(2023-04-21) Market Share Growth\n\nWipro cons...,1.0,ID4,[ 8.27922896e-02 2.77788024e-02 -3.00570838e-...,Wipro Limited,,,[market share growth],IN,[food],,,,
3,3.0,messenger RNA (mRNA) therapeutics and vaccines...,(2023-04-21) Agreements\n\nmessenger RNA (mRNA...,1.0,ID6,[-3.14713307e-02 2.28908807e-02 1.27594464e-...,IBM Corp.,,,[agreements],IN,"[vaccine, agreement]",,,,
4,4.0,IBM's purpose is to be the catalyst,(2023-04-21) \n\nIBM's purpose is to be the ca...,0.6,ID8,[ 1.07019925e-02 9.42185596e-02 -1.92928538e-...,IBM Corp.,,,[],IN,[catalyst],,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
453342,,BMW’s premium hatchback will get a complete re...,,,,"[-0.04795007, 0.07876511, 0.040450905, 0.00488...",,,,ENTITY,,bmw,,#COMPANY’s premium hatchback will get a comple...,,
453343,,Volkswagen has announced a new limited edition...,,,,"[-0.044824775, 0.10501017, 0.14470142, 0.00046...",,,,ENTITY,,volkswagen,,#COMPANY has announced a new limited edition v...,,
453344,,"Volkswagen’s premium SUV, the Touareg, is now ...",,,,"[-0.0110210255, 0.06328671, -0.011485885, 0.03...",,,,ENTITY,,volkswagen,,"#COMPANY’s premium SUV, the Touareg, is now on...",,
453345,,"The VW Touareg is available in Elegance, Black...",,,,"[-0.088718735, -0.006033144, 0.00040222268, 0....",,,,ENTITY,,volkswagen,,"The #COMPANY Touareg is available in Elegance,...",,


In [9]:
# Create a dictionary from big_consulting_export_replace dataframe
replace_snippet_embedding_dict = big_consulting_export_replace.set_index('snippet')['embedding'].to_dict()

print(len(combined_df))

# Iterate through train_df and update the 'embedding' column
for index, row in combined_df.iterrows():
    snippet = row['snippet']
    if snippet in replace_snippet_embedding_dict:
        combined_df.at[index, 'embedding'] = replace_snippet_embedding_dict[snippet]
    else:
        # Drop the row if snippet is not in the dictionary
        combined_df = combined_df.drop(index)

# Reset the index of the updated train_df
combined_df = combined_df.reset_index(drop=True)

print(len(combined_df))


12916
505


In [9]:
from tqdm import tqdm

if params['READ_SAVED_DATASET']:
    train_df = merge_rows(combined_df, params)


    train_df['embedding2'] = train_df['embedding2'].apply(lambda x: 
                               np.fromstring(
                                   x.replace('\n','')
                                    .replace('[','')
                                    .replace(']','')
                                    .replace('  ',' '), sep=' '))
    train_df['embedding1'] = train_df['embedding1'].apply(lambda x: 
                               np.fromstring(
                                   x.replace('\n','')
                                    .replace('[','')
                                    .replace(']','')
                                    .replace('  ',' '), sep=' '))

    train_df['similarity'] = train_df.progress_apply(lambda row: cosine_similarity(row['embedding1'], row['embedding2']), axis=1)
    if 'company' in params['COLUMNS']:
        count = len(train_df[train_df['company1'] == train_df['company2']])
        print("Number of rows with the same value for company1 and company2:", count)
        
def cosine_similarity(a, b):
#     print(a)
    try:
        a = np.array(list(a), dtype=np.float32)
        b = np.array(list(b), dtype=np.float32)
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    except ValueError as ve:
        print('ValueError')
        return 0.0


# merged_df['similarity'] = merged_df.progress_apply(lambda row: cosine_similarity(row['embedding1'], row['embedding2']), axis=1)


In [10]:
def convert_ndarray(x):
    if isinstance(x, np.ndarray):
        return 0.0
    else:
        return x

    
def find_company_name(words, company_list):
    for i in range(len(words), 0, -1):
        substring = ' '.join(words[:i])
        if substring in company_list:
            return substring
    return None


def get_company_names(company, relation_entity, relation_entity_type, company_list):
    if not pd.isna(company):
        possible_company_names = [company.lower()]
    else:
        possible_company_names = []
    company_names = []
    if relation_entity_type == 'company' and not pd.isna(relation_entity):
        possible_company_names.append(relation_entity.lower())
    for possible_company_name in possible_company_names:
        if possible_company_name in company_list:
            company_name = possible_company_name
            company_names.append(possible_company_name)
        else:
            possible_company_names = possible_company_name.split(' ')
            company_name = find_company_name(possible_company_names, company_list)
            if company_name is not None:
                company_names.append(company_name)
    return company_names


def company_similarity(row):
    company_list = list(company_embedding_dict.keys())
    companies1 = get_company_names(row['company1'], row['relationEntity1'], row['relationEntityType1'], company_list)
    companies2 = get_company_names(row['company2'], row['relationEntity2'], row['relationEntityType2'], company_list)
    
    embeddings1 = aggregate_embeddings(companies1, company_embedding_dict)
    embeddings2 = aggregate_embeddings(companies2, company_embedding_dict)
        
    if companies1 == [] or companies2 == []:
        similarity = row['similarity']
    else:
        similarity = cosine_similarity(embeddings1, embeddings2)
    row['company_similarity'], row['companies1'], row['companies2'] = convert_ndarray(similarity), companies1, companies2
    
    return row

def keyword_with_company_similarity(row, keyword_dict):
    '''
    If the keyword is a company name, use company embeddings instead of SBERT
    '''
    company_list = list(company_embedding_dict.keys())
    temp_dict_1 = {}
    temp_dict_2 = {}
    
    for kw in row['keywords1']: 
        if kw in company_list:
            temp_dict_1[kw] = company_embedding_dict[kw]
        elif kw in keyword_dict.keys():
            temp_dict_1[kw] = keyword_dict[kw]
                
    for kw in row['keywords2']: 
        if kw in company_list:
            temp_dict_2[kw] = company_embedding_dict[kw]
        elif kw in keyword_dict.keys():
            temp_dict_2[kw] = keyword_dict[kw]
            
    embeddings1 = aggregate_embeddings(temp_dict_1.keys(), temp_dict_1)
    embeddings2 = aggregate_embeddings(temp_dict_2.keys(), temp_dict_2)
        
    if list(temp_dict_1.keys()) == [] or list(temp_dict_2.keys()) == []:
        similarity = row['similarity']
    else:
        similarity = cosine_similarity(embeddings1, embeddings2)
    row['keyword_similarity'] = convert_ndarray(similarity)
    
    return row

def convert_ndarray(x):
    if isinstance(x, np.ndarray):
        return 0.0
    else:
        return x

def company_similarity_df(df):
    df = df.progress_apply(lambda row: company_similarity(row), axis=1)
    print_similarity_samples(df, 'company_similarity', sample_size=5)
    return df

def keyword_similarity_with_companies_df(df, keyword_dict, verbose=False):
    df = df.progress_apply(lambda row: keyword_with_company_similarity(row, keyword_dict), axis=1)
    if verbose:
        print_similarity_samples(df, 'keyword_similarity', sample_size=5)
    return df


def column_similarity(row, column_name, embedding_dict):
    '''
    If keywords or classification are empty, set the simialrity same as similarity of the whole sentence
    '''
    column1, column2 = row[f'{column_name}1'], row[f'{column_name}2']
    if type(column1) != list:
        column1 = [column1]
        column2 = [column2]
        if pd.isna(column1) or pd.isna(column2):
            row[f'{column_name}_similarity'] = row['similarity']
            return row
    else:
        if len(column1) == 0 or len(column2) == 0:
            row[f'{column_name}_similarity'] = row['similarity']
            return row
            
    to_lower = column_name != 'country'
    embeddings1 = aggregate_embeddings(column1, embedding_dict, to_lower=to_lower)
    embeddings2 = aggregate_embeddings(column2, embedding_dict, to_lower=to_lower)
    row[f'{column_name}_similarity'] = convert_ndarray(cosine_similarity(embeddings1, embeddings2))
    
    return row

In [11]:
# Load company embeddings
prefix = '../glanos-data/embeddings/'
# with open(f'{prefix}company_embedding_dicts_glanos_0.4main.pickle', 'rb') as f:
# with open(f'{prefix}company_embedding_dicts_glanos.pickle', 'rb') as f:
with open(f'{prefix}company_embedding_dicts_sbert.pickle', 'rb') as f:
    company_embedding_dict = pickle.load(f)
with open(f'{prefix}kw_embedding_dict.pickle', 'rb') as f:
    kw_embeddings = pickle.load(f)
# with open(f'{prefix}class_embedding_dict.pickle', 'rb') as f:
#     class_embeddings = pickle.load(f)
with open(f'{prefix}car_news_classification.pickle', 'rb') as f:
    classsification_embeddings = pickle.load(f)
    with open(f'{prefix}ai_news_classification.pickle', 'rb') as f:
        classsification_embeddings.update(pickle.load(f))

country_embeddings = country_code_map()


In [78]:
# test_df = company_similarity_df(test_df)
# val_df = company_similarity_df(val_df)
# train_df = company_similarity_df(train_df)

In [79]:
# test_df = test_df.progress_apply(lambda row: column_similarity(row, 'country', country_embeddings), axis=1)
# val_df = val_df.progress_apply(lambda row: column_similarity(row, 'country', country_embeddings), axis=1)
# train_df = train_df.progress_apply(lambda row: column_similarity(row, 'country', country_embeddings), axis=1)

  1%|▋                                                                     | 1/100 [00:00<00:04, 22.23it/s]


KeyError: 'country1'

In [14]:
save_dir = '../similarity-training-data/replace/'
train_df = pd.read_csv(f'{save_dir}train.tsv', sep='\t')
val_df = pd.read_csv(f'{save_dir}val.tsv', sep='\t')
test_df = pd.read_csv(f'{save_dir}test.tsv', sep='\t')

train_df

Unnamed: 0.1,Unnamed: 0,replace1,snippet1,classification1,keywords1,id1,replace2,snippet2,classification2,keywords2,id2,similarity
0,0,"#PERSON, #JOBTITLE of #COMPANY, the holding co...","Sundar Pichai, CEO of Alphabet, the holding co...",['entity'],['artificial intelligence'],216067,The WisdomTree #COMPANY Value Fund ETF uses a ...,The WisdomTree International Al Enhanced Value...,['entity'],['artificial intelligence'],182225.0,0.303250
1,1,Media and Internet holding company #COMPANY is...,IAC + 1: Media and Internet holding company IA...,['other'],['artificial intelligence'],7495,"#LOC’s AA-rated IT services group #COMPANY,",And Japan’s AA-rated IT services group NEC Cor...,['other'],['artificial intelligence'],149398.0,0.381786
2,2,"over the next few years, since the new technol...",Gen-1 is driving the next supercycle of cloud ...,['employ'],['artificial intelligence'],49247,"#COMPANY , a fast-growing #LOC-based startup w...","Hugging Face , a fast-growing New York-based s...",['entity'],['artificial intelligence'],90130.0,0.498638
3,3,#COMPANY launches Skills-based Talent planning...,Eightfold 1 Launches Skills-Based Talent Plann...,['entity'],['artificial intelligence'],231148,As ChatGPT – developed by #COMPANY is not avai...,As 1 – developed by Microsoft-backed 1 – is no...,['other'],"['openai', 'artificial intelligence', 'chatgpt']",214641.0,0.278887
4,4,The #COMPANY is designed to simplify video pro...,The 1 Video Generator is designed to simplify ...,['entity'],['artificial intelligence'],61653,"chatbots like ChatGPT and #COMPANY Bard, may b...","the used in chatbots like 1 and 1, may be maki...",['entity'],"['google bard', 'chatgpt']",46199.0,0.337116
...,...,...,...,...,...,...,...,...,...,...,...,...
219810,219810,Because #PERSON ’ s contract with #COMPANY inc...,Because Graham Ford ’ s contract with FCS inc...,['other'],['ford'],87539,#COMPANY and #COMPANY had agreed for a #ORG,2023 Toyota Motor Corporation and Suzuki Motor...,['agreement'],['suzuki'],168350.0,0.412295
219811,219811,#COMPANY Get Free Report on #DATE,General Motors (GM) - Get Free Report on April...,['entity'],['general motors'],137416,The #COMPANY’s commitment to achieving full cl...,The BMW Group’s commitment to achieving full c...,['entity'],['bmw'],54909.0,0.350029
219812,219812,A #COMPANY expert will now assume a larger #JO...,A Volvo Bus expert will now assume a larger di...,['other'],['volvo'],110326,Reports suggest #PERSON’s private plan landed ...,Reports suggest Musk’s private plan landed in ...,['other'],['tesla'],119948.0,0.505790
219813,219813,Automotive manufacturer #COMPANY announced tod...,Automotive manufacturer Mercedes-Benz announce...,['market_loc'],['mercedes'],103746,"By #DATE, #COMPANY aims to have reduced the en...","By 2030, BMW aims to have reduced the entire c...",['entity'],['bmw'],132008.0,0.416531


In [13]:
# '''
# Prepare big dataset with classification and keyword similarities 
# '''
# from ast import literal_eval

# ai_news_df = get_news_df(params, 'ai_news')
# car_news_df = get_news_df(params, 'car_news')

# ai_news_dataset_dir = '../dataset/ai_news/'
# ai_news_train_df, ai_news_val_df, ai_news_test_df = get_split_df(params, df=ai_news_df, dataset_dir=ai_news_dataset_dir)

# car_news_dataset_dir = '../dataset/car_news/'
# car_news_train_df, car_news_val_df, car_news_test_df = get_split_df(params, df=car_news_df, dataset_dir=car_news_dataset_dir)

# train_df = pd.concat([ai_news_train_df, car_news_train_df], axis=0).reset_index(drop=True)
# val_df = pd.concat([ai_news_val_df, car_news_val_df], axis=0).reset_index(drop=True)
# test_df = pd.concat([ai_news_test_df, car_news_test_df], axis=0).reset_index(drop=True)

# params["COLUMNS"] = train_df.columns

# prefix = "../glanos-data/embeddings/"

# with open(f'{prefix}ai_news_replace_no_tags.pickle', 'rb') as f:
#     replace_no_tags_embeddings = pickle.load(f)
#     with open(f'{prefix}car_news_replace_no_tags.pickle', 'rb') as f:
#         replace_no_tags_embeddings.update(pickle.load(f))

# test_df = create_replace_no_tags_embeddings(test_df, replace_no_tags_embeddings)
# val_df = create_replace_no_tags_embeddings(val_df, replace_no_tags_embeddings)
# train_df = create_replace_no_tags_embeddings(train_df, replace_no_tags_embeddings)






# test_df['similarity'] = test_df.progress_apply(lambda row: cosine_similarity(row['embedding1'], row['embedding2']), axis=1)
# val_df['similarity'] = test_df.progress_apply(lambda row: cosine_similarity(row['embedding1'], row['embedding2']), axis=1)
# train_df['similarity'] = test_df.progress_apply(lambda row: cosine_similarity(row['embedding1'], row['embedding2']), axis=1)

# def preprocess_column(df, column_name, words_to_remove=[]):
#     df[f'{column_name}1'] = df[f'{column_name}1'].apply(lambda x: x if type(x) == list else literal_eval(x))
#     df[f'{column_name}2'] = df[f'{column_name}2'].apply(lambda x: x if type(x) == list else literal_eval(x))
#     if len(tokens_to_remove) > 0:
#         for token_to_remove in tokens_to_remove:
#             df[f'{column_name}1'] = df[f'{column_name}1'].apply(lambda tokens: [token for token in tokens if token != token_to_remove] if token_to_remove in tokens else tokens)
#             df[f'{column_name}2'] = df[f'{column_name}2'].apply(lambda tokens: [token for token in tokens if token != token_to_remove] if token_to_remove in tokens else tokens)
#     return df

# test_df = preprocess_column(test_df, 'keywords', words_to_remove=['artificial intelligence'])
# val_df = preprocess_column(val_df, 'keywords', words_to_remove=['artificial intelligence'])
# train_df = preprocess_column(train_df, 'keywords', words_to_remove=['artificial intelligence'])

# test_df = keyword_similarity_with_companies_df(test_df, kw_embeddings)
# val_df = keyword_similarity_with_companies_df(val_df, kw_embeddings)
# train_df = keyword_similarity_with_companies_df(train_df, kw_embeddings)

# def preprocess_classification(df):
#     df['classification1'] = df['classification1'].apply(lambda x: [] if 'entity' in x or 'other' in x else x)
#     df['classification2'] = df['classification2'].apply(lambda x: [] if 'entity' in x or 'other' in x else x)
#     df['classification1'] = df['classification1'].apply(lambda x: x if type(x) == list else literal_eval(x))
#     df['classification2'] = df['classification2'].apply(lambda x: x if type(x) == list else literal_eval(x))
#     return df

# test_df = preprocess_column(test_df, 'classification', words_to_remove=['entity', 'other'])
# val_df = preprocess_column(val_df, 'classification', words_to_remove=['entity', 'other'])
# train_df = preprocess_column(train_df, 'classification', words_to_remove=['entity', 'other'])

# test_df = test_df.progress_apply(lambda row: column_similarity(row, 'classification', classsification_embeddings), axis=1)
# val_df = val_df.progress_apply(lambda row: column_similarity(row, 'classification', classsification_embeddings), axis=1)
# train_df = train_df.progress_apply(lambda row: column_similarity(row, 'classification', classsification_embeddings), axis=1)

test_df = company_similarity_df(test_df)
val_df = company_similarity_df(val_df)
train_df = company_similarity_df(train_df)


  0%|        | 1/200 [00:00<00:09, 21.08it/s]


KeyError: 'company1'

In [16]:
test_df = keyword_similarity_with_companies_df(test_df, kw_embeddings)
val_df = keyword_similarity_with_companies_df(val_df, kw_embeddings)
train_df = keyword_similarity_with_companies_df(train_df, kw_embeddings)

100%|███████| 200/200 [00:22<00:00,  8.89it/s]


Ford dealers will replace the steering wheel clock spring, free of charge,
Fiat has marked the announcement by dunking a grey-painted version of its new 600e into a vat of orange paint.
0.6619002819061279

---

Executives from both sides signed an agreement in Seoul on Friday to construct a 50-50 EV battery cell joint venture in Savannah, Ga., where Hyundai Motor Group will build a new EV factory, the South Korean auto conglomerate said in an emailed statement.
The collaboration began in 2022 when Seat Unique secured an exclusive four-year partnership as the club’s Official Online Premium Ticketing and Hospitality Sales Platform.
0.6553218364715576

---

 BMW is once again a protagonist at the Cannes Film Festival with a fleet of over two hundred electrified cars.
Thankfully some things don’t change at Volvo and the seats are profoundly comfortable, supportive and in some versions covered in the loveliest wool-blend fabr
0.6540169715881348

---

Like American Water, WillScot Mobile Min

100%|███████| 200/200 [00:23<00:00,  8.67it/s]


The plant, which turned 50 this year, is also where BMW builds the 7-Series, 8-Series, and iX.
Laura Pallasch has been working as a Senior Account Manager at the T-Systems VW Account House for a year
0.15535695850849152

---

Total Volvo Trucks & Buses sales in May stood at 213 units, a steep growth of 60.2% from 133 units, YoY.
Audi trainees gave a famous classic automobile an electrified makeo
0.6827610731124878

---

Starting with the basics, TSLA closed at $293.34 on the previous day, July 18, 2023.
 “PMPs are a perfect convenience store offering, and the new Maryland Choc Chips Mini PMP offers a great opportunity for driving growth in the ‘sweet biscuit’ category,” Taylor added.
0.6513145565986633

---

 Com by Alastair McLeod called the Bell tolls for Fiat.
Exactly two years ago, down to a day, BMW Motorrad introduced us to the CE 04, an electrifying, and frankly revolutionary proposition for city travel.
0.6761146783828735

---

 "Kia's and Hyundai's failure to install standard 


Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.


invalid value encountered in double_scalars


Mean of empty slice.


invalid value encountered in double_scalars

100%|█| 219815/219815 [12:02:46<00:00,  5.07it

Dinger’s Mini Golf Clay Frost, a Meriden resident and former documentary filmmaker, had visited Cote’s range and shop for years without even knowing there had been a mini golf course on the property.
In laying hens, small intestine morphology characteristics such as the VH, VW, CD, and VH/CD ratio are used to assess the health and nutrient absorptive cap
0.09521779417991638

---

Gopal Vittal, Chief Executive of Bharti Airtel explains how 1 and 1 tools are disrupting the sector
PRODUCT
0.0

---

Make MINI Car Model MINI Cooper Car Model Variant MINI Cooper John Cooper Works 3-Door P3,980,000.00</p><p>Pr
Rapid Cabriolet Is The VW Group's First Made-In-India 'Student Car
0.09521779417991638

---

The MINI Cooper SE had 96 sales, also selling more in 2022 than all BEVs combined in 2020
VW Group will first concentrate on its Software 1.2 platform, which will be rolled out starting next year in premium all-electric
0.09521779417991638

---

Besides the incident that leaked users’ conversati




In [11]:
def print_df(df):
    for index, row in df.iterrows():
        print(row['snippet1'])
        print(row['snippet2'])
        print("\n---\n")  # print a line for separation
        
def filter_cols(df):
    desired_columns = ['snippet1', 'snippet2']
    for col in df.columns:
        if 'similarity' in col:
            desired_columns.append(col)
            if '_' in col:
                desired_columns.append(col.split('_')[0]+'1')
                desired_columns.append(col.split('_')[0]+'2')
    if 'label' in df.columns:
        desired_columns.append('label')
    desired_columns = [col for col in desired_columns if col in df.columns]
    df = df.loc[:, desired_columns]
    return df

In [14]:
prefix = '../similarity-training-data/sbert-company-filtered/'
filter_cols(test_df).to_csv(f'{prefix}test.tsv', sep='\t')
filter_cols(val_df).to_csv(f'{prefix}val.tsv', sep='\t')
filter_cols(train_df).to_csv(f'{prefix}train.tsv', sep='\t')

In [17]:
# (-0.001, 0.1]     453
# (0.1, 0.2]       1140
# (0.2, 0.3]       1119
# (0.3, 0.4]        827
# (0.4, 0.5]        821
# (0.5, 0.6]        855
# (0.6, 0.7]        705
# (0.7, 0.8]        423
# (0.8, 0.9]        182
# (0.9, 1.0]        181 
# 1 - basically the same, the only differences are a few extra words in front or after (length check)


# 5 The two sentences are completely equivalent, as they mean the same thing. (i.e. talk about the same company, people, event, values)
# 4 The two sentences are mostly equivalent, but some unimportant details differ. (i.e. same company, people, event but different wording or different values)
# 3 The two sentences are roughly equivalent, but some important information differs/missing. (e.g. same event but different company or same company and similar event)
# 2 The two sentences are not equivalent, but share some details. (e.g. same company but different event)
# 1 The two sentences are not equivalent, but are on the same topic.
# 0 The two sentences are completely dissimilar.

# TODO - slightly modify giving 1 when one is a substring of another

In [65]:
train_df

Unnamed: 0,replace1,snippet1,classification1,keywords1,embedding1,id1,replace2,snippet2,classification2,keywords2,embedding2,id2,replace_no_tags1,replace_no_tags2,similarity,keyword_similarity,classification_similarity
0,"#PERSON, #JOBTITLE of #COMPANY, the holding co...","Sundar Pichai, CEO of Alphabet, the holding co...",[],[artificial intelligence],"[-0.06915243, 0.01455253, 0.024989886, -0.0257...",216067,The WisdomTree #COMPANY Value Fund ETF uses a ...,The WisdomTree International Al Enhanced Value...,[],[artificial intelligence],"[0.022610608, -0.12946172, -0.10784928, -0.035...",182225.0,of the holding company of recently stated in a...,The WisdomTree Value Fund ETF uses a proprieta...,0.564688,1.000000,0.564688
1,Media and Internet holding company #COMPANY is...,IAC + 1: Media and Internet holding company IA...,[],[artificial intelligence],"[-0.002212706, -0.042485017, -0.08343706, -0.0...",7495,"#LOC’s AA-rated IT services group #COMPANY,",And Japan’s AA-rated IT services group NEC Cor...,[],[artificial intelligence],"[-0.016172213, -0.011744792, -0.04392088, 0.00...",149398.0,Media and Internet holding company is implemen...,AA-rated IT services group,0.167768,1.000000,0.167768
2,"over the next few years, since the new technol...",Gen-1 is driving the next supercycle of cloud ...,[employ],[artificial intelligence],"[-0.067335255, -0.00187942, 0.0024513635, -0.0...",49247,"#COMPANY , a fast-growing #LOC-based startup w...","Hugging Face , a fast-growing New York-based s...",[],[artificial intelligence],"[-0.0582173, -0.05877296, 0.0024392686, 0.0074...",90130.0,"over the next few years, since the new technol...",", a fast-growing startup which enjoyed a valua...",0.252154,1.000000,0.252154
3,#COMPANY launches Skills-based Talent planning...,Eightfold 1 Launches Skills-Based Talent Plann...,[],[artificial intelligence],"[-0.056243468, -0.0053915293, 0.011274071, 0.0...",231148,As ChatGPT – developed by #COMPANY is not avai...,As 1 – developed by Microsoft-backed 1 – is no...,[],"[chatgpt, artificial intelligence, openai]","[-0.047819942, -0.0714631, 0.047958616, -0.007...",214641.0,"launches Skills-based Talent planning, empower...",As ChatGPT – developed by is not available for...,0.002902,0.729112,0.002902
4,The #COMPANY is designed to simplify video pro...,The 1 Video Generator is designed to simplify ...,[],[artificial intelligence],"[-0.049500536, 0.021719, 0.010959194, -0.05274...",61653,"chatbots like ChatGPT and #COMPANY Bard, may b...","the used in chatbots like 1 and 1, may be maki...",[],"[chatgpt, google bard]","[-0.021107346, -0.07309405, 0.0432339, -0.0501...",46199.0,The is designed to simplify video production a...,"chatbots like ChatGPT and Bard, may be making ...",0.122614,0.198170,0.122614
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219810,Because #PERSON ’ s contract with #COMPANY inc...,Because Graham Ford ’ s contract with FCS inc...,[],[ford],"[-0.08120356, 0.042070776, 0.094466455, 0.0682...",87539,#COMPANY and #COMPANY had agreed for a #ORG,2023 Toyota Motor Corporation and Suzuki Motor...,[agreement],[suzuki],"[-0.0480265, 0.023894105, 0.027636005, 0.00332...",168350.0,Because ’ s contract with included both conven...,and had agreed for a,,0.709216,
219811,#COMPANY Get Free Report on #DATE,General Motors (GM) - Get Free Report on April...,[],[general motors],"[-0.059067983, 0.03613742, -0.06019407, 0.0033...",137416,The #COMPANY’s commitment to achieving full cl...,The BMW Group’s commitment to achieving full c...,[],[bmw],"[0.017645555, 0.110141344, 0.0147629045, 0.006...",54909.0,Get Free Report on,The commitment to achieving full climate neutr...,,0.983244,
219812,A #COMPANY expert will now assume a larger #JO...,A Volvo Bus expert will now assume a larger di...,[],[volvo],"[-0.03856746, 0.023804769, 0.021807693, -0.019...",110326,Reports suggest #PERSON’s private plan landed ...,Reports suggest Musk’s private plan landed in ...,[],[tesla],"[0.060868353, 0.018765314, -0.038085416, 0.013...",119948.0,A expert will now assume a larger role with later,Reports suggest private plan landed in the Chi...,,0.953276,
219813,Automotive manufacturer #COMPANY announced tod...,Automotive manufacturer Mercedes-Benz announce...,[market_loc],[mercedes],"[-0.08559719, 0.06645149, 0.031096734, 0.02838...",103746,"By #DATE, #COMPANY aims to have reduced the en...","By 2030, BMW aims to have reduced the entire c...",[],[bmw],"[0.01392833, 0.1668224, 0.042746324, 0.0332090...",132008.0,Automotive manufacturer announced today new ag...,By aims to have reduced the entire carbon foot...,,0.954971,
