In [1]:
import importlib
import classification_training_utils
importlib.reload(classification_training_utils)
import utils
importlib.reload(utils)

import sys
import dimensionality_reduction
import json 
import numpy as np
from numpy import dot
import os
from numpy.linalg import norm
import pandas as pd
import pickle
import plotly.offline as pyo
import plotly.express as px
import random
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
tqdm.pandas()
from ast import literal_eval

from utils import aggregate_embeddings, cosine_similarity, print_similarity_samples, load_model, country_code_map, split_into_list, replace_nan_with, collect_column_values, create_replace_no_tags_embeddings
from classification_training_utils import get_big_consulting_df, get_news_df, get_top_values, get_relevant_classifications

In [3]:
# Pipeline configuration shared by the cells below.
params = {}
params['USE_REPLACE_DATA'] = True
params['USE_ORIGINAL_DATA'] = False
params['ALLOW_PAIRS_OF_SAME_COMPANIES'] = True
params['PAIR_RANDOM_ROWS'] = False
params['READ_SAVED_DATASET'] = False
params['CREATE_NEW_SPLIT'] = False
params["DATASETS"] = ["ai", "car"] # ["ai", "car", "consulting", "consulting2"]
params["SIMILARITY"] = ["company"]
# Directory settings end with "/" because file names are appended to them
# by plain string concatenation (e.g. f'{params["OUTPUT_DIR"]}train.tsv').
params["OUTPUT_DIR"] = "../similarity-training-data/consulting/"
params["COMPANY_EMBEDDINGS"] = 'company_embedding_dicts_sbert.pickle'


In [4]:
def merge_rows(df, params):
    """Pair up the rows of ``df`` side by side into one wide frame.

    Rows at positions 0, 2, 4, ... form the left half (column names get the
    suffix ``1``) and rows at positions 1, 3, 5, ... form the right half
    (suffix ``2``).

    * ``params['PAIR_RANDOM_ROWS']`` falsy: left row *i* is joined with right
      row *i* (plain column-wise concatenation).
    * otherwise: every left row is joined with a right row drawn with
      ``random.choice`` after seeding with 42; each right row is used at most
      once. When ``params['ALLOW_PAIRS_OF_SAME_COMPANIES']`` is falsy the draw
      is repeated (seed decremented each time, at most 50 redraws) while
      ``company2`` equals ``company1``.

    Returns the merged DataFrame.
    """
    left = df.iloc[::2].reset_index(drop=True)    # positions 0, 2, 4, ...
    right = df.iloc[1::2].reset_index(drop=True)  # positions 1, 3, 5, ...
    left.columns = [f"{name}1" for name in left.columns]
    right.columns = [f"{name}2" for name in right.columns]

    if not params['PAIR_RANDOM_ROWS']:
        return pd.concat([left, right], axis=1).reset_index(drop=True)

    paired = pd.DataFrame(columns=left.columns.tolist() + right.columns.tolist())

    def draw_partner(seed):
        # Re-seeding before every draw makes the choice depend only on the
        # seed and on the rows still available in ``right``.
        random.seed(seed)
        picked = random.choice(right.index)
        return picked, right.loc[picked]

    for position in tqdm(range(0, len(right))):
        anchor = left.loc[position]
        seed = 42
        partner_index, partner = draw_partner(seed)

        if not params['ALLOW_PAIRS_OF_SAME_COMPANIES']:
            # Up to 50 redraws; after that the last draw is kept even if the
            # companies still match.
            for _ in range(50):
                if not (partner['company2'] == anchor['company1']):
                    break
                seed -= 1
                partner_index, partner = draw_partner(seed)

        paired.loc[len(paired.index)] = pd.concat([anchor, partner], axis=0)
        # A right-hand row can only be paired once.
        right = right.drop(partner_index)

    return paired

In [5]:
# Maps a keyword acronym to its spelled-out form.
keyword_acronym_map = {'ai': 'artificial intelligence', 'llm':'large language models', 'nlp': 'natural language processing'}

def preprocess_df(df, params):
    """Clean a single-snippet frame and turn it into a frame of snippet pairs.

    Steps: drop rows with a duplicated ``replace`` value, split the
    ``keywords`` and ``classification`` columns into lists, expand keyword
    acronyms via ``keyword_acronym_map``, store the index in ``id``, shuffle
    with a fixed seed and finally pair rows up with ``merge_rows``.

    Args:
        df: frame with at least ``replace``, ``keywords`` and
            ``classification`` columns.
        params: pipeline settings, forwarded to ``merge_rows``.

    Returns:
        The paired DataFrame produced by ``merge_rows``.
    """
    df = df.drop_duplicates(subset=['replace'])

    def replace_keywords_with_acronyms(row, keyword_acronym_map):
        # Despite the name this expands acronyms to their long form.
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result no longer depends on string hash randomisation (a set did).
        expanded = (keyword_acronym_map.get(kw, kw) for kw in row['keywords'])
        row['keywords'] = list(dict.fromkeys(expanded))
        return row

#     df = df.drop(columns=["index", "tooltip", "score", "id", "company", "relationEntity", "relationEntityType", "country", "similarity", "Unnamed: 4", "Unnamed: 5"])
    cols_with_lists = ['keywords', 'classification']
    df = split_into_list(df, cols_with_lists)
    df = df.apply(lambda row: replace_keywords_with_acronyms(row, keyword_acronym_map), axis=1)
    df['id'] = df.index
    # Fixed seed: the shuffle (and therefore the pairing) is reproducible.
    df = df.sample(frac=1, random_state=42)
    df = merge_rows(df, params)
    return df

def merge_split_and_save_df(df, params, dataset_dir, val_test_size=100):
    """Preprocess ``df`` into snippet pairs and cut it into train/val/test.

    The first ``val_test_size`` pairs become the test split, the next
    ``val_test_size`` the validation split and the remainder the training
    split. Training pairs without a ``replace2`` value are dropped.

    Args:
        df: raw single-snippet frame, passed to ``preprocess_df``.
        params: pipeline settings, passed to ``preprocess_df``.
        dataset_dir: currently unused; saving is disabled (see below).
        val_test_size: number of pairs in each of the val and test splits.

    Returns:
        ``(train_df, val_df, test_df)``.
    """
    # Preprocess and merge rows
    df = preprocess_df(df, params)
    # Split
    test_df = df[:val_test_size]
    val_df = df[val_test_size:val_test_size*2]
    # dropna returns a new frame; the previous inplace=True variant mutated a
    # slice of ``df`` and triggered SettingWithCopyWarning.
    train_df = df[val_test_size*2:].dropna(subset=['replace2'])
#     # Save - too big to save
#     os.makedirs(dataset_dir)
#     train_df.to_csv(f'{dataset_dir}train.tsv', sep='\t')
#     val_df.to_csv(f'{dataset_dir}val.tsv', sep='\t')
#     test_df.to_csv(f'{dataset_dir}test.tsv', sep='\t')
    return train_df, val_df, test_df

def read_split_df(dataset_dir):
    """Load the train/val/test TSV files stored under ``dataset_dir``.

    ``dataset_dir`` is prepended verbatim to the file names, so it has to end
    with a path separator. The keyword and classification columns of both
    pair members are converted with ``split_into_list``; a ``snippet2.1``
    column in the train file is discarded if present.

    Returns:
        ``(train_df, val_df, test_df)``.
    """
    list_columns = ['keywords1', 'classification1', 'keywords2', 'classification2']

    def load(path):
        return pd.read_csv(path, sep='\t', index_col=0)

    test_df = split_into_list(load(f'{dataset_dir}test.tsv'), list_columns)
    val_df = split_into_list(load(f'{dataset_dir}val.tsv'), list_columns)

    train_df = load(f'{dataset_dir}train.tsv')
    has_stray_column = 'snippet2.1' in train_df.columns
    if has_stray_column:
        train_df = train_df.drop(columns=['snippet2.1'])
    train_df = split_into_list(train_df, list_columns)

    return train_df, val_df, test_df

def get_split_df(params, df=None, dataset_dir='../dataset/'):
    """Return ``(train_df, val_df, test_df)`` for one dataset.

    If ``dataset_dir`` exists the splits are read from it; otherwise they are
    built from ``df`` with ``merge_split_and_save_df`` (100 pairs each for
    validation and test).
    """
    if os.path.exists(dataset_dir):
        splits = read_split_df(dataset_dir)
    else:
        splits = merge_split_and_save_df(df, params, dataset_dir, val_test_size=100)
    train_df, val_df, test_df = splits
    return train_df, val_df, test_df

def save_dataset(train_df, val_df, test_df, params):
    """Write the three splits as ``train.tsv``, ``val.tsv`` and ``test.tsv``.

    Files are written inside ``params["OUTPUT_DIR"]``, which is created if
    needed. The path is built with ``os.path.join``, so the setting works
    with or without a trailing separator.

    Args:
        train_df, val_df, test_df: frames to save; they are not modified.
        params: pipeline settings; only ``OUTPUT_DIR`` is read.
    """
    output_dir = params["OUTPUT_DIR"]
    os.makedirs(output_dir, exist_ok=True)

    for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
        # drop embeddings because they take up a lot of space; errors='ignore'
        # lets each frame be handled on its own, whether or not it has them.
        slim_df = split_df.drop(columns=['embedding1', 'embedding2'], errors='ignore')
        slim_df.to_csv(os.path.join(output_dir, f'{split_name}.tsv'), sep='\t')

def read_dataset():
    """Load previously saved train/val/test pair files, chosen via ``params``.

    Reads the module-level ``params``:

    * ``'company'`` in ``SIMILARITY`` and both ``"ai"`` and ``"car"`` in
      ``DATASETS``: read the ``ai_car/sbert-company-0.0`` splits. If they have
      no ``company_similarity`` column yet, the ``company``,
      ``relationEntity`` and ``relationEntityType`` of each snippet are looked
      up in the AI and car news tables and added for both pair members
      (empty string when the snippet is unknown).
    * otherwise: read the ``replace`` splits (ai + car) or the ``consulting``
      splits.

    Returns:
        ``(train_df, val_df, test_df)``.
    """
    def read_splits(read_dir):
        test = pd.read_csv(f'{read_dir}test.tsv', sep='\t', index_col=0)
        val = pd.read_csv(f'{read_dir}val.tsv', sep='\t', index_col=0)
        train = pd.read_csv(f'{read_dir}train.tsv', sep='\t', index_col=0)
        return train, val, test

    uses_ai_and_car = "ai" in params["DATASETS"] and "car" in params["DATASETS"]

    if 'company' in params["SIMILARITY"] and uses_ai_and_car:
        train_df, val_df, test_df = read_splits('../similarity-training-data/ai_car/sbert-company-0.0/')
        if "company_similarity" not in train_df.columns:
            car_dir = '../glanos-data/datasets/car_news_w_companies.tsv'
            ai_dir = '../glanos-data/datasets/ai_news_w_companies.tsv'
            car_df_w_comp = pd.read_csv(car_dir, sep='\t')
            # Fixed: this used to read car_dir a second time, so AI snippets
            # never received any company information.
            ai_df_w_comp = pd.read_csv(ai_dir, sep='\t')
            df_w_comp = (
                pd.concat([car_df_w_comp, ai_df_w_comp], axis=0)
                .reset_index(drop=True)
                .drop(columns=['replace', 'classification', 'keywords'])
                .drop_duplicates(subset='snippet', keep='first')
            )

            # One snippet -> value lookup per column to copy over.
            by_snippet = df_w_comp.set_index('snippet')
            lookups = {
                column: by_snippet[column].to_dict()
                for column in ('company', 'relationEntity', 'relationEntityType')
            }

            def update_df_with_company(df):
                # Fill <column>1 / <column>2 from the lookup of snippet1 /
                # snippet2; unknown snippets get ''.
                for suffix in ('1', '2'):
                    snippets = df[f'snippet{suffix}']
                    for column, lookup in lookups.items():
                        df[f'{column}{suffix}'] = [lookup.get(snippet, '') for snippet in snippets]
                return df

            train_df = update_df_with_company(train_df)
            print('train_df', len(train_df))
            val_df = update_df_with_company(val_df)
            print('val_df', len(val_df))
            test_df = update_df_with_company(test_df)
            print('test_df', len(test_df))
            print('Updated with companies')
    else:
        if uses_ai_and_car:
            read_dir = '../similarity-training-data/replace/'
        else:
            read_dir = '../similarity-training-data/consulting/'
        train_df, val_df, test_df = read_splits(read_dir)

    return train_df, val_df, test_df

In [10]:
def cosine_similarity(a, b):
    """Cosine similarity of two vectors, with a 0.0 fallback for bad input.

    Both arguments are converted to float32 arrays first. The fallback 0.0 is
    returned (after printing the exception type) when the inputs cannot be
    converted or combined, e.g. non-numeric content, mismatched lengths, or a
    missing embedding such as ``None``/NaN. It is also returned when either
    vector has zero norm, where the cosine is undefined.

    Note: this definition shadows the ``cosine_similarity`` imported from
    ``utils`` at the top of the notebook.
    """
    try:
        a = np.array(list(a), dtype=np.float32)
        b = np.array(list(b), dtype=np.float32)
        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            # Avoids a 0/0 division that would yield NaN plus a RuntimeWarning.
            return 0.0
        return np.dot(a, b) / denominator
    except (ValueError, TypeError) as err:
        # TypeError covers non-iterable inputs (None, NaN float); the message
        # is still 'ValueError' for value errors, as before.
        print(type(err).__name__)
        return 0.0

def convert_ndarray(x):
    """Return 0.0 for any numpy array; return every other value unchanged."""
    return 0.0 if isinstance(x, np.ndarray) else x

    
def find_company_name(words, company_list):
    """Return the longest space-joined prefix of ``words`` found in ``company_list``.

    Prefixes are tried from the full word list down to the first word alone.
    Returns ``None`` when no prefix matches (or ``words`` is empty).
    """
    prefixes = (' '.join(words[:end]) for end in range(len(words), 0, -1))
    return next((prefix for prefix in prefixes if prefix in company_list), None)


def get_company_names(company, relation_entity, relation_entity_type, company_list):
    """Collect the known company names mentioned by one snippet.

    Candidates are the lower-cased ``company`` (if not NaN) and, when
    ``relation_entity_type`` is ``'company'``, the lower-cased
    ``relation_entity`` (if not NaN). A candidate found verbatim in
    ``company_list`` is kept as is; otherwise its longest word prefix found
    in ``company_list`` (via ``find_company_name``) is kept, if any.

    Returns:
        List of matched names, in candidate order (may be empty).
    """
    candidates = [] if pd.isna(company) else [company.lower()]
    if relation_entity_type == 'company' and not pd.isna(relation_entity):
        candidates.append(relation_entity.lower())

    matched = []
    for candidate in candidates:
        if candidate in company_list:
            matched.append(candidate)
            continue
        # No exact hit: fall back to the longest matching word prefix.
        prefix_match = find_company_name(candidate.split(' '), company_list)
        if prefix_match is not None:
            matched.append(prefix_match)
    return matched


def company_similarity(row, company_embedding_dict):
    """Add company-based similarity information to one pair row.

    The companies of both pair members are resolved with
    ``get_company_names``. If both sides have at least one known company, the
    cosine similarity of their aggregated embeddings is used; otherwise the
    row's existing ``similarity`` value is used as a fallback.

    Args:
        row: pair row with ``company``, ``relationEntity`` and
            ``relationEntityType`` columns (suffixes 1 and 2) and
            ``similarity``.
        company_embedding_dict: mapping company name -> embedding.

    Returns:
        The row with ``company_similarity``, ``companies1`` and
        ``companies2`` set.
    """
    # A keys view supports `in` with dict-speed lookups; building a list here
    # meant an O(n) copy plus linear scans for every single row.
    known_companies = company_embedding_dict.keys()
    companies1 = get_company_names(row['company1'], row['relationEntity1'], row['relationEntityType1'], known_companies)
    companies2 = get_company_names(row['company2'], row['relationEntity2'], row['relationEntityType2'], known_companies)

    if companies1 and companies2:
        # Embeddings are only aggregated when they are actually compared.
        embeddings1 = aggregate_embeddings(companies1, company_embedding_dict)
        embeddings2 = aggregate_embeddings(companies2, company_embedding_dict)
        similarity = cosine_similarity(embeddings1, embeddings2)
    else:
        similarity = row['similarity']
    row['company_similarity'], row['companies1'], row['companies2'] = convert_ndarray(similarity), companies1, companies2

    return row

def keyword_with_company_similarity(row, keyword_dict, company_embedding_dict):
    '''
    Compute the keyword similarity of one pair row.

    If the keyword is a company name, use company embeddings instead of SBERT
    (i.e. ``company_embedding_dict`` takes precedence over ``keyword_dict``).
    Keywords found in neither mapping are ignored. When one side ends up with
    no usable keyword, the row's existing ``similarity`` is used instead.

    Returns the row with ``keyword_similarity`` set.
    '''
    def pick_embeddings(keywords):
        # Direct dict membership replaces a per-row list of all company names
        # that was scanned linearly for every keyword.
        chosen = {}
        for kw in keywords:
            if kw in company_embedding_dict:
                chosen[kw] = company_embedding_dict[kw]
            elif kw in keyword_dict:
                chosen[kw] = keyword_dict[kw]
        return chosen

    temp_dict_1 = pick_embeddings(row['keywords1'])
    temp_dict_2 = pick_embeddings(row['keywords2'])

    if temp_dict_1 and temp_dict_2:
        # Embeddings are only aggregated when they are actually compared.
        embeddings1 = aggregate_embeddings(temp_dict_1.keys(), temp_dict_1)
        embeddings2 = aggregate_embeddings(temp_dict_2.keys(), temp_dict_2)
        similarity = cosine_similarity(embeddings1, embeddings2)
    else:
        similarity = row['similarity']
    row['keyword_similarity'] = convert_ndarray(similarity)

    return row

def company_similarity_df(df, company_embedding_dict, verbose=False):
    """Apply ``company_similarity`` to every row of ``df`` with a progress bar.

    (A second, byte-identical definition of ``convert_ndarray`` used to sit
    directly above this function; it was removed because the helper is
    already defined earlier in this cell.)

    Args:
        df: frame of snippet pairs.
        company_embedding_dict: mapping company name -> embedding.
        verbose: when true, print 5 samples via ``print_similarity_samples``.

    Returns:
        The frame produced by the row-wise apply.
    """
    df = df.progress_apply(lambda row: company_similarity(row, company_embedding_dict), axis=1)
    if verbose:
        print_similarity_samples(df, 'company_similarity', sample_size=5)
    return df

def keyword_similarity_with_companies_df(df, keyword_dict, company_dict, verbose=False):
    """Apply ``keyword_with_company_similarity`` to every row of ``df``.

    Shows a progress bar; with ``verbose`` set, 5 samples are printed via
    ``print_similarity_samples``. Returns the frame produced by the apply.
    """
    def score_row(row):
        return keyword_with_company_similarity(row, keyword_dict, company_dict)

    scored = df.progress_apply(score_row, axis=1)
    if verbose:
        print_similarity_samples(scored, 'keyword_similarity', sample_size=5)
    return scored


def column_similarity(row, column_name, embedding_dict):
    '''
    Compute the similarity of ``<column_name>1`` and ``<column_name>2``.

    If keywords or classification are empty (an empty list or a missing/NaN
    value on either side), set the similarity same as similarity of the whole
    sentence. Otherwise the cosine similarity of the aggregated embeddings is
    stored. Values are lower-cased for the lookup unless the column is
    ``country``.

    Returns the row with ``<column_name>_similarity`` set.
    '''
    def as_value_list(value):
        # Each side is normalised on its own, so a list on one side and a
        # scalar/NaN on the other no longer raises.
        if isinstance(value, list):
            return value
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return []
        return [value]

    column1 = as_value_list(row[f'{column_name}1'])
    column2 = as_value_list(row[f'{column_name}2'])

    if not column1 or not column2:
        row[f'{column_name}_similarity'] = row['similarity']
        return row

    to_lower = column_name != 'country'
    embeddings1 = aggregate_embeddings(column1, embedding_dict, to_lower=to_lower)
    embeddings2 = aggregate_embeddings(column2, embedding_dict, to_lower=to_lower)
    row[f'{column_name}_similarity'] = convert_ndarray(cosine_similarity(embeddings1, embeddings2))

    return row

def preprocess_column(df, column_name, words_to_remove=[]):
    """Parse ``<column_name>1`` / ``<column_name>2`` into lists and filter them.

    Values that are not already lists are parsed with ``literal_eval``; every
    token listed in ``words_to_remove`` is then dropped from each list.

    Args:
        df: frame of snippet pairs; modified in place and also returned.
        column_name: column stem, e.g. ``'keywords'``.
        words_to_remove: tokens to strip (``None`` or empty: remove nothing).
            The default list is never mutated.

    Returns:
        ``df`` with both columns replaced.
    """
    # Fixed: the body used to read a global named ``tokens_to_remove`` and
    # silently ignored this parameter.
    unwanted = list(words_to_remove) if words_to_remove else []

    def clean(value):
        tokens = value if isinstance(value, list) else literal_eval(value)
        if unwanted:
            tokens = [token for token in tokens if token not in unwanted]
        return tokens

    for suffix in ('1', '2'):
        df[f'{column_name}{suffix}'] = df[f'{column_name}{suffix}'].apply(clean)
    return df

def preprocess_classification(df):
    """Parse the classification columns and blank out uninformative rows.

    Each value of ``classification1`` / ``classification2`` is parsed with
    ``literal_eval`` unless it is already a list. A row whose labels include
    ``'entity'`` or ``'other'`` gets an empty list instead.

    Args:
        df: frame of snippet pairs; modified in place and also returned.
    """
    def clean(value):
        # Parse first: testing the raw string matched substrings, so labels
        # such as 'another' or 'identity' wrongly emptied the row.
        labels = value if isinstance(value, list) else literal_eval(value)
        return [] if 'entity' in labels or 'other' in labels else labels

    for column in ('classification1', 'classification2'):
        df[column] = df[column].apply(clean)
    return df

In [11]:
# from collections import Counter

# all_keywords = ai_news_df['keywords'].str.split('|').explode().tolist()
# word_count = Counter(all_keywords)

# # Print the word frequencies
# for word, count in word_count.items():
#     print(f'{word}: {count}')


def run_pipeline():
    """Build (or load) the pair dataset, attach snippet similarity and save it.

    Driven by the module-level ``params``:

    * ``CREATE_NEW_SPLIT`` true: build train/val/test splits for every
      dataset selected in ``DATASETS``, concatenate them, record the column
      names in ``params["COLUMNS"]`` and add the ``replace_no_tags``
      embeddings.
    * otherwise: load existing splits with ``read_dataset``.

    In both cases ``embedding1``/``embedding2`` are then looked up from the
    snippet embedding pickles, ``similarity`` is recomputed from them and the
    splits are written with ``save_dataset``.

    The pickle files are loaded with ``pickle.load``; only use files from a
    trusted source, since unpickling can execute arbitrary code.
    """
    def add_similarity(frame):
        # Cosine similarity of the two embeddings of each pair (in place).
        frame['similarity'] = frame.progress_apply(lambda row: cosine_similarity(row['embedding1'], row['embedding2']), axis=1)

    if params['CREATE_NEW_SPLIT']:
        train_datasets, val_datasets, test_datasets = [], [], []
        # (name in params['DATASETS'], split directory, loader of the raw frame)
        dataset_sources = [
            ('consulting', '../dataset/big_consulting/', lambda: get_big_consulting_df(params)),
            ('ai', '../dataset/ai_news/', lambda: get_news_df(params, 'ai_news')),
            ('car', '../dataset/car_news/', lambda: get_news_df(params, 'car_news')),
        ]
        for dataset_name, dataset_dir, load_df in dataset_sources:
            if dataset_name not in params['DATASETS']:
                continue
            split_train, split_val, split_test = get_split_df(params, df=load_df(), dataset_dir=dataset_dir)
            train_datasets.append(split_train)
            val_datasets.append(split_val)
            test_datasets.append(split_test)

        train_df = pd.concat(train_datasets, axis=0).reset_index(drop=True)
        val_df = pd.concat(val_datasets, axis=0).reset_index(drop=True)
        test_df = pd.concat(test_datasets, axis=0).reset_index(drop=True)

        for frame in (train_df, val_df, test_df):
            add_similarity(frame)

        params["COLUMNS"] = train_df.columns

        prefix = "../glanos-data/embeddings/"

        with open(f'{prefix}ai_news_replace_no_tags.pickle', 'rb') as f:
            replace_no_tags_embeddings = pickle.load(f)
        with open(f'{prefix}car_news_replace_no_tags.pickle', 'rb') as f:
            replace_no_tags_embeddings.update(pickle.load(f))

        test_df = create_replace_no_tags_embeddings(test_df, replace_no_tags_embeddings)
        val_df = create_replace_no_tags_embeddings(val_df, replace_no_tags_embeddings)
        train_df = create_replace_no_tags_embeddings(train_df, replace_no_tags_embeddings)

        # Fixed: val_df and train_df similarity used to be computed from
        # test_df (copy-paste error); each split now uses its own rows.
        for frame in (test_df, val_df, train_df):
            add_similarity(frame)

    else:
        train_df, val_df, test_df = read_dataset()

    prefix = '../glanos-data/embeddings/'
    with open(f'{prefix}ai_news_snippet.pickle', 'rb') as f:
        ai_news_dict = pickle.load(f)
    with open(f'{prefix}car_news_snippet.pickle', 'rb') as f:
        car_news_dict = pickle.load(f)
    embeddings_dict = ai_news_dict
    embeddings_dict.update(car_news_dict)

    # Snippets without a stored embedding get None here.
    for frame in (train_df, val_df, test_df):
        frame["embedding1"] = frame["snippet1"].apply(lambda x: embeddings_dict.get(x))
        frame["embedding2"] = frame["snippet2"].apply(lambda x: embeddings_dict.get(x))

    for frame in (train_df, val_df, test_df):
        add_similarity(frame)

#     if 'keywords' in params["SIMILARITY"]:
#         with open(f'{prefix}{params["COMPANY_EMBEDDINGS"]}.pickle', 'rb') as f:
#             company_embedding_dict = pickle.load(f)
#         print('company_embedding_dict', len(company_embedding_dict))
#         with open(f'{prefix}kw_embedding_dict.pickle', 'rb') as f:
#             kw_embeddings = pickle.load(f)
#         test_df = preprocess_column(test_df, 'keywords', words_to_remove=['artificial intelligence'])
#         val_df = preprocess_column(val_df, 'keywords', words_to_remove=['artificial intelligence'])
#         train_df = preprocess_column(train_df, 'keywords', words_to_remove=['artificial intelligence'])
#         test_df = keyword_similarity_with_companies_df(test_df, kw_embeddings, company_embedding_dict)
#         val_df = keyword_similarity_with_companies_df(val_df, kw_embeddings, company_embedding_dict)
#         train_df = keyword_similarity_with_companies_df(train_df, kw_embeddings, company_embedding_dict)

#     if 'classification' in params["SIMILARITY"]:
#         if "ai" in params["DATASETS"] and "car" in params["DATASETS"]:
#             with open(f'{prefix}car_news_classification.pickle', 'rb') as f:
#                 classsification_embeddings = pickle.load(f)
#                 with open(f'{prefix}ai_news_classification.pickle', 'rb') as f:
#                     classsification_embeddings.update(pickle.load(f))
#         else:
#             with open(f'{prefix}class_embedding_dict.pickle', 'rb') as f:
#                 classsification_embeddings = pickle.load(f)

#         test_df = preprocess_column(test_df, 'classification', words_to_remove=['entity', 'other'])
#         val_df = preprocess_column(val_df, 'classification', words_to_remove=['entity', 'other'])
#         train_df = preprocess_column(train_df, 'classification', words_to_remove=['entity', 'other'])
#         test_df = test_df.progress_apply(lambda row: column_similarity(row, 'classification', classsification_embeddings), axis=1)
#         val_df = val_df.progress_apply(lambda row: column_similarity(row, 'classification', classsification_embeddings), axis=1)
#         train_df = train_df.progress_apply(lambda row: column_similarity(row, 'classification', classsification_embeddings), axis=1)

#     if 'company' in params["SIMILARITY"] and not "company_similarity" in train_df.columns:
#         with open(f'{prefix}{params["COMPANY_EMBEDDINGS"]}.pickle', 'rb') as f:
#             company_embedding_dict = pickle.load(f)

#         test_df = company_similarity_df(test_df, company_embedding_dict)
#         val_df = company_similarity_df(val_df, company_embedding_dict)
#         train_df = company_similarity_df(train_df, company_embedding_dict)

#     if 'country' in params["SIMILARITY"]:
#         country_embeddings = country_code_map()
#         test_df = test_df.progress_apply(lambda row: column_similarity(row, 'country', country_embeddings), axis=1)
#         val_df = val_df.progress_apply(lambda row: column_similarity(row, 'country', country_embeddings), axis=1)
#         train_df = train_df.progress_apply(lambda row: column_similarity(row, 'country', country_embeddings), axis=1)

    save_dataset(train_df, val_df, test_df, params)
    


# Experiment 1

In [12]:
# params["DATASETS"] = ["consulting"]
# params["SIMILARITY"] = ["company"]
# for definition_weight in np.arange(0.0, 1.1, 0.1):
#     suffix = str(definition_weight)[:3]
#     params["OUTPUT_DIR"] = f"../similarity-training-data/consulting/glanos-company-{suffix}/"
#     params["COMPANY_EMBEDDINGS"] = f"company_embedding_dicts_glanos_{suffix}"
#     run_pipeline()


# Experiment 2

In [17]:
params["SIMILARITY"] = ["company", "keywords", "classification"]
tokens_to_remove = None
params["INPUT_DIR"] = "../similarity-training-data/ai_car/replace_no_tags/sbert-company-0.0/"
params["OUTPUT_DIR"] = "../similarity-training-data/ai_car/snippet/sbert-company-0.0/"
params["COMPANY_EMBEDDINGS"] = "company_embedding_dicts_sbert_0.0"
# run_pipeline()

# Update similarity: re-read the saved splits, recompute `similarity` from the
# snippet embeddings and write the result to OUTPUT_DIR.
prefix = '../glanos-data/embeddings/'
# pickle.load can execute arbitrary code; only load trusted files.
with open(f'{prefix}ai_news_snippet.pickle', 'rb') as f:
    ai_news_dict = pickle.load(f)
with open(f'{prefix}car_news_snippet.pickle', 'rb') as f:
    car_news_dict = pickle.load(f)
embeddings_dict = ai_news_dict
embeddings_dict.update(car_news_dict)

# to_csv does not create missing directories.
os.makedirs(params["OUTPUT_DIR"], exist_ok=True)

refreshed_splits = {}
for split_name in ("test", "val", "train"):
    split_df = pd.read_csv(f'{params["INPUT_DIR"]}{split_name}.tsv', sep='\t', index_col=0)
    split_df["embedding1"] = split_df["snippet1"].apply(lambda x: embeddings_dict.get(x))
    split_df["embedding2"] = split_df["snippet2"].apply(lambda x: embeddings_dict.get(x))
    split_df['similarity'] = split_df.progress_apply(lambda row: cosine_similarity(row['embedding1'], row['embedding2']), axis=1)
    # Embeddings are only needed for the similarity and are not saved.
    split_df = split_df.drop(columns=['embedding1', 'embedding2'])
    split_df.to_csv(f'{params["OUTPUT_DIR"]}{split_name}.tsv', sep='\t')
    refreshed_splits[split_name] = split_df

# Later cells use these three names.
test_df = refreshed_splits["test"]
val_df = refreshed_splits["val"]
train_df = refreshed_splits["train"]


Columns (14,15,16,17,18,19) have mixed types. Specify dtype option on import or set low_memory=False.

100%|█████████████████████████████████████████████| 200/200 [00:00<00:00, 20883.81it/s]
100%|█████████████████████████████████████████████| 200/200 [00:00<00:00, 21092.27it/s]
100%|███████████████████████████████████████| 219815/219815 [00:08<00:00, 25787.20it/s]


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

nltk.download('averaged_perceptron_tagger')

def split_dataframe(df, value):
    """Split ``df`` column-wise by whether the column name contains ``value``.

    ``value`` is matched with ``str.contains`` (a regular-expression search
    by default).

    Returns:
        ``(matching_columns_df, remaining_columns_df)``.
    """
    name_matches = df.columns.str.contains(value)
    return df.loc[:, name_matches], df.loc[:, ~name_matches]

def remove_trailing_number(column):
    """Strip one trailing ``'1'`` or ``'2'`` from a column name.

    Names that do not end in ``1``/``2`` (including the empty string and
    non-string labels) are returned unchanged.
    """
    # endswith is safe on '', where indexing column[-1] raised IndexError.
    if isinstance(column, str) and column.endswith(('1', '2')):
        return column[:-1]
    return column

# Un-pair the training frame: take the columns whose name contains '1' and the
# remaining columns as two frames, strip the trailing 1/2 so the names line
# up, and stack both frames on top of each other.
df_1, df_2 = split_dataframe(train_df, '1')
df_1.columns = [remove_trailing_number(name) for name in df_1.columns]
df_2.columns = [remove_trailing_number(name) for name in df_2.columns]

# reset_index() keeps the old row labels in a new 'index' column.
combined_df = pd.concat([df_1, df_2], axis=0).reset_index()

def has_verb(sentence, to_print=False):
    """Return True if the POS tagger marks any token of ``sentence`` as a verb.

    The sentence is tokenised with ``word_tokenize`` and tagged with
    ``pos_tag``; a tag starting with ``'VB'`` counts as a verb. The scan
    stops at the first verb.

    Args:
        sentence: text to inspect.
        to_print: when true, print ``token : tag`` for every token examined.
    """
    # pos_tag already yields (token, tag) pairs; zipping them with the tokens
    # again made the debug output print the whole pair instead of the tag.
    for token, tag in pos_tag(word_tokenize(sentence)):
        if to_print:
            print(token, ':', tag)
        if tag.startswith('VB'):
            return True

    return False

def has_no_verb(sentence):
    """Return True when ``has_verb`` finds no verb in ``sentence``."""
    verb_found = has_verb(sentence)
    return not verb_found

def has_no_verb_2(sentence):
    """Like ``has_no_verb``, but has ``has_verb`` print every token it checks."""
    verb_found = has_verb(sentence, to_print=True)
    return not verb_found


# pd.set_option('display.max_colwidth', None)

# # short_snippets = combined_df[(combined_df['snippet'].apply(str).str.len() < 40)][['snippet']] #  | (combined_df['snippet'].apply(str.lower).apply(has_no_verb))
# # print(short_snippets)
# combined_df = combined_df[(combined_df['snippet'].apply(str).str.len() >= 40)] #  | (combined_df['snippet'].apply(str).apply(has_verb))
# len(combined_df)
# Last expression of the cell: Jupyter renders the stacked frame for a visual check.
combined_df

In [9]:
from tqdm import tqdm

if params['READ_SAVED_DATASET']:
    # Re-pair the stacked snippets into a fresh training frame.
    train_df = merge_rows(combined_df, params)

    def _parse_embedding(text):
        # Embeddings read back from TSV are strings such as "[0.1 0.2\n 0.3]";
        # strip brackets/newlines and parse the space-separated numbers.
        cleaned = (text.replace('\n','')
                       .replace('[','')
                       .replace(']','')
                       .replace('  ',' '))
        return np.fromstring(cleaned, sep=' ')

    for embedding_column in ('embedding2', 'embedding1'):
        train_df[embedding_column] = train_df[embedding_column].apply(_parse_embedding)

    train_df['similarity'] = train_df.progress_apply(lambda row: cosine_similarity(row['embedding1'], row['embedding2']), axis=1)
    # Fixed: the check used params['COLUMNS'], which is only set by
    # run_pipeline with CREATE_NEW_SPLIT and holds suffixed names
    # ('company1'), so 'company' was never found (or a KeyError was raised).
    if 'company1' in train_df.columns and 'company2' in train_df.columns:
        count = len(train_df[train_df['company1'] == train_df['company2']])
        print("Number of rows with the same value for company1 and company2:", count)
        


# merged_df['similarity'] = merged_df.progress_apply(lambda row: cosine_similarity(row['embedding1'], row['embedding2']), axis=1)


In [None]:
# '''
# Prepare big dataset with classification and keyword similarities 
# '''



In [11]:
def print_df(df):
    """Print ``snippet1`` and ``snippet2`` of every row, followed by a ``---`` separator."""
    for _, pair in df.iterrows():
        # One print call; sep="\n" puts each part on its own line.
        print(pair['snippet1'], pair['snippet2'], "\n---\n", sep="\n")
        
def filter_cols(df):
    """Return ``df`` restricted to the columns relevant for export.

    Columns are kept in this order, and only when they exist in ``df``:

    1. ``snippet1`` and ``snippet2``;
    2. every column whose name contains ``'similarity'``; when that name also
       contains ``'_'``, it is followed by ``<stem>1`` and ``<stem>2``, where
       ``<stem>`` is the text before the first underscore;
    3. ``label``.

    Each column is selected at most once, even if several rules name it
    (e.g. ``snippet_similarity`` would otherwise re-add ``snippet1`` and
    ``snippet2`` and produce duplicated columns in the result).

    Args:
        df: ``pandas.DataFrame`` to filter.

    Returns:
        A new DataFrame holding only the selected columns.
    """
    desired_columns = ['snippet1', 'snippet2']
    for col in df.columns:
        # Non-string labels (e.g. integer headers) cannot name a similarity column.
        if not isinstance(col, str) or 'similarity' not in col:
            continue
        desired_columns.append(col)
        if '_' in col:
            # NOTE(review): the stem has to match the source columns exactly,
            # so 'keyword_similarity' looks for 'keyword1'/'keyword2' and will
            # not pick up columns named 'keywords1'/'keywords2' — confirm intent.
            stem = col.split('_')[0]
            desired_columns.extend((stem + '1', stem + '2'))
    desired_columns.append('label')

    # dict.fromkeys drops repeated names while preserving first-seen order;
    # the membership test then discards names that df does not have.
    selected = [col for col in dict.fromkeys(desired_columns) if col in df.columns]
    return df.loc[:, selected]

In [14]:
# Write the column-filtered test/val/train splits as tab-separated files.
prefix = '../similarity-training-data/sbert-company-filtered/'
for split_name, split_df in (('test', test_df), ('val', val_df), ('train', train_df)):
    filter_cols(split_df).to_csv(f'{prefix}{split_name}.tsv', sep='\t')

In [17]:
# (-0.001, 0.1]     453
# (0.1, 0.2]       1140
# (0.2, 0.3]       1119
# (0.3, 0.4]        827
# (0.4, 0.5]        821
# (0.5, 0.6]        855
# (0.6, 0.7]        705
# (0.7, 0.8]        423
# (0.8, 0.9]        182
# (0.9, 1.0]        181 
# 1 - basically the same; the only differences are a few extra words before or after (length check)


# 5 The two sentences are completely equivalent, as they mean the same thing. (i.e. talk about the same company, people, event, values)
# 4 The two sentences are mostly equivalent, but some unimportant details differ. (i.e. same company, people, event but different wording or different values)
# 3 The two sentences are roughly equivalent, but some important information differs/missing. (e.g. same event but different company or same company and similar event)
# 2 The two sentences are not equivalent, but share some details. (e.g. same company but different event)
# 1 The two sentences are not equivalent, but are on the same topic.
# 0 The two sentences are completely dissimilar.

# TODO: slightly modify the scoring so that a pair is given 1 when one sentence is a substring of the other

In [65]:
# Bare expression: render the paired training frame as the cell output for inspection.
train_df

Unnamed: 0,replace1,snippet1,classification1,keywords1,embedding1,id1,replace2,snippet2,classification2,keywords2,embedding2,id2,replace_no_tags1,replace_no_tags2,similarity,keyword_similarity,classification_similarity
0,"#PERSON, #JOBTITLE of #COMPANY, the holding co...","Sundar Pichai, CEO of Alphabet, the holding co...",[],[artificial intelligence],"[-0.06915243, 0.01455253, 0.024989886, -0.0257...",216067,The WisdomTree #COMPANY Value Fund ETF uses a ...,The WisdomTree International Al Enhanced Value...,[],[artificial intelligence],"[0.022610608, -0.12946172, -0.10784928, -0.035...",182225.0,of the holding company of recently stated in a...,The WisdomTree Value Fund ETF uses a proprieta...,0.564688,1.000000,0.564688
1,Media and Internet holding company #COMPANY is...,IAC + 1: Media and Internet holding company IA...,[],[artificial intelligence],"[-0.002212706, -0.042485017, -0.08343706, -0.0...",7495,"#LOC’s AA-rated IT services group #COMPANY,",And Japan’s AA-rated IT services group NEC Cor...,[],[artificial intelligence],"[-0.016172213, -0.011744792, -0.04392088, 0.00...",149398.0,Media and Internet holding company is implemen...,AA-rated IT services group,0.167768,1.000000,0.167768
2,"over the next few years, since the new technol...",Gen-1 is driving the next supercycle of cloud ...,[employ],[artificial intelligence],"[-0.067335255, -0.00187942, 0.0024513635, -0.0...",49247,"#COMPANY , a fast-growing #LOC-based startup w...","Hugging Face , a fast-growing New York-based s...",[],[artificial intelligence],"[-0.0582173, -0.05877296, 0.0024392686, 0.0074...",90130.0,"over the next few years, since the new technol...",", a fast-growing startup which enjoyed a valua...",0.252154,1.000000,0.252154
3,#COMPANY launches Skills-based Talent planning...,Eightfold 1 Launches Skills-Based Talent Plann...,[],[artificial intelligence],"[-0.056243468, -0.0053915293, 0.011274071, 0.0...",231148,As ChatGPT – developed by #COMPANY is not avai...,As 1 – developed by Microsoft-backed 1 – is no...,[],"[chatgpt, artificial intelligence, openai]","[-0.047819942, -0.0714631, 0.047958616, -0.007...",214641.0,"launches Skills-based Talent planning, empower...",As ChatGPT – developed by is not available for...,0.002902,0.729112,0.002902
4,The #COMPANY is designed to simplify video pro...,The 1 Video Generator is designed to simplify ...,[],[artificial intelligence],"[-0.049500536, 0.021719, 0.010959194, -0.05274...",61653,"chatbots like ChatGPT and #COMPANY Bard, may b...","the used in chatbots like 1 and 1, may be maki...",[],"[chatgpt, google bard]","[-0.021107346, -0.07309405, 0.0432339, -0.0501...",46199.0,The is designed to simplify video production a...,"chatbots like ChatGPT and Bard, may be making ...",0.122614,0.198170,0.122614
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219810,Because #PERSON ’ s contract with #COMPANY inc...,Because Graham Ford ’ s contract with FCS inc...,[],[ford],"[-0.08120356, 0.042070776, 0.094466455, 0.0682...",87539,#COMPANY and #COMPANY had agreed for a #ORG,2023 Toyota Motor Corporation and Suzuki Motor...,[agreement],[suzuki],"[-0.0480265, 0.023894105, 0.027636005, 0.00332...",168350.0,Because ’ s contract with included both conven...,and had agreed for a,,0.709216,
219811,#COMPANY Get Free Report on #DATE,General Motors (GM) - Get Free Report on April...,[],[general motors],"[-0.059067983, 0.03613742, -0.06019407, 0.0033...",137416,The #COMPANY’s commitment to achieving full cl...,The BMW Group’s commitment to achieving full c...,[],[bmw],"[0.017645555, 0.110141344, 0.0147629045, 0.006...",54909.0,Get Free Report on,The commitment to achieving full climate neutr...,,0.983244,
219812,A #COMPANY expert will now assume a larger #JO...,A Volvo Bus expert will now assume a larger di...,[],[volvo],"[-0.03856746, 0.023804769, 0.021807693, -0.019...",110326,Reports suggest #PERSON’s private plan landed ...,Reports suggest Musk’s private plan landed in ...,[],[tesla],"[0.060868353, 0.018765314, -0.038085416, 0.013...",119948.0,A expert will now assume a larger role with later,Reports suggest private plan landed in the Chi...,,0.953276,
219813,Automotive manufacturer #COMPANY announced tod...,Automotive manufacturer Mercedes-Benz announce...,[market_loc],[mercedes],"[-0.08559719, 0.06645149, 0.031096734, 0.02838...",103746,"By #DATE, #COMPANY aims to have reduced the en...","By 2030, BMW aims to have reduced the entire c...",[],[bmw],"[0.01392833, 0.1668224, 0.042746324, 0.0332090...",132008.0,Automotive manufacturer announced today new ag...,By aims to have reduced the entire carbon foot...,,0.954971,
