In [1]:
import pickle
import pandas as pd
import plotly.offline as pyo
import plotly.express as px
import dimensionality_reduction
import json 
import random

# Load the pickled dictionary
with open('glanos-data/big_consulting_export-sbert.pickle', 'rb') as f:
    embeddings = pickle.load(f)

json_file_path = "glanos-data/big_consulting_export.json"
tsv_path = "glanos-data/big_consulting_export.tsv"

with open(json_file_path, 'r') as j:
     contents = json.loads(j.read())
df_details = pd.read_csv(tsv_path, delimiter='\t')
df_details = df_details.drop(columns=['snippet', 'date'])

# Get the list of item objects
items = contents['items']
df = pd.DataFrame(items)
df["embedding"] = df["snippet"].map(embeddings)

df = pd.concat([df, df_details], axis=1)

In [2]:
# Combine rows into pairs
df1 = df.iloc[::2].reset_index(drop=True)  # odd-indexed rows
df2 = df.iloc[1::2].reset_index(drop=True)  # even-indexed rows

# rename the columns
df1.columns = [f"{col}1" for col in df1.columns]
df2.columns = [f"{col}2" for col in df2.columns]

# concatenate side by side
merged_df = pd.concat([df1, df2], axis=1)


In [3]:
from numpy import dot
from numpy.linalg import norm

# function to calculate cosine similarity
def cosine_similarity(a, b):
    return dot(a, b)/(norm(a)*norm(b))

# Function to check if 'snippet1' is a substring of 'snippet2' or vice versa
def check_substring(row):
    if row['snippet1'] in row['snippet2'] or row['snippet2'] in row['snippet1']:
        return 1.0
    else:
        return row['similarity']  # return the existing similarity if neither is a substring of the other

# apply the function to each row of the dataframe
merged_df = merged_df.dropna(subset=['snippet1', 'snippet2'])
merged_df['similarity'] = merged_df.apply(lambda row: cosine_similarity(row['embedding1'], row['embedding2']), axis=1)
merged_df['similarity'] = merged_df.apply(check_substring, axis=1)

merged_df

Unnamed: 0,tooltip1,score1,snippet1,id1,embedding1,company1,relationEntity1,relationEntityType1,classification1,country1,...,snippet2,id2,embedding2,company2,relationEntity2,relationEntityType2,classification2,country2,keywords2,similarity
0,(2023-04-21) Strategy|Company Info\n\nat KPMG ...,1.0,at KPMG where he focused on complex financial ...,ID0,"[0.06154659017920494, 0.08247778564691544, -0....",KPMG A/S,,,Strategy|Company Info,US,...,LatentView has been recognized as an industry ...,ID1,"[-0.017490660771727562, -0.036421071738004684,...",Gartner Inc.,,,Leadership,IN,leader|industry leader,0.209001
1,"(2023-04-21) \n\nIn his last role, Prashant wa...",0.8,"In his last role, Prashant was playing the rol...",ID2,"[0.07539796829223633, -0.025188380852341652, -...",Infosys Limited,Prashant Ramanujan,person,,JO,...,We're proud of the end result of this implemen...,ID3,"[-0.030860286206007004, 0.047984544187784195, ...",Fti Consulting Inc. Holding,Palmer Richard,person,,IE,director|segment|managing director|technology ...,0.171763
2,(2023-04-21) Market Share Growth\n\nWipro cons...,1.0,Wipro consolidates presence in foods with acqu...,ID4,"[0.08279228955507278, 0.027778802439570427, -0...",Wipro Limited,,,Market Share Growth,IN,...,Days after reports claimed that Tata Consultan...,ID5,"[0.001736283185891807, 0.08078806847333908, -0...",Tata Consultancy Services Limited,,,,IN,payout|variable payout|section,0.172178
3,(2023-04-21) Agreements\n\nmessenger RNA (mRNA...,1.0,messenger RNA (mRNA) therapeutics and vaccines...,ID6,"[-0.03147133067250252, 0.022890880703926086, 0...",IBM Corp.,,,Agreements,IN,...,"Under the agreement, IBM will provide access t...",ID7,"[-0.06426975876092911, 0.04572831094264984, 0....",IBM Corp.,,,Company Info,US,quantum computing system|agreement|access|syst...,0.537603
4,(2023-04-21) \n\nIBM's purpose is to be the ca...,0.6,IBM's purpose is to be the catalyst,ID8,"[0.01070199254900217, 0.09421855956315994, -0....",IBM Corp.,,,,IN,...,Moderna and IBM scientists will apply MoLForme...,ID9,"[-0.07136135548353195, -0.010682446882128716, ...",IBM Corp.,Moderna Holding B.V.,company,,US,molformer,0.329646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6747,(2023-03-29) Acquistions\n\nIBM announced the ...,1.0,"IBM announced the acquisition of StepZen, whic...",ID13494,"[-0.05444255471229553, 0.04415850713849068, -0...",IBM Corp.,Stepzen,company,Acquistions,AE,...,IBM has acquired 30 companies to bolster its h...,ID13495,"[0.07958270609378815, -0.05470773205161095, -0...",IBM Corp.,,,Product Growth,AE,company,0.435178
6748,(2023-03-29) \n\nS4 Capital was forced to dela...,0.8,S4 Capital was forced to delay its results twi...,ID13496,"[-0.037627097219228745, 0.015242413617670536, ...",PricewaterhouseCoopers International Limited,S4 Capital,company,,US,...,the way IBM enacted a split insurer transactio...,ID13497,"[0.05484054237604141, 0.09920531511306763, -0....",IBM Corp.,,,,US,insurer transaction|transaction|participant|sp...,0.199326
6749,(2023-03-29) \n\nAccenture and McKinsey aren’t...,0.8,Accenture and McKinsey aren’t the only consult...,ID13498,"[0.06299503147602081, -0.02321638911962509, -0...",Accenture Inc.,McKinsey & Co.,company,,AU,...,Professional services firmAccenture has flagge...,ID13499,"[-0.005730913951992989, -0.0424995943903923, -...",Accenture Inc.,,,,AU,cut|workforce|job,0.320224
6750,(2023-03-29) \n\nAccenture stated that it had ...,0.8,Accenture stated that it had put aside $1.5 bi...,ID13500,"[0.03951822593808174, 0.005480567459017038, 0....",Accenture Inc.,,,,AU,...,"Meanwhile, Accenture last week closed two deals",ID13501,"[0.010188382118940353, -0.0003712352190632373,...",Accenture Inc.,,,,AU,deal,0.477704


In [4]:
import numpy as np

def print_similarity_samples(df, similarity_column, sample_size=10):
#     mask = df[similarity_column].apply(lambda x: isinstance(x, float))
#     df = merged_df[mask]
    df = df.dropna(subset=[similarity_column])
    sorted_df = df.sort_values(by=similarity_column, ascending=False)
    display_df = sorted_df[['snippet1', 'snippet2', similarity_column]]

    intervals = np.arange(0, 1.1, 0.1) 
    result_df = pd.DataFrame()
    for i in range(len(intervals) - 1):
        # Get start and end of interval
        start = intervals[i]
        end = intervals[i + 1]

        # Select rows where similarity is within the interval
        mask = (display_df[similarity_column] >= start) & (display_df[similarity_column] < end)
        selected_rows = display_df[mask]

        # If more than 10 rows are selected, randomly choose 10
        if selected_rows.shape[0] > sample_size:
            selected_rows = selected_rows.sample(n=sample_size)

        # Append the selected rows to the result DataFrame
        result_df = pd.concat([result_df, selected_rows])

    # Reset index of the result DataFrame
    result_df = result_df.reset_index(drop=True)


    # Iterate over the first 10 rows
    for index, row in result_df.iterrows():
        print(row['snippet1'])
        print(row['snippet2'])
        print(row[similarity_column])
        print("\n---\n")  # print a line for separation


In [5]:
# Load dictionaries for companies
import pickle

def load_pickle_file(file_path):
    with open(file_path, 'rb') as f:
        data = pickle.load(f)
    return data

# Assuming the files are in the current directory
company_kw_vec = load_pickle_file('glanos-data/company.kw.vec')
# company_kw_vec_hr = load_pickle_file('glanos-data/company.kw.vec.hr')
company_short_definitions_kw_vec = load_pickle_file('glanos-data/company_short_definitions.kw.vec')
# company_short_definitions_kw_vec_hr = load_pickle_file('glanos-data/company_short_definitions.kw.vec.hr')

In [6]:
cn_descriptors_df = pd.read_csv('cn_descriptors_top.csv', sep='\t').drop(columns=['Unnamed: 6'])
cn_descriptors_df

Unnamed: 0,occurances,company,country,definition,additional definitions,keywords
0,1175619,Technavio,GB,technology research company,consultancy|technology research company,406:confident strategic decisions|403:healthca...
1,1111946,Cable News Network,US,,WarnerMedia company,1111917:warnermedia|13:warner media|6:states|6...
2,336949,Iiroc,CA,self-regulatory company,self-regulatory company,336866:investment dealers|330014:equity market...
3,242464,Rosen,US,law company,investor rights law company,241104:global investor rights|200641:purchaser...
4,196238,Schall,US,,shareholder rights litigation company,195736:national shareholder rights|179758:viol...
...,...,...,...,...,...,...
187934,50,Cdp North America,,research company,disclosure platform research company|investmen...,136:major corporations|136:financial markets|4...
187935,50,Money Carer Foundation,,social company,social company,33:own financial affairs|33:vulnerable adults|...
187936,50,Emerging Markets Private Equity,,trade company,capital company|trade company|trade private in...,40:emerging markets|36:changes|6:private inves...
187937,50,Bryght Ai,,intelligence company,conversational intelligence company|scoring pl...,2:research services


In [7]:
def aggregate_embeddings(keys, embeddings_dict):
    embeddings = []
    for key in keys:
        embeddings.append(embeddings_dict[key])
    return np.mean(embeddings, axis=0)

def get_embedding(company, definition, additional_definitions):
    if pd.isna(additional_definitions) and pd.isna(definition):
        return np.nan
    additional_definitions = additional_definitions.replace(' ', '-').split('|')
    if pd.isna(definition):
        return aggregate_embeddings(additional_definitions, company_kw_vec)
    definition = definition.replace(' ', '-')
    definition_weight = 0.6
    def_embedding = np.array(company_kw_vec[definition])
    return (1-definition_weight)*aggregate_embeddings(additional_definitions, company_kw_vec)+definition_weight*def_embedding
      
cn_descriptors_df['embedding'] = cn_descriptors_df.apply(lambda row: get_embedding(row['company'], row['definition'], row['additional definitions']), axis=1)

In [8]:
company_embedding_dict = cn_descriptors_df.set_index('company')['embedding'].to_dict()
company_embedding_dict = {k: v for k, v in company_embedding_dict.items() if not pd.isna(k)}
company_embedding_dict = {k.lower(): v for k, v in company_embedding_dict.items()}

In [9]:
def find_company_name(words, company_list):
    for i in range(len(words), 0, -1):
        substring = ' '.join(words[:i])
        if substring in company_list:
            return substring
    return None

def get_company_names(company, relation_entity, relation_entity_type, company_list):
#     print('company', company)
    if not pd.isna(company):
        possible_company_names = [company.lower()]
    else:
        possible_company_names = []
    company_names = []
    if relation_entity_type == 'company' and not pd.isna(relation_entity):
        possible_company_names.append(relation_entity.lower())
    for possible_company_name in possible_company_names:
        if possible_company_name in company_list:
            company_name = possible_company_name
            company_names.append(possible_company_name)
        else:
            possible_company_names = possible_company_name.split(' ')
            company_name = find_company_name(possible_company_names, company_list)
            if company_name is not None:
                company_names.append(company_name)
    return company_names

def company_similarity(snippet1, company1, relation_entity1, relation_entity_type1, snippet2, company2, relation_entity2, relation_entity_type2):
    company_list = list(company_embedding_dict.keys())
    companies1 = get_company_names(company1, relation_entity1, relation_entity_type1, company_list)
    companies2 = get_company_names(company2, relation_entity2, relation_entity_type2, company_list)
    
    embeddings1 = aggregate_embeddings(companies1, company_embedding_dict)
    embeddings2 = aggregate_embeddings(companies2, company_embedding_dict)
    if companies1 == [] or companies2 == []:
        similarity = 0.0
    else:
        similarity = cosine_similarity(embeddings1, embeddings2)
    return similarity, companies1, companies2

merged_df['company_sim_data'] = merged_df.apply(lambda row: company_similarity(row['snippet1'], row['company1'], row['relationEntity1'], row['relationEntityType1'], row['snippet2'], row['company2'], row['relationEntity2'], row['relationEntityType2']), axis=1)
merged_df[['company_similarity', 'companies1', 'companies2']] = pd.DataFrame(merged_df['company_sim_data'].tolist(), index=merged_df.index)
merged_df = merged_df.drop(columns=['company_sim_data'])
print_similarity_samples(merged_df, 'company_similarity', sample_size=5)


Mean of empty slice.


invalid value encountered in double_scalars



AON Italia, the Italian unit of US insurer AON, reported revenues
IBM is already on pace for a substantial breakthrough by 2026
0.0

---

Siemens Digital Industries Software and IBM are expanding their long-term partnership by collaborating to develop a combined software solution integrating their respective offerings for systems engineering, service lifecycle management and asset management
Prior to joining Crocs, Mr. Rees served as Managing Director of L.E.K. Consulting in Boston where he founded and led the firm's Retail and Consumer Products Practice for 13 years
0.0

---

Canada also has a Health Data Strategy
GF alleged that IBM disclosed GF IP and trade secrets to IBM partners, includingINTC
0.28775256795742404

---

Gartner expects higher IT spending despite economic headwinds
Tata Consultancy Engineers Ltd. will create the allied amenities
0.24180523895349207

---

Prior to joining Guaranty Trust Bank, Mr. Agbaje served as an auditor at Ernst & Young LLP from 1988 to 1990
Walk

In [10]:
import pycountry
from sentence_transformers import SentenceTransformer
import pickle

sbert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")

def country_code_map():
    code_to_country = {}
    organizations = {'EU': 'European Union'}
    for country in pycountry.countries:
        code_to_country[country.alpha_2] = sbert_model.encode(country.name)
    for org_abbreviation in organizations.keys():
        code_to_country[org_abbreviation] = sbert_model.encode(organizations[org_abbreviation])
    return code_to_country
code_map = country_code_map()
    
def country_similarity(country1, country2):
    if pd.isna(country1) or pd.isna(country2):
        return 0.0
    embeddings1 = aggregate_embeddings([country1], code_map)
    embeddings2 = aggregate_embeddings([country2], code_map)
    similarity = cosine_similarity(embeddings1, embeddings2)
    return similarity

merged_df['country_similarity'] = merged_df.apply(lambda row: country_similarity(row['country1'], row['country2']), axis=1)
print_similarity_samples(merged_df, 'country_similarity', sample_size=5)

[Found on gartner.com] Gartner Predicts Half of Sales Leadership Roles Will
[Found on gartner.com] Gartner Identifies 5 Top Use Cases for AI in Corporate Finance
0.0

---

[Found on gartner.com] We continue to recommend caution
The lowest return has been given by Infosys at -25% and the highest
0.0

---

Why IBM has struggled
should IBM prioritize its dividend?
0.0

---

[Found on bain.com] Eventually, in the run up to COP 28 next year, we aspire to release a roadmap and toolkit for the change hungry climate leaders of the regions for supporting their capabilities to become more sustainable
[Found on bain.com] Bain & Company to play an advisory role in projects identified by the WEF in collaboration with its industry advisors,
0.0

---

IBM will post its Q1 earnings for investors which are anticipating the tech firm to deliver EPS
Infosys issued its earnings results on Thursday
0.0

---

IBM Turns In Earnings Beat
Most of IBM’s chip-making facilities were located in New York state, giv

In [11]:
# pd.set_option('display.max_colwidth', None)
def convert_ndarray(x):
    if isinstance(x, np.ndarray):
        return 0.0
    else:
        return x

merged_df['similarity'] = merged_df['similarity'].apply(convert_ndarray)
merged_df['company_similarity'] = merged_df['company_similarity'].apply(convert_ndarray)
merged_df['country_similarity'] = merged_df['country_similarity'].apply(convert_ndarray)
reduced_df = merged_df.drop(columns=['tooltip1', 'score1', 'id1', 'embedding1', 'company1', 'relationEntity1', 'relationEntityType1', 'classification1',  'relationEntity2', 'relationEntityType2', 'classification2', 'keywords2', 'keywords1', 'tooltip2', 'score2', 'id2','embedding2', 'company2'])
reduced_df = reduced_df[['snippet1', 'snippet2', 'similarity', 
         'companies1', 'companies2', 'company_similarity', 
         'country1', 'country2', 'country_similarity']]
reduced_df.to_csv('reduced_snippet_similarity.tsv', sep='\t', index=False)
merged_df.to_csv('snippet_similarity.tsv', sep='\t', index=False)

In [12]:
# Get a train-val-test split
# import pandas as pd
# import numpy as np


# np.random.seed(42)  # Set the seed
# perm = np.random.permutation(reduced_df.index)
# perm_df = reduced_df.reindex(perm)
# test = perm_df.iloc[:100]
# validation = perm_df.iloc[100:200]
# train = perm_df.iloc[200:]

In [13]:
# Define the bins
bins = [i/10 for i in range(11)]  # This creates a list like [0.0, 0.1, 0.2, ..., 1.0]

# Cut the 'similarity' column into bins
reduced_df['similarity_range'] = pd.cut(reduced_df['similarity'], bins=bins, include_lowest=True)

# Count the number of rows in each bin
counts = reduced_df['similarity_range'].value_counts().sort_index()

# Print the counts
print(counts)

similarity_range
(-0.001, 0.1]     453
(0.1, 0.2]       1140
(0.2, 0.3]       1119
(0.3, 0.4]        827
(0.4, 0.5]        820
(0.5, 0.6]        852
(0.6, 0.7]        696
(0.7, 0.8]        402
(0.8, 0.9]        126
(0.9, 1.0]        271
Name: count, dtype: int64


In [20]:
# Run to construct new val, test, and train sets

reduced_df['similarity_range'] = pd.cut(reduced_df['similarity'], bins=bins, include_lowest=True)
np.random.seed(42)
test_indices = reduced_df.groupby('similarity_range').apply(lambda x: x.sample(n=min(len(x), 10))).index.get_level_values(1)
test_df = reduced_df.loc[test_indices]
test_df = test_df.sample(frac=1)  # frac=1 means return all rows (in random order).
test_df = test_df.reset_index(drop=True)
reduced_df = reduced_df.drop(test_indices)
validation_indices = reduced_df.groupby('similarity_range').apply(lambda x: x.sample(n=min(len(x), 10))).index.get_level_values(1)
validation_df = reduced_df.loc[validation_indices]
validation_df = validation_df.sample(frac=1)  # frac=1 means return all rows (in random order).
validation_df = validation_df.reset_index(drop=True)
train_df = reduced_df.drop(validation_indices)

# Print the number of examples in each dataset
print(f"Number of examples in training dataset: {len(train_df)}")
print(f"Number of examples in validation dataset: {len(validation_df)}")
print(f"Number of examples in test dataset: {len(test_df)}")

train_df.to_csv('dataset/train.tsv', sep='\t', index=False)
validation_df.to_csv('dataset/val.tsv', sep='\t', index=False)
test_df.to_csv('dataset/test.tsv', sep='\t', index=False)


Number of examples in training dataset: 6452
Number of examples in validation dataset: 100
Number of examples in test dataset: 100


In [24]:
import pandas as pd

# Load the .tsv files
test_labeled_df = pd.read_csv('dataset/test_labeled.tsv', sep='\t')
val_labeled_df = pd.read_csv('dataset/val_labeled.tsv', sep='\t')

# # Print the size of reduced_df before excluding
# print(f'Original size of reduced_df: {len(reduced_df)}')

# # Get the 'snippet1' values from the loaded dataframes
# test_snippets = test_labeled_df['snippet1']
# val_snippets = val_labeled_df['snippet1']

# # Exclude rows from reduced_df which the same 'snippet1' value have as either one of the loaded dataframes
# train_df = reduced_df[~reduced_df['snippet1'].isin(test_snippets)]
# train_df = train_df[~train_df['snippet1'].isin(val_snippets)]

# # Print the size of reduced_df after excluding
# print(f'Size of reduced_df after excluding: {len(train_df)}')

# Combine 'snippet1' and 'snippet2' into a single string in each dataframe
test_labeled_df['combined_snippets'] = test_labeled_df['snippet1'] + test_labeled_df['snippet2']
val_labeled_df['combined_snippets'] = val_labeled_df['snippet1'] + val_labeled_df['snippet2']
reduced_df['combined_snippets'] = reduced_df['snippet1'] + reduced_df['snippet2']

# Get the 'combined_snippets' values from the loaded dataframes
test_combined = test_labeled_df['combined_snippets']
val_combined = val_labeled_df['combined_snippets']

print(f'Size of reduced_df before excluding: {len(reduced_df)}')

# Exclude rows from reduced_df which have the same 'combined_snippets' value as either one of the loaded dataframes
train_df = reduced_df[~reduced_df['combined_snippets'].isin(test_combined)]
train_df = train_df[~train_df['combined_snippets'].isin(val_combined)]

# Print the size of reduced_df after excluding
print(f'Size of reduced_df after excluding: {len(train_df)}')

# Drop the 'combined_snippets' column as it is no longer needed
train_df = train_df.drop(columns=['combined_snippets'])
train_df.to_csv('dataset/train.tsv', sep='\t', index=False)


Size of reduced_df before excluding: 6552
Size of reduced_df after excluding: 6362


In [28]:
def print_df(df):
    for index, row in df.iterrows():
        print(row['snippet1'])
        print(row['snippet2'])
        print("\n---\n")  # print a line for separation
print_df(val_labeled_df[90:])
# reduced_df

IBM has invested heavily in digital transformation by helping organizations create smarter enterprises through the use of advanced analytics, automation, and cognitive computing
In addition, IBM has invested heavily in digital transformation by helping organizations create smarter enterprises through the use of advanced analytics, automation, and cognitive computing

---

The proposed IBM Facility would be constructed in the Kwinana-Rockingham Strategic Industrial Area on approximately 30 hectares of vacant industrial land leased from the WA Government
The land secured for the proposed IBM Facility is located adjacent to the Kwinana Lithium Hydroxide Refinery which is owned by Tianqi Lithium Energy Australia , a joint venture between IGO and Tianqi Lithium Corporation

---

Mercer today also announced completion of the acquisition of Advance Asset Management Limited, further expanding its capability as a leading investment multi-manager in Australia
Mercer believes in building brighter

In [17]:
# (-0.001, 0.1]     453 - 
# (0.1, 0.2]       1140
# (0.2, 0.3]       1119
# (0.3, 0.4]        827
# (0.4, 0.5]        821
# (0.5, 0.6]        855
# (0.6, 0.7]        705
# (0.7, 0.8]        423
# (0.8, 0.9]        182
# (0.9, 1.0]        181 
# 1 - basically the same, the only differences are a few extra words in front or after (length check)


# 5 The two sentences are completely equivalent, as they mean the same thing. (i.e. talk about the same company, people, event, values)
# 4 The two sentences are mostly equivalent, but some unimportant details differ. (i.e. same company, people, event but different wording or different values)
# 3 The two sentences are roughly equivalent, but some important information differs/missing. (e.g. same event but different company or same company and similar event)
# 2 The two sentences are not equivalent, but share some details. (e.g. same company but different event)
# 1 The two sentences are not equivalent, but are on the same topic.
# 0 The two sentences are completely dissimilar.

# TODO - slightly modify giving 1 when one is a substring of another

In [159]:
import umap
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler

def full_country_code_map(include_abbrev=False):
    code_to_country = {}
    organizations = {'EU': 'European Union'}
    for country in pycountry.countries:
        code_to_country[country.name] = sbert_model.encode(country.name)
        if include_abbrev:
            code_to_country[country.alpha_2] = sbert_model.encode(country.alpha_2)
    for org_abbreviation in organizations.keys():
        code_to_country[organizations[org_abbreviation]] = sbert_model.encode(organizations[org_abbreviation])
        if include_abbrev:
            code_to_country[org_abbreviation] = sbert_model.encode(org_abbreviation)
    return code_to_country

full_code_map = full_country_code_map()

# Assuming code_map is your dictionary with keys being tooltip values and values being embeddings
keys = list(full_code_map.keys())
values = np.array(list(full_code_map.values()))

# Scale the features to have mean=0 and variance=1
scaler = StandardScaler()
scaled_values = scaler.fit_transform(values)

# Reduce dimensions
reducer = umap.UMAP(random_state=42)
print("reduced")
embedding = reducer.fit_transform(scaled_values)

# Create a DataFrame for easier plotting
df = pd.DataFrame(embedding, columns=['x', 'y'])
df['tooltip'] = keys

# Create an interactive plot
fig = go.Figure(data=go.Scatter(
    x = df['x'],
    y = df['y'],
    mode = 'markers',
    text = df['tooltip'], # set tooltip to be keys of the dictionary
    marker = dict(
        size = 10,
        color = 'rgba(255, 182, 193, .9)',
        line = dict(width = 2)
    )
))

fig.update_layout(title='UMAP projection of the full_code_map', title_x=0.5)
fig.show()
fig.write_image(f"clustered.svg")
fig.write_html(f"clustered.html",
                   include_plotlyjs=True, full_html=True)


reduced


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similar_countries(input_country, country_embeddings_map):
    input_embedding = country_embeddings_map[input_country]
    similarities = []

    for country, embedding in country_embeddings_map.items():
        if country != input_country:
            similarity = cosine_similarity([input_embedding], [embedding])[0][0]
            similarities.append((country, similarity))

    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities
