In [1]:
import os
import warnings
import numpy as np

# move the working directory two levels up, to the root of this repo, so that the
# relative 'notebooks/...' paths used in this notebook resolve.
# os.path.dirname is used instead of splitting on '\\' so this works on POSIX as
# well as on Windows (the split-based version ends up calling os.chdir('') there).
# NOTE: this is not idempotent: re-running the cell climbs two more levels.
os.chdir(os.path.dirname(os.path.dirname(os.getcwd())))
# silence every warning for the rest of the session
warnings.simplefilter("ignore")

from simtag.filter import simtag_filter
import pandas as pd

def import_batch_parquet(batch_prefix, n_batches=5):
	'''
	Load a DataFrame that was saved as consecutively numbered parquet batches.

	batch_prefix : path prefix of the batch files; batch i is read from f'{batch_prefix}{i}.parquet'
	n_batches : number of batches to read (files 0 .. n_batches-1); defaults to 5
	returns : one DataFrame with the batches concatenated in order and a fresh 0..n-1 index
	raises : ValueError if n_batches is smaller than 1
	'''
	if n_batches < 1:
		raise ValueError(f'n_batches must be >= 1, got {n_batches}')
	batches = [pd.read_parquet(f'{batch_prefix}{i}.parquet') for i in range(n_batches)]
	return pd.concat(batches, ignore_index=True)

def store_batch_parquet(df, batch_prefix, n_batches=5):
	'''
	Save a DataFrame as consecutively numbered parquet batches.

	The rows are cut into n_batches contiguous chunks of len(df) // n_batches rows;
	the last chunk also receives the remainder, so no row is dropped.

	df : the DataFrame to store
	batch_prefix : path prefix of the batch files; chunk i is written to f'{batch_prefix}{i}.parquet'
	n_batches : number of files to write; defaults to 5
	raises : ValueError if n_batches is smaller than 1
	'''
	if n_batches < 1:
		raise ValueError(f'n_batches must be >= 1, got {n_batches}')
	chunk_size = len(df) // n_batches
	last = n_batches - 1
	for i in range(n_batches):
		start = i * chunk_size
		# the final chunk runs to the end of the frame to absorb the remainder
		end = (i + 1) * chunk_size if i < last else len(df)
		df.iloc[start:end].to_parquet(f"{batch_prefix}{i}.parquet", engine='pyarrow')




# setup the library

### import data

In [2]:
import pandas as pd
import ast

# list of tweets, read from csv; rows without a 'hashtags' value are discarded
df = (
    pd.read_csv('notebooks/twitter-news/news_tweets.csv', lineterminator='\n')
    .dropna(subset='hashtags')
    .reset_index()
)
# the 'hashtags' column is stored as text: parse each cell back into a python literal
df['hashtags'] = df['hashtags'].apply(ast.literal_eval)
sample_list = df['hashtags'].tolist()

# extract hashtags: sorted list of the unique tags found across all samples
hashtags_list = df['hashtags'].dropna().tolist()
hashtags = sorted({tag for tags in hashtags_list for tag in tags})

### process data

In [27]:
# initiate recommender: build the simtag engine over the parsed hashtag samples,
# with the all-MiniLM-L6-v2 sentence-transformers model and int8 quantization
engine = simtag_filter(
    sample_list=sample_list,
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    quantization='int8'
)
# tag2index and indexed_sample_list are the inputs engine.hard_tag_filtering
# receives in compute_indices further down
tag2index, indexed_sample_list = engine.index_samples(sample_list) # used for validation

In [28]:
# first run only: if the M_*.parquet batches do not exist yet, uncomment the two
# code lines below to compute M and save it
# M, df_M = engine.compute_M(method='encoding')
# stored as batches here; a single file would work as well
# store_batch_parquet(df_M, 'notebooks/twitter-news/M_')

# otherwise: load the previously stored batches of M
df_M = import_batch_parquet(batch_prefix='notebooks/twitter-news/M_')

# dot_product

In [29]:
# hand the loaded M to the engine, selecting 'dot_product' as covariate transformation
engine.load_M(df_M, covariate_transformation='dot_product')

In [30]:
# first run only: if the encoded samples were not stored yet, uncomment the three
# code lines below to encode every sample and save the vectors as batches
# NOTE(review): the store prefix below ('quantized_samples_encoded_') differs from
# the prefix loaded further down ('samples_encoded_') — confirm which one is intended
# sample_vectors = engine.encode_samples(sample_list)
# samples_encoded = pd.DataFrame([sample_vectors], index=['vector']).T
# store_batch_parquet(samples_encoded, 'notebooks/twitter-news/quantized_samples_encoded_')

# otherwise: load the stored batches; the vectors live in the 'vector' column
samples_encoded = import_batch_parquet(batch_prefix='notebooks/twitter-news/samples_encoded_')
sample_vectors = samples_encoded.vector.tolist()
# neighbours structure over the sample vectors (k=1000), passed to
# engine.soft_tag_filtering by the search cells below
nbrs_covariate = engine.compute_nbrs(sample_vectors, k=1000)

# validation

In [161]:
from IPython.display import HTML

def compute_html(query_tag_list, sample_list, search_indices, covariate_indices, k, t, display_code=True, return_code=False):
	'''
	Render the first k search hits as colour-coded HTML lines, best score first.

	Every tag of a hit is drawn on a coloured background and contributes to the score of the hit:
	- tag present in query_tag_list : intensity 1 (bright red), +1
	- tag present in the first t covariate hits : intensity 0.6 (dark red), +0.5
	- any other tag : intensity 0, -0.3

	query_tag_list : tags of the query
	sample_list : list of samples, each sample being an iterable of tags
	search_indices : indices (into sample_list) of the hits to render; may hold fewer than k entries
	covariate_indices : indices (into sample_list) of the covariate search hits
	k : maximum number of hits to render
	t : the number of samples in covariate_indices from where we pick the dark red words
	display_code : if True, every line is shown with display(HTML(...))
	return_code : if True, the list of HTML lines is returned (otherwise None)

	Hits that do not point at a valid sample are skipped instead of producing empty lines.
	'''
	from html import escape

	query_tags = set(query_tag_list)
	# tags of the first t covariate hits: identical for every rendered hit, so computed once
	top_tags = {tag for sample_index in covariate_indices[0:t] for tag in sample_list[sample_index]}

	rows = list()
	for sample_index in search_indices[:max(k, 0)]:
		try:
			data = sample_list[sample_index]
		except (IndexError, TypeError):
			# the hit does not reference an existing sample: nothing to render
			continue

		score = 0
		html_code = ''
		for word in data:
			if word in query_tags:
				scaled_intensity = 1
				score += 1
			elif word in top_tags:
				scaled_intensity = 0.6
				score += 0.5
			else:
				scaled_intensity = 0
				score -= 0.3
			g = int(20 * (1 - scaled_intensity))
			r = int(255 * scaled_intensity * 0.7)  # red component, dimmed to 70%
			color = f"rgb({r},{g},0)"
			# tags come from raw tweets: escape them before embedding them in markup
			html_code += f"<span style='background-color:{color}; color:white'><b>{escape(str(word))}</b></span> "

		score = float(round(score, 4))
		rows.append((score, '<b>' + str(score) + '</b> - ' + html_code + '<br>'))

	# highest score first; equal scores fall back to the html text, descending
	rows.sort(reverse=True)
	html_list = [html for _, html in rows]

	if display_code:
		for html in html_list:
			display(HTML(html))

	if return_code:
		return html_list

def compute_indices(query_tag_list, tag2index, indexed_sample_list, nbrs_semantic=None, nbrs_covariate=None, search_types=['hard', 'jaccard', 'covariate', 'semantic', 'covariate_semantic']):
	'''
	Run the requested searches for query_tag_list and collect the resulting indices.

	Relies on the notebook globals `engine` and `sample_list`.

	query_tag_list : tags of the query
	tag2index, indexed_sample_list : passed through to engine.hard_tag_filtering
	nbrs_semantic : neighbours structure for the semantic search; when None, 'semantic' yields an empty array
	nbrs_covariate : neighbours structure for the 'covariate' and 'covariate_semantic' searches
	search_types : which searches to run (the default list is only read, never mutated)
	returns : dict mapping every requested search type to its indices

	Only the searches named in search_types are executed, so an unrequested search
	can neither fail nor cost time.
	'''
	indices = dict()

	# hard search: engine.hard_tag_filtering with search_type='AND'
	if 'hard' in search_types:
		indices['hard'] = engine.hard_tag_filtering(tag2index, indexed_sample_list, query_tag_list, search_type='AND')

	# jaccard search: same call with search_type='OR'
	if 'jaccard' in search_types:
		indices['jaccard'] = engine.hard_tag_filtering(tag2index, indexed_sample_list, query_tag_list, search_type='OR')

	# covariate search
	if 'covariate' in search_types:
		query_vector = engine.encode_query(list_tags=query_tag_list, allow_new_tags=True, print_new_tags=True)
		indices['covariate'], _ = engine.soft_tag_filtering(nbrs_covariate, sample_list, query_vector)

	# semantic search: the query is the model encoding of the stringified tag list
	if 'semantic' in search_types:
		if nbrs_semantic is not None:
			query_vector = engine.model.encode(str(query_tag_list))
			indices['semantic'], _ = engine.soft_tag_filtering(nbrs_semantic, sample_list, query_vector)
		else:
			indices['semantic'] = np.array([])

	# covariate semantic search: best effort, an empty array is returned on failure
	if 'covariate_semantic' in search_types:
		try:
			query_vector = engine.model.encode(str(query_tag_list))
			indices['covariate_semantic'], _ = engine.soft_tag_filtering(nbrs_covariate, sample_list, query_vector)
		except Exception:
			# Exception rather than a bare except, so KeyboardInterrupt/SystemExit still propagate
			indices['covariate_semantic'] = np.array([])

	return indices

In [155]:
# a single query list drives both the search here and the highlighting done by
# compute_html in the next cells, so the two cannot drift apart
query_tag_list = ['democracy', 'Republicans']
indices = compute_indices(query_tag_list=query_tag_list, indexed_sample_list=indexed_sample_list, tag2index=tag2index, nbrs_covariate=nbrs_covariate, search_types=['hard', 'jaccard', 'covariate'])
indices['hard']

array([2247], dtype=int64)

In [162]:
# render up to 7 hits of the hard (AND) search; tags from the first 7 covariate hits get the secondary highlight
compute_html(query_tag_list, sample_list, indices['hard'], indices['covariate'], k=7, t=7, display_code=True, return_code=False)

In [163]:
# render up to 7 hits of the jaccard (OR) search; tags from the first 7 covariate hits get the secondary highlight
compute_html(query_tag_list, sample_list, indices['jaccard'], indices['covariate'], k=7, t=7, display_code=True, return_code=False)

In [164]:
# render up to 7 hits of the covariate search; the same hits also provide the secondary-highlight tags
compute_html(query_tag_list, sample_list, indices['covariate'], indices['covariate'], k=7, t=7, display_code=True, return_code=False)

# ignored search data

We measure how much relevant data a classic (exact tag match) search ignores compared with a covariate search.

In [None]:
from collections import Counter
import itertools
import numpy as np

def count_frequency(search_results):
	'''
	Count how often every tag occurs across the given samples.

	search_results : iterable of samples, each an iterable of tags
	returns : list of (tag, count) pairs, most frequent first; tags with the
	          same count keep the order in which they were first seen
	'''
	tag_counts = Counter(itertools.chain.from_iterable(search_results))
	return tag_counts.most_common()

# tag frequencies over every sample of the dataset, most frequent first
count_frequency(sample_list)

In [None]:
import statistics
from tqdm import tqdm
from sentence_transformers.util import cos_sim

query_tag_list = ['trump', 'Putin', 'Russia', 'Politics', 'China', 'FBI']				# .35 data is never considered
# query_tag_list = ['BTC', 'bitcoin', 'crypto', 'investment']							# .16 data is never considered
# query_tag_list = ['100DaysOfCode', 'coding', 'Robotics', 'dailybites', 'Learnings']	# .57 data is never considered

# perform search
query_vector = engine.encode_query(list_tags=query_tag_list, allow_new_tags=True, print_new_tags=True, ignore_weights=True)
indices, search_results = engine.soft_tag_filtering(nbrs_covariate, sample_list, query_vector)
# count_frequency(search_results[0:1000])[0:10]

# traditional search
hard_indices, hard_filter_results = engine.jaccard_tag_filtering(sample_list, query_tag_list)

### SAMPLES THAT CAN ONLY BE FOUND WITH SIMTAG
# hard_indices is flattened into a set once, so every membership test is O(1)
# instead of a scan over all the hard-search hits
hard_index_set = set(np.asarray(hard_indices).ravel().tolist())
missing = [sample_list[index] for index in indices if index not in hard_index_set]

def compute_sim_score(query_tag_list, to_compute, benchmark=0, benchmarks_limit=None):
	'''
	Score every sample of to_compute against the query tags.

	The score of a sample is the mean, over the query tags, of the highest cosine
	similarity between that query tag and any tag of the sample.

	query_tag_list : tags of the query; every tag must be known to get_vector
	to_compute : iterable of samples, each a non-empty collection of tags known to get_vector
	benchmark : samples scoring below this value count towards benchmarks_limit
	benchmarks_limit : when not None, scoring stops as soon as this many samples scored
	                   below benchmark since the last new lowest score was found
	returns : list of scores in the order of to_compute (shorter when stopped early)
	'''
	# get_vector filters engine.df_M on every call and the same tags come back for
	# every (query tag, sample tag) pair, so each vector is looked up only once.
	# The cache is filled lazily: lookups still happen in the original order.
	vector_cache = dict()

	def _vector(tag):
		if tag not in vector_cache:
			vector_cache[tag] = get_vector(tag)
		return vector_cache[tag]

	limit_counter = 0
	temp_benchmark = benchmark

	sample_scores = list()
	for sample_missing in tqdm(to_compute):
		# similarity of every query tag with every tag of the sample
		similarities = [
			[cos_sim(_vector(q), _vector(m)).tolist()[0][0] for m in sample_missing]
			for q in query_tag_list
		]
		avg_score = statistics.mean([max(row) for row in similarities])

		if avg_score < benchmark:
			limit_counter += 1
			if avg_score < temp_benchmark:
				# new lowest score: it becomes the running low and the counter restarts
				temp_benchmark = avg_score
				limit_counter = 0

		sample_scores.append(avg_score)

		if benchmarks_limit is not None and limit_counter >= benchmarks_limit:
			# too many below-benchmark samples in a row: stop early
			break

	return sample_scores

def get_vector(tag):
	'''
	Look up a tag in engine.df_M.

	tag : value to match against the 'tags' column
	returns : the 'vector_tags' value of the first matching row
	raises : IndexError when no row matches
	'''
	tag_table = engine.df_M
	matching_vectors = tag_table.loc[tag_table['tags'] == tag, 'vector_tags']
	return matching_vectors.values[0]

# compute benchmark
# (earlier variant, kept for reference: similarity between the encoded query and
# the encoded last hard-search result)
# tag_vector = engine.encode_query(list_tags=query_tag_list, allow_new_tags=False)
# sample_vector = engine.encode_query(list_tags=hard_filter_results[-1], allow_new_tags=False)
# benchmark = cos_sim(tag_vector, sample_vector).tolist()[0][0]

# benchmark = lowest score among the last 50 results of the traditional search
benchmark_scores = compute_sim_score(query_tag_list, hard_filter_results[-50:])
benchmark = min(benchmark_scores)

# score at most 4000 of the samples the traditional search missed;
# benchmarks_limit=350 lets compute_sim_score stop early, in which case
# sample_scores is shorter than to_compute
to_compute = missing[0:4000]
sample_scores = compute_sim_score(query_tag_list, to_compute, benchmark, benchmarks_limit=350)
# column 0 holds the sample, column 1 its score; rows sorted by score, best first
df_missing = pd.DataFrame([to_compute, sample_scores]).T.sort_values(1, ascending=False)

# keep only the missed samples that score above the benchmark
df_missing_filtered = df_missing[df_missing[1]>benchmark]
len_missing = len(df_missing_filtered)
len_hard = len(hard_filter_results)

# fraction (0..1, rounded to 4 decimals) of the valid samples that the traditional search missed
print('samples found:\t\t', len_hard)
print('sample missing:\t\t',len_missing)
print('% of data ignored:\t',round(len_missing/(len_missing+len_hard), 4))

In [None]:
import pandas as pd
import plotly.express as px

# create a histogram of the scores of the missed samples (nbins=100)
fig = px.histogram(x=df_missing[1], nbins=100)
# reverse the direction of the x axis
fig.update_xaxes(autorange="reversed")

# show the plot
fig.show()

In [None]:
# show all the samples scoring above the benchmark that the traditional search did not return
# e.g. for the query ['trump', 'Putin', 'Russia', 'Politics', 'China', 'FBI']
df_missing_filtered[0].values.tolist()