Data source (Kaggle dataset): https://www.kaggle.com/datasets/deeguy/twitter-news

In [1]:
import os
import warnings

# move the working directory two levels up, to the root of this repo.
# os.path.dirname is separator-agnostic; splitting on '\\' only worked on
# Windows and produced an empty path (os.chdir('') fails) on POSIX systems.
# NOTE: this is relative to the current directory, so re-running the cell
# moves up two more levels — run it once per kernel session.
os.chdir(os.path.dirname(os.path.dirname(os.getcwd())))
warnings.simplefilter("ignore")

from simtag.filter import simtag_filter
import pandas as pd

def import_batch_parquet(batch_prefix, n_batches=5):
	"""Load a DataFrame that was stored as numbered parquet batches.

	Reads `{batch_prefix}0.parquet` ... `{batch_prefix}{n_batches - 1}.parquet`
	and concatenates them in that order, renumbering the index from 0.

	Args:
		batch_prefix: path prefix shared by all batch files.
		n_batches: number of batch files to read. Defaults to 5, the number
			of files this notebook has always used.

	Returns:
		pandas.DataFrame holding the rows of every batch.

	Raises:
		ValueError: from pd.concat when `n_batches` is 0 (nothing to concatenate).
	"""
	batches = [pd.read_parquet(f'{batch_prefix}{i}.parquet') for i in range(n_batches)]
	return pd.concat(batches, ignore_index=True)

def store_batch_parquet(df, batch_prefix, n_batches=5):
	"""Write a DataFrame as consecutive numbered parquet batches.

	The rows are split into `n_batches` contiguous chunks of
	`len(df) // n_batches` rows; the last chunk also receives the remainder.
	Chunk `i` is written to `{batch_prefix}{i}.parquet` with the pyarrow engine.

	Args:
		df: DataFrame to store.
		batch_prefix: path prefix shared by all batch files.
		n_batches: number of files to write. Defaults to 5, the number of
			files this notebook has always used.

	Raises:
		ValueError: if `n_batches` is smaller than 1.
	"""
	if n_batches < 1:
		raise ValueError("n_batches must be at least 1")

	chunk_size = len(df) // n_batches
	for i in range(n_batches):
		start = i * chunk_size
		# every batch but the last has exactly chunk_size rows;
		# the last one runs to the end so no remainder row is lost
		end = start + chunk_size if i < n_batches - 1 else len(df)
		df.iloc[start:end].to_parquet(f"{batch_prefix}{i}.parquet", engine='pyarrow')




# setup the library

### import data

In [2]:
import pandas as pd
import ast

# list of tweets, read from the CSV export of the dataset
df = pd.read_csv('notebooks/twitter-news/news_tweets.csv', lineterminator='\n')
# keep only tweets that carry hashtags
df = df.dropna(subset='hashtags')
# renumber rows 0..n-1 (the previous index is kept as an 'index' column)
df = df.reset_index()
# each cell is a string holding a Python literal (a list of tags); parse it
# with literal_eval, which only accepts literals and never executes code
df['hashtags'] = df['hashtags'].apply(ast.literal_eval)
sample_list = df['hashtags'].tolist()

# extract hashtags: sorted list of the distinct tags across all tweets
hashtags_list = df['hashtags'].dropna().tolist()
hashtags = sorted({tag for tags in hashtags_list for tag in tags})

### process data

In [3]:
# initiate recommender on the parsed hashtag lists (one list of tags per tweet)
engine = simtag_filter(
    sample_list=sample_list, 
    # presumably the embedding size of the model below (all-MiniLM-L6-v2
    # produces 384-dimensional vectors per its model card) — TODO confirm
    covariate_vector_length=384,
    model_name='sentence-transformers/all-MiniLM-L6-v2'
)

In [4]:
# First run only: if the M_*.parquet batch files do not exist yet,
# uncomment the next lines to compute M and save it.
# M, df_M = engine.compute_M(method='encoding')
# store as batches, you can store as a unique file if you wish
# store_batch_parquet(df_M, 'notebooks/twitter-news/M_')

# Otherwise (batch files already on disk): load M from the batches
df_M = import_batch_parquet(batch_prefix='notebooks/twitter-news/M_')

# hand the loaded table to the engine; get_vector later reads it back as engine.df_M
engine.load_M(df_M)

In [5]:
# First run only: if the samples_encoded_*.parquet batches do not exist yet,
# uncomment the next lines to encode the samples and save them.
# samples_vectors = engine.encode_samples(sample_list)
# samples_encoded = pd.DataFrame([[x[0] for x in samples_vectors]], index=['vector']).T
# store_batch_parquet(samples_encoded, 'notebooks/twitter-news/samples_encoded_')

# Otherwise load the stored encodings and pass them to engine.compute_nbrs
samples_encoded = import_batch_parquet('notebooks/twitter-news/samples_encoded_')
sample_vectors = samples_encoded['vector'].tolist()
nbrs = engine.compute_nbrs(sample_vectors, k=50000)

# semantic tag search

### naive

In [6]:
from collections import Counter
import itertools
import numpy as np

def count_frequency(search_results):
	"""Count how often each tag occurs across a list of tag lists.

	Args:
		search_results: iterable of iterables of hashable tags.

	Returns:
		list of (tag, count) tuples ordered from most to least frequent;
		tags with the same count keep the order in which they were first seen.
	"""
	tag_counts = Counter(itertools.chain.from_iterable(search_results))
	return tag_counts.most_common()

In [52]:
# tag frequencies over the whole dataset, most frequent first
# (prints the complete list, one entry per distinct tag)
print(count_frequency(sample_list))



In [80]:
import statistics
from tqdm import tqdm
from sentence_transformers.util import cos_sim

query_tag_list = ['trump', 'Putin', 'Russia', 'Politics', 'China', 'FBI']				# .35 data is never considered
# query_tag_list = ['BTC', 'bitcoin', 'crypto', 'investment']							# .16 data is never considered
# query_tag_list = ['100DaysOfCode', 'coding', 'Robotics', 'dailybites', 'Learnings']	# .57 data is never considered

# perform search
query_vector = engine.encode_query(list_tags=query_tag_list, allow_new_tags=True, print_new_tags=True)
indices, search_results = engine.soft_tag_filtering(nbrs, sample_list, query_vector)
# count_frequency(search_results[0:1000])[0:10]

# traditional search
hard_indices, hard_filter_results = engine.jaccard_tag_filtering(sample_list, query_tag_list)
# print(len(hard_filter_results))

# count_frequency(hard_filter_results[0:1000])[0:10]

### SAMPLES THAT CAN ONLY BE FOUND WITH SIMTAG
# Samples returned by the soft search whose index the hard filter did not return,
# kept in soft-search order. A set makes each membership test O(1) instead of a
# scan over all hard indices.
# assumes hard_indices is a flat sequence of integer positions — TODO confirm
hard_index_set = set(hard_indices)
missing = [sample_list[index] for index in indices if index not in hard_index_set]

# score_list = list()
# tag_vector = engine.encode_query(list_tags=query_tag_list, allow_new_tags=False)
# for missing_sample in tqdm(missing[0:4000]):
# 	sample_vector = engine.encode_query(list_tags=missing_sample, allow_new_tags=False)
# 	score = cos_sim(tag_vector, sample_vector).tolist()[0][0]
# 	score_list.append(score)

def compute_sim_score(query_tag_list, to_compute, benchmark=0, benchmarks_limit=None):
	"""Score each sample by how closely its tags match the query tags.

	For every sample, each query tag is compared with every tag of the sample
	(cosine similarity of the vectors returned by `get_vector`). The best match
	per query tag is kept, and the sample score is the mean of those best matches.

	Args:
		query_tag_list: tags of the query.
		to_compute: iterable of samples, each an iterable of tags.
		benchmark: scores below this value count towards early stopping.
		benchmarks_limit: if not None, stop as soon as this many
			below-benchmark scores have been counted since the last new
			lowest score. None disables early stopping.

	Returns:
		list of scores, one per processed sample, in input order. The list is
		shorter than `to_compute` when early stopping was triggered.

	Raises:
		ValueError: if a sample has no tags (max() of an empty sequence).
		statistics.StatisticsError: if `query_tag_list` is empty.
		Any error raised by `get_vector` for a tag it cannot find.
	"""
	vector_cache = {}

	def cached_vector(tag):
		# get_vector filters the whole tag table on every call and the same
		# tags recur across samples, so look each tag up only once per run
		if tag not in vector_cache:
			vector_cache[tag] = get_vector(tag)
		return vector_cache[tag]

	limit_counter = 0
	lowest_score = benchmark

	sample_scores = list()
	for sample_tags in tqdm(to_compute):
		# best similarity of each query tag against any tag of this sample
		best_per_query = [
			max(
				cos_sim(cached_vector(q), cached_vector(m)).tolist()[0][0]
				for m in sample_tags
			)
			for q in query_tag_list
		]
		avg_score = statistics.mean(best_per_query)

		if avg_score < benchmark:
			limit_counter += 1
			if avg_score < lowest_score:
				# new lowest score: remember it and restart the count
				lowest_score = avg_score
				limit_counter = 0

		sample_scores.append(avg_score)

		# early stop: enough below-benchmark samples since the last new low
		if benchmarks_limit is not None and limit_counter >= benchmarks_limit:
			break

	return sample_scores

def get_vector(tag):
	"""Return the stored vector of `tag` from `engine.df_M`.

	Takes the `vector_tags` value of the first row whose `tags` column equals
	`tag`. Raises IndexError when no row matches.
	"""
	tag_rows = engine.df_M['tags'] == tag
	return engine.df_M.loc[tag_rows, 'vector_tags'].values[0]

# compute benchmark
# (earlier approach, kept for reference: one cosine similarity between the
# encoded query and the last hard-filter result)
# tag_vector = engine.encode_query(list_tags=query_tag_list, allow_new_tags=False)
# sample_vector = engine.encode_query(list_tags=hard_filter_results[-1], allow_new_tags=False)
# benchmark = cos_sim(tag_vector, sample_vector).tolist()[0][0]

# benchmark = lowest similarity score among the last 50 hard-filter results
benchmark_scores = compute_sim_score(query_tag_list, hard_filter_results[-50:])
benchmark = min(benchmark_scores)

# score at most the first 4000 soft-only samples; scoring stops early once 350
# below-benchmark scores have been counted since the last new lowest score
to_compute = missing[0:4000]
sample_scores = compute_sim_score(query_tag_list, to_compute, benchmark, benchmarks_limit=350)
# column 0 = sample tag list, column 1 = score, sorted by score descending.
# NOTE(review): after an early stop sample_scores is shorter than to_compute;
# the unscored rows presumably end up with a missing score — confirm
df_missing = pd.DataFrame([to_compute, sample_scores]).T.sort_values(1, ascending=False)

# keep only the soft-only samples that score strictly above the benchmark
df_missing_filtered = df_missing[df_missing[1]>benchmark]
len_missing = len(df_missing_filtered)
len_hard = len(hard_filter_results)

# share of valid samples missed by the hard filter:
# missing / (missing + found by the hard filter)
print('samples found:\t\t', len_hard)
print('sample missing:\t\t',len_missing)
print('% of data ignored:\t',round(len_missing/(len_missing+len_hard), 4))

100%|██████████| 50/50 [00:47<00:00,  1.06it/s]
 46%|████▌     | 1842/4000 [02:19<02:43, 13.17it/s]


samples found:		 803
sample missing:		 431
% of data ignored:	 0.3493


In [90]:
import pandas as pd
import plotly.express as px

# create a histogram of the similarity scores (column 1 of df_missing), requesting 100 bins
fig = px.histogram(x=df_missing[1], nbins=100)
# reverse the x axis direction
fig.update_xaxes(autorange="reversed")

# show the plot
fig.show()

In [89]:
# tag lists of the samples that scored above the benchmark but were not returned by the hard filter
df_missing_filtered[0].values.tolist()

[['Fbi', 'News', 'DonaldTrump', 'servizisegreti', 'USA', 'Trump'],
 ['FBIRaidsMarALago',
  'FBIRaid',
  'Trump',
  'TrumpIsGoingToJail',
  'FBIRaid',
  'USArmy',
  'news',
  'Trending',
  'DonaldTrump',
  'USA'],
 ['putin', 'carleyshimkus', 'joebiden', 'kamalaharris', 'politics'],
 ['putin',
  'accesshollywood',
  'Charlottesville',
  'covid',
  'Insurrection',
  'law',
  'declassified',
  'fbi'],
 ['USA',
  'DonaldTrump',
  'obstruction',
  'violations',
  'Trump',
  'Espionage',
  'BREAKING',
  'NEWS'],
 ['News', 'Raid', 'Biden', 'Trump', 'politics', 'government'],
 ['Garland',
  'Trump',
  'Biden',
  'Raid',
  'politics',
  'government',
  'news',
  'media'],
 ['Raid', 'Trump', 'Biden', 'News', 'politics', 'media', 'government'],
 ['politics', 'government', 'Biden', 'Trump', 'News', 'Media'],
 ['Trump', 'news', 'government', 'politics', 'media', 'Biden'],
 ['Biden', 'Trump', 'politics', 'government', 'news', 'fakenews', 'media'],
 ['GarrettZiegler',
  'BreakingNews',
  'News',
  'US

### weighted

In [9]:
# query given as tag -> weight, passed to encode_query through dict_tags
query_tag_dict = {
    'trump' : 1,
    'democracy' : 0.4,
    # not an exact tag: with allow_new_tags=True and print_new_tags=True the
    # recorded output of this cell reports "republicans. -> republicans"
    'republicans.' : 2
}

# perform search
query_vector = engine.encode_query(dict_tags=query_tag_dict, allow_new_tags=True, print_new_tags=True)
indices, search_results = engine.soft_tag_filtering(nbrs, sample_list, query_vector)
# display the first five results
search_results[0:5]

republicans. -> republicans


[['trump', 'gop', 'republicans'],
 ['Republicans', 'Democrat'],
 ['republicans', 'Trump', 'Biden'],
 ['MAGA', 'REPUBLICANS', 'GOP', 'TRUMP'],
 ['GOP', 'Republicans']]

# validation

In [10]:
query_tag_list = ['news', 'Trump']
# position, within each result list, of the sample to inspect
result_index = 3

# semantic search
query_vector = engine.encode_query(list_tags=query_tag_list)
soft_indices, soft_filter_results = engine.soft_tag_filtering(nbrs, sample_list, query_vector)
soft_raw_scores, soft_mean_scores = engine.compute_neighbor_scores(soft_filter_results[result_index], query_tag_list, remove_max=False)

# traditional search: engine.jaccard_tag_filtering is the baseline used for the
# comparison. A hard_tag_filtering call used to precede it, but its results were
# overwritten by this assignment before being read, so it only cost run time.
hard_indices, hard_filter_results = engine.jaccard_tag_filtering(sample_list, query_tag_list)
hard_raw_scores, hard_mean_scores = engine.compute_neighbor_scores(hard_filter_results[result_index], query_tag_list, remove_max=False)

# show the selected soft result, then the selected traditional result
engine.show_results(query_tag_list, soft_raw_scores, soft_filter_results[result_index], visualization_type='mean', power=0.4, title=f'{query_tag_list}', visualize=True, return_html=False)
engine.show_results(query_tag_list, hard_raw_scores, hard_filter_results[result_index], visualization_type='mean', power=0.4, title=f'{query_tag_list}', visualize=True, return_html=False)

### lookup data

In [11]:
# semantic tag filtering
# df.iloc[soft_indices]

In [12]:
# traditional tag filtering
# df.iloc[hard_indices]

### visualize flattened results

In [13]:
# 'mean' visualization of the selected soft result, then of the selected traditional result
# (same two calls that end the validation cell)
engine.show_results(query_tag_list, soft_raw_scores, soft_filter_results[result_index], visualization_type='mean', power=0.4, title=f'{query_tag_list}', visualize=True, return_html=False)
engine.show_results(query_tag_list, hard_raw_scores, hard_filter_results[result_index], visualization_type='mean', power=0.4, title=f'{query_tag_list}', visualize=True, return_html=False)

### visualize granular results

In [14]:
# 'raw' visualization of the selected soft result
engine.show_results(query_tag_list, soft_raw_scores, soft_filter_results[result_index], visualization_type='raw', power=0.4, title=f'{query_tag_list}', visualize=True, return_html=False)
# blank line between the two visualizations
print()
# 'raw' visualization of the selected traditional result
engine.show_results(query_tag_list, hard_raw_scores, hard_filter_results[result_index], visualization_type='raw', power=0.4, title=f'{query_tag_list}', visualize=True, return_html=False)


