In [1]:
import os
import warnings

# Move to the repository root so the relative 'notebooks/...' paths used in
# the following cells resolve. The notebook is expected to start two levels
# below the root. os.path.dirname is used instead of splitting on '\\' so the
# cell also works with POSIX paths, and the guard keeps it idempotent:
# re-running the cell (or starting at the root already) no longer climbs two
# more directories.
if not os.path.isdir('notebooks'):
    os.chdir(os.path.dirname(os.path.dirname(os.getcwd())))
# Silence all warnings for the rest of the session.
warnings.simplefilter("ignore")

from simtag.filter import simtag_filter
import pandas as pd

def import_batch_parquet(batch_prefix, n_batches=5):
	"""Load a DataFrame that was stored as numbered parquet batches.

	Reads the files ``{batch_prefix}0.parquet`` ... ``{batch_prefix}{n_batches-1}.parquet``
	in order and concatenates them into one frame with a fresh 0..n-1 index.

	Args:
		batch_prefix: Path prefix shared by all batch files.
		n_batches: Number of batch files to read (defaults to 5).

	Returns:
		The concatenated ``pandas.DataFrame``.

	Raises:
		ValueError: If ``n_batches`` is smaller than 1.
	"""
	if n_batches < 1:
		raise ValueError(f"n_batches must be >= 1, got {n_batches}")
	frames = [pd.read_parquet(f'{batch_prefix}{i}.parquet') for i in range(n_batches)]
	return pd.concat(frames, ignore_index=True)

def store_batch_parquet(df, batch_prefix, n_batches=5):
	"""Write a DataFrame to disk as numbered parquet batches.

	The rows are split into ``n_batches`` contiguous chunks of
	``len(df) // n_batches`` rows each; the last chunk also takes the
	remainder. Chunk ``i`` is written to ``{batch_prefix}{i}.parquet`` with
	the pyarrow engine.

	Args:
		df: DataFrame to store.
		batch_prefix: Path prefix shared by all batch files.
		n_batches: Number of files to write (defaults to 5).

	Raises:
		ValueError: If ``n_batches`` is smaller than 1.
	"""
	if n_batches < 1:
		raise ValueError(f"n_batches must be >= 1, got {n_batches}")
	total = len(df)
	chunk_size = total // n_batches
	for i in range(n_batches):
		start = i * chunk_size
		# The final batch absorbs the rows left over by the integer division.
		end = total if i == n_batches - 1 else start + chunk_size
		df.iloc[start:end].to_parquet(f"{batch_prefix}{i}.parquet", engine='pyarrow')

In [2]:
import pandas as pd
import ast

# Load the news tweets from CSV and keep only the rows that have hashtags.
df = pd.read_csv('notebooks/twitter-news/news_tweets.csv', lineterminator='\n')
df = df.dropna(subset='hashtags')
# reset_index() without drop=True keeps the previous index as an 'index' column.
df = df.reset_index()
# The hashtags column holds text; ast.literal_eval parses each cell into a
# Python object (iterated below as a collection of tags).
df['hashtags'] = df['hashtags'].apply(ast.literal_eval)
sample_list = df['hashtags'].tolist()

# Collect the sorted list of unique hashtags across all samples.
hashtags = sorted({tag for tags in df['hashtags'].dropna() for tag in tags})

In [3]:
# Create the recommender engine from the parsed hashtag samples.
engine_options = dict(
    sample_list=sample_list,
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    quantization='int8',
)
engine = simtag_filter(**engine_options)

In [5]:
# Compute M from scratch. compute_M returns a pair; only df_M is used in the
# cells visible here.
# NOTE(review): the recorded output of this cell shows two progress bars of
# roughly ten minutes each, so persisting df_M is worth considering.
M, df_M = engine.compute_M(method='encoding')
# To persist df_M for later runs, uncomment the next line. It stores the frame
# as several batch files; a single file would work as well.
# store_batch_parquet(df_M, 'notebooks/twitter-news/M_')

# If the batch files already exist, skip compute_M above and load them instead:
# df_M = import_batch_parquet(batch_prefix='notebooks/twitter-news/M_')
engine.load_M(df_M=df_M, covariate_transformation='dot_product', cluster_M=True, quantize_M=True)

100%|██████████| 53300/53300 [10:15<00:00, 86.57it/s] 
  File "c:\Users\ardit\miniconda3\envs\SIMTAG\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")
100%|██████████| 53300/53300 [10:03<00:00, 88.35it/s]


In [6]:
# Encode every sample and build the search indexes from the encoded vectors.
# NOTE(review): the earlier comment mentioned saving samples_encoded.parquet,
# but nothing in this cell writes to disk.
sample_vectors = engine.encode_samples(sample_list) # already quantized
index_covariate = engine.compute_search_indexes(sample_vectors, k=20)

processing samples: 100%|██████████| 73137/73137 [00:57<00:00, 1277.37it/s]


In [8]:
# Run a covariate search for the query tags and display the first five results.
query_tag_list = ['crypto', 'nft']
indices, search_results = engine.covariate_search(
    index_covariate,
    sample_list,
    query_tag_list=query_tag_list,
    allow_new_tags=True,
    print_new_tags=True,
    k=10,
)
search_results[:5]

[['cryptogaming', 'cryptosport', 'cryptosportgaming', 'NftAnalysis'],
 ['nft', 'crypto'],
 ['nft', 'crypto'],
 ['CryptoNews', 'NFTs'],
 ['NFT', 'Crypto']]

In [26]:
# Free-text query through the semantic-covariate search; print the first ten
# results, one per line.
# Author's note: M_256 does not perform well on semantic-covariate encoding.
semantic_query = "I want to buy crypto and nft"
indices, search_results = engine.semantic_covariate_search(
    index_covariate,
    sample_list,
    query=semantic_query,
    k=10,
)
for result_tags in search_results[:10]:
    print(result_tags)

['CryptowithMC', 'Bitcoin', 'Crypto', 'CryptoNews', 'NFT']
['nft', 'crypto', 'bitcoin']
['NFT', 'Crypto']
['CryptoNews', 'NFTs']
['nft', 'crypto']
['NFT', 'crypto']
['nft', 'crypto']
['nft', 'crypto']
['cryptogaming', 'cryptosport', 'cryptosportgaming', 'NftReview']
['cryptogaming', 'cryptosport', 'cryptosportgaming', 'NftAnalysis']
