In [1]:
import os
import warnings
import json
import numpy as np
import os

# move directory to the root of this repo (two levels above this notebook's folder).
# os.path splits on the platform's own separator, so this also works outside Windows,
# where splitting the path on '\\' yields an empty string and os.chdir('') raises.
# The move is skipped when the data folder used by the cells below is already
# reachable from the current directory, so re-running this cell (or starting the
# kernel from the repo root) does not climb two more levels.
if not os.path.isdir(os.path.join('notebooks', 'twitter-news')):
    os.chdir(os.path.dirname(os.path.dirname(os.getcwd())))
warnings.simplefilter("ignore")

from simtag.filter import simtag_filter
import pandas as pd




In [2]:
import pandas as pd
import ast

# load the tweets table from CSV, keep only the rows that have hashtags,
# and renumber the remaining rows
df = pd.read_csv('notebooks/twitter-news/news_tweets.csv', lineterminator='\n')
df = df.dropna(subset='hashtags').reset_index()
# each hashtags cell is stored as a Python literal in string form; parse it back
df['hashtags'] = df['hashtags'].apply(ast.literal_eval)
# one entry per tweet, in row order
sample_list = df['hashtags'].tolist()

# extract hashtags: every distinct tag across all samples, in sorted order
hashtags_list = list(df['hashtags'].dropna())
# a set comprehension de-duplicates while flattening, instead of running a
# nested list comprehension only for its append() side effects
hashtags = sorted({tag for tags in hashtags_list for tag in tags})

In [None]:
# initiate recommender
# the model name and quantization mode are named up front so they are easy to spot and tune
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
QUANTIZATION = 'int8'
engine = simtag_filter(sample_list=sample_list, model_name=MODEL_NAME, quantization=QUANTIZATION)

In [4]:
# One-off pre-computation, left commented out so that this notebook only loads
# the saved results. Uncomment the compute and store calls below to rebuild them.
# NOTE: valid_tags is only produced by the commented-out computation; the load
# path at the bottom of this cell does not define it.
# M, valid_tags, pointers = engine.compute_optimal_M(verbose=True, percentile_threshold=95, n_clusters=1000, quantize_M=True)

# store pre-computed files
# engine.npy_save(M, 'notebooks/twitter-news/M_quantized')
# engine.json_save(pointers, 'notebooks/twitter-news/pointers')

# load pre-computed files
# (paths are relative to the repo root, which the first cell makes the working directory)
M = engine.npy_load('notebooks/twitter-news/M_quantized')
pointers = engine.json_load('notebooks/twitter-news/pointers')

In [5]:
# hand the loaded M and pointers to the engine, requesting the 'dot_product'
# covariate transformation (the meaning of that option is defined by simtag — TODO confirm)
engine.load_M(M, pointers, covariate_transformation='dot_product')

In [6]:
# prepare search
# encode every sample once, then build the search index from those vectors;
# index_covariate is what both search cells below receive.
# k=10 is the same value the searches below pass; what k controls is defined
# by simtag — TODO confirm.
# The recorded run of this cell shows a progress bar over 73137 samples taking about 1m17s.
sample_vectors = engine.encode_samples(sample_list)
index_covariate = engine.compute_search_indexes(sample_vectors, k=10)

processing samples: 100%|██████████| 73137/73137 [01:17<00:00, 946.67it/s] 


In [7]:
# search by tags, then print the first ten results, one per line
query_tag_list = ['crypto', 'nft']
indices, search_results = engine.covariate_search(
    index_covariate,
    sample_list,
    query_tag_list=query_tag_list,
    allow_new_tags=True,
    print_new_tags=True,
    k=10,
)
for hit in search_results[:10]:
    print(hit)

['nft', 'crypto']
['cryptogaming', 'cryptosport', 'cryptosportgaming', 'NftAnalysis']
['cryptogaming', 'cryptosport', 'cryptosportgaming', 'NftReview']
['cryptogaming', 'cryptosport', 'cryptosportgaming', 'BestNft']
['nft', 'crypto']
['nft', 'crypto']
['NFT', 'crypto']
['NFT', 'Crypto']
['CryptoNews', 'NFTs']
['Crypto', 'CryptoLegions', 'cryptocurrency', 'ETH', 'NFTProject', 'NFTCommumity']


In [8]:
# M_256 does not perform well on semantic-covariate encoding
# search with a free-text query, then print the first ten results, one per line
indices, search_results = engine.semantic_covariate_search(
    index_covariate,
    sample_list,
    query="I want to buy crypto and nft",
    k=10,
)
for hit in search_results[:10]:
    print(hit)

['cryptogaming', 'cryptosport', 'cryptosportgaming', 'BestNft']
['nft', 'crypto']
['nft', 'crypto']
['cryptogaming', 'cryptosport', 'cryptosportgaming', 'NftReview']
['NFT', 'Crypto']
['NFT', 'crypto']
['cryptogaming', 'cryptosport', 'cryptosportgaming', 'NftAnalysis']
['nft', 'crypto']
['NFT', 'OpenSea', 'Cryptocurrency']
['Ethereum', 'Ethereum', 'Cryptocurency', 'NFTs', 'trading']
