Dataset: https://www.kaggle.com/datasets/deeguy/twitter-news

In [1]:
import os
import warnings

# Move the working directory two levels up, to the root of this repo, so the
# relative 'notebooks/...' paths used below resolve. os.path.dirname is used
# instead of splitting on '\\' so this also works on non-Windows systems.
# Note: the move is relative to the current directory, so re-running this cell
# goes up another two levels.
os.chdir(os.path.dirname(os.path.dirname(os.getcwd())))
# Silence all warnings for the rest of the session.
warnings.simplefilter("ignore")

from simtag.filter import simtag_filter
import pandas as pd

def import_batch_parquet(batch_prefix, n_batches=5):
    """Load a DataFrame that was stored as numbered parquet batches.

    Reads ``{batch_prefix}0.parquet`` .. ``{batch_prefix}{n_batches - 1}.parquet``
    in order and concatenates them into one frame with a fresh 0..N-1 index.

    Parameters
    ----------
    batch_prefix : str
        Path prefix shared by all batch files (everything before the batch number).
    n_batches : int, default 5
        Number of batch files to read. Must match the number used when storing.

    Returns
    -------
    pandas.DataFrame
        The concatenation of all batches, in batch order.
    """
    frames = [
        pd.read_parquet(f'{batch_prefix}{batch_no}.parquet')
        for batch_no in range(n_batches)
    ]
    return pd.concat(frames, ignore_index=True)

def store_batch_parquet(df, batch_prefix, n_batches=5):
    """Write a DataFrame to disk as consecutive, numbered parquet batches.

    The rows are split into ``n_batches`` contiguous slices of
    ``len(df) // n_batches`` rows each; the last batch also receives the
    remainder. Files are written to ``{batch_prefix}{i}.parquet`` for
    ``i`` in ``0 .. n_batches - 1`` using the pyarrow engine.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to store.
    batch_prefix : str
        Path prefix shared by all batch files (everything before the batch number).
    n_batches : int, default 5
        Number of files to split the frame into.

    Raises
    ------
    ValueError
        If ``n_batches`` is smaller than 1.
    """
    if n_batches < 1:
        raise ValueError("n_batches must be at least 1")

    total_rows = len(df)
    chunk_size = total_rows // n_batches
    last_batch = n_batches - 1
    for batch_no in range(n_batches):
        start = batch_no * chunk_size
        # The final batch absorbs the rows left over by the integer division.
        end = total_rows if batch_no == last_batch else start + chunk_size
        df.iloc[start:end].to_parquet(f"{batch_prefix}{batch_no}.parquet", engine='pyarrow')




# setup the library

### import data

In [2]:
import pandas as pd
import ast

# Load the tweets from the CSV export and keep only the rows that have hashtags.
df = pd.read_csv('notebooks/twitter-news/news_tweets.csv', lineterminator='\n')
df = df.dropna(subset='hashtags')
df = df.reset_index()
# The 'hashtags' column is read as text; parse each cell back into a Python literal.
df['hashtags'] = df['hashtags'].apply(ast.literal_eval)
sample_list = df['hashtags'].tolist()

# Extract the sorted list of unique hashtags across all samples.
hashtags_list = df['hashtags'].dropna().tolist()
hashtags = sorted({tag for tags in hashtags_list for tag in tags})

### process data

In [3]:
# Build the simtag engine from the per-sample hashtag lists.
engine_config = dict(
    sample_list=sample_list,
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    quantization='int8',
)
engine = simtag_filter(**engine_config)

In [4]:
# One-off step: if M has not been computed yet, compute it and save it.
# M, df_M = engine.compute_M(method='encoding')
# It is stored in batches here; a single file works just as well.
# store_batch_parquet(df_M, 'notebooks/twitter-news/M_')

# Otherwise, load the previously stored batches of M.
df_M = import_batch_parquet(batch_prefix='notebooks/twitter-news/M_')

# dot_product

In [5]:
# Load M into the engine with the 'dot_product' covariate transformation
# and quantize_M disabled.
engine.load_M(
    df_M,
    covariate_transformation='dot_product',
    quantize_M=False,
)

In [6]:
# One-off step: if the encoded samples are not cached yet, compute and save them.
# NOTE(review): the store prefix here used to be 'quantized_samples_encoded_', which
# does not match the 'samples_encoded_' prefix loaded below; it is aligned with the
# loader here — confirm which set of files is the intended one.
# sample_vectors = engine.encode_samples(sample_list)
# samples_encoded = pd.DataFrame([sample_vectors], index=['vector']).T
# store_batch_parquet(samples_encoded, 'notebooks/twitter-news/samples_encoded_')

# Otherwise, load the cached batches and build the search index from their 'vector' column.
samples_encoded = import_batch_parquet(batch_prefix='notebooks/twitter-news/samples_encoded_')
sample_vectors = samples_encoded.vector.tolist()
index_covariate = engine.compute_search_indexes(sample_vectors, k=20)

### covariate search

In [32]:
# Query with a plain list of tags (no per-tag weights).
query_tag_list = ['politics', 'democracy', 'Trump']
indices, search_results = engine.covariate_search(
    index_covariate,
    sample_list,
    query_tag_list=query_tag_list,
    allow_new_tags=True,
    print_new_tags=True,
)
search_results

[['politics', 'trump', 'corruption'],
 ['Politics', 'Political', 'Trump', 'Biden', 'FBI'],
 ['Politics', 'Trump', 'News'],
 ['News', 'Raid', 'Biden', 'Trump', 'politics', 'government'],
 ['News', 'Politics', 'Government', 'Media', 'Trump', 'Biden'],
 ['politics', 'government', 'Biden', 'Trump', 'News', 'Media'],
 ['Trump', 'news', 'government', 'politics', 'media', 'Biden'],
 ['politics',
  'news',
  'economics',
  'republican',
  'covid',
  'government',
  'finance',
  'vote',
  'liberal',
  'democrats',
  'investment',
  'political',
  'Trump',
  'Biden'],
 ['politics',
  'news',
  'economics',
  'republican',
  'covid',
  'government',
  'finance',
  'vote',
  'liberal',
  'democrats',
  'investment',
  'political',
  'Trump',
  'Biden'],
 ['politics',
  'news',
  'economics',
  'republican',
  'covid',
  'government',
  'finance',
  'vote',
  'liberal',
  'democrats',
  'investment',
  'political',
  'Trump',
  'Biden'],
 ['politics',
  'news',
  'economics',
  'republican',
  'cov

### weighted covariate search

In [33]:
# Weighted query: each tag is paired with a numeric weight.
# 'republicans.' keeps its trailing dot on purpose; with print_new_tags=True the
# cell output shows which existing tag it was mapped to.
query_tag_dict = {'trump': 1, 'democracy': 0.4, 'republicans.': 2}
indices, search_results = engine.covariate_search(
    index_covariate,
    sample_list,
    query_tag_dict=query_tag_dict,
    allow_new_tags=True,
    print_new_tags=True,
)
search_results

republicans. -> REPUBLICANS


[['trump', 'gop', 'republicans'],
 ['Republicans'],
 ['Republicans'],
 ['MAGA', 'REPUBLICANS', 'GOP', 'TRUMP'],
 ['Republicans', 'Democrat'],
 ['republicans', 'Trump', 'Biden'],
 ['Republicans', 'GOP', 'FBI', 'Democratic'],
 ['Blacks', 'Democrats', 'Republicans', 'Independents', 'politics', 'news'],
 ['Trump', 'Conservatives', 'Republicans', 'news', 'media', 'America'],
 ['GOP', 'Republicans'],
 ['Republican'],
 ['Republican'],
 ['POTUS', 'Republicans', 'Trump', 'News'],
 ['POTUS', 'Republicans', 'Trump', 'News'],
 ['Republicans', 'impeach', 'Trump', 'LizCheney', 'POLITICO', 'election'],
 ['Republicans', 'Trump', 'GOP', 'news', 'FBILEAKS', 'VoteThemOut'],
 ['Politics',
  'news',
  'usa',
  'republicans',
  'democrats',
  'gop',
  'dnc',
  'insurrection',
  'sedition',
  'traitors'],
 ['politics',
  'news',
  'usa',
  'republicans',
  'democrats',
  'gop',
  'dnc',
  'insurrection',
  'sedition',
  'traitors'],
 ['trump', 'democrats', 'news'],
 ['DonaldJTrump', 'Republicans']]

# PCA

In [15]:
# Load M into the engine again, this time with the 'PCA' covariate transformation
# and quantize_M disabled.
engine.load_M(
    df_M,
    covariate_transformation='PCA',
    quantize_M=False,
)

In [19]:
# One-off step: if the PCA-encoded samples are not cached yet, compute and save them.
# sample_vectors = engine.encode_samples(sample_list)
# samples_encoded = pd.DataFrame([sample_vectors], index=['vector']).T
# store_batch_parquet(samples_encoded, 'notebooks/twitter-news/samples_encoded_PCA_')

# Otherwise, load the cached batches and rebuild the search index from their 'vector' column.
samples_encoded = import_batch_parquet(
    batch_prefix='notebooks/twitter-news/samples_encoded_PCA_'
)
sample_vectors = samples_encoded['vector'].tolist()
index_covariate = engine.compute_search_indexes(sample_vectors, k=20)

### covariate search

In [20]:
# Query with a plain list of tags (no per-tag weights), passing k=10 to the search.
query_tag_list = ['sports', 'nba']
indices, search_results = engine.covariate_search(
    index_covariate,
    sample_list,
    query_tag_list=query_tag_list,
    allow_new_tags=True,
    print_new_tags=True,
    k=10,
)
search_results

[['NBA', 'Basketball'],
 ['NBA', 'basketball'],
 ['NBA'],
 ['Sports'],
 ['Sports'],
 ['nba', 'news', 'sports'],
 ['nba', 'news', 'sports'],
 ['nba', 'news', 'sports'],
 ['nba', 'news', 'sports'],
 ['nba', 'news', 'sports'],
 ['nba', 'news', 'sports'],
 ['nba', 'news', 'sports'],
 ['nba', 'news', 'sports'],
 ['nba', 'news', 'sports'],
 ['nba', 'news', 'sports'],
 ['nba', 'news', 'sports'],
 ['nba', 'news', 'sports'],
 ['nba', 'news', 'sports'],
 ['nba', 'news', 'sports'],
 ['nba', 'news', 'sports']]

### weighted covariate search

In [21]:
# Weighted query: each tag is paired with a numeric weight.
# 'republicans.' keeps its trailing dot on purpose; with print_new_tags=True the
# cell output shows which existing tag it was mapped to.
query_tag_dict = {'trump': 1, 'democracy': 0.4, 'republicans.': 2}
indices, search_results = engine.covariate_search(
    index_covariate,
    sample_list,
    query_tag_dict=query_tag_dict,
    allow_new_tags=True,
    print_new_tags=True,
    k=10,
)
search_results

republicans. -> REPUBLICANS


[['trump', 'gop', 'republicans'],
 ['Republicans', 'Democrat'],
 ['republicans', 'Trump', 'Biden'],
 ['MAGA', 'REPUBLICANS', 'GOP', 'TRUMP'],
 ['GOP', 'Republicans'],
 ['Republicans'],
 ['Republicans'],
 ['POTUS', 'Republicans', 'Trump', 'News'],
 ['POTUS', 'Republicans', 'Trump', 'News'],
 ['trump', 'democrats', 'news'],
 ['DonaldJTrump', 'Republicans'],
 ['Republicans', 'GOP', 'FBI', 'Democratic'],
 ['Trump', 'TrumpTreason', 'TrumpRaid', 'Republicans'],
 ['Republicans', 'Trump', 'BREAKING', 'news'],
 ['FBI', 'Republicans'],
 ['Republicans', 'Trump', 'GOP', 'news', 'FBILEAKS', 'VoteThemOut'],
 ['Trump', 'Conservatives', 'Republicans', 'news', 'media', 'America'],
 ['Republicans', 'MAGAMORONS'],
 ['Republican',
  'RepublicansAreDestroyingAmerica',
  'Garland',
  'Trump',
  'NoOneIsAboveTheLaw'],
 ['CNN', 'Republicans']]

# semantic_search

In [24]:
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
import pandas as pd
# Enable tqdm's pandas integration (progress_apply, used by the commented-out code below).
tqdm.pandas()

# One-off step: encode every sample with the plain sentence model and cache the result.
# NOTE(review): as written, DataFrame([sample_list], columns=['sample']) looks like it
# would fail for more than one sample, and the next line reads column 0 rather than
# 'sample'; compare the `index=['vector']` + `.T` pattern used for samples_encoded —
# confirm before re-enabling.
# df_samples = pd.DataFrame([sample_list], columns=['sample']).T
# df_samples['vector'] = df_samples[0].progress_apply(lambda x : engine.model.encode(x)[0])
# df_samples.to_parquet('notebooks/twitter-news/samples_regular_encoded.parquet', index=None)

In [25]:
from sentence_transformers.quantization import quantize_embeddings
from sklearn.neighbors import NearestNeighbors

# df_samples = pd.read_parquet('notebooks/twitter-news/samples_regular_encoded.parquet')
# quantized_samples = quantize_embeddings(df_samples['vector'].values.tolist(), precision="int8")

# nbrs_semantic = NearestNeighbors(n_neighbors=10, metric='cosine').fit(df_samples['vector'].values.tolist())

### regular semantic search
We use a regular sentence as an input

In [84]:
# distances, indices = nbrs_semantic.kneighbors([engine.model.encode("What are the news from the white house?")])
# indices = indices[0].tolist()
# [sample_list[x] for x in indices][0:10]

[['WhiteHouse'],
 ['news', 'headlines'],
 ['news', 'celebrity'],
 ['News', 'Greece'],
 ['news', 'headlines'],
 ['News', 'Greece'],
 ['News', 'Greece'],
 ['News', 'Television'],
 ['news', 'stocks'],
 ['news', 'football']]

### covariate-semantic search

We again use a regular sentence as the query, but this time the search runs over the covariate-encoded vectors.

In [8]:
# Free-text query against the covariate index; show the first five matches.
# (Result variable renamed from the misspelled `indicies` to `indices`, matching the other search cells.)
indices, search_results = engine.semantic_covariate_search(
    index_covariate,
    sample_list,
    query="What are the news from the whitehouse?",
)
search_results[0:5]

[['POTUS', 'WhiteHouse', 'News'],
 ['FoxNews', 'news'],
 ['politicsnews', 'politicalnews', 'news'],
 ['WhiteHouse'],
 ['Breitbart', 'News', 'BreitbartNews']]

### semantic search with tag query

We convert the query_tag_list into a string and run a regular semantic search on it. As we can see, the results contain much more noise than those of the covariate search.

In [42]:
# query_tag_list = [ 'news', 'democrats', 'republicans' ]

# # perform search
# query_vector = engine.model.encode(str(query_tag_list))
# indices, search_results = engine.soft_tag_filtering(nbrs_semantic, sample_list, query_vector)
# search_results[0:4]

[['democrats', 'MAGA', 'maga', 'midterm', 'BLM'],
 ['Democrats', 'crime', 'police', 'Hollywood'],
 ['democrats',
  'inflation',
  'reelection',
  'Republicans',
  'recession',
  'news',
  'newsflash',
  'latestupdates'],
 ['Democrats',
  'VoteBlue',
  'VoteBlueIn2022',
  'VoteBlueToProtectOurRights',
  'VoteBlueOrWeAreScrewed']]