Data source: https://www.kaggle.com/datasets/deeguy/twitter-news

In [1]:
import os
import warnings

# Move the working directory two levels up, to the root of this repo, so that
# the relative 'notebooks/...' paths used further down resolve.
# os.path.dirname is used instead of splitting the path on '\\': the split-based
# form only works with Windows separators (on POSIX it produces '' and
# os.chdir('') raises).
# NOTE: not idempotent -- every run moves two levels up from the current
# directory, so restart the kernel before re-running this cell.
os.chdir(os.path.dirname(os.path.dirname(os.getcwd())))
# Silence every warning for the rest of the session.
warnings.simplefilter("ignore")

from simtag.filter import simtag_filter
import pandas as pd

def import_batch_parquet(batch_prefix, n_batches=5):
	"""Load a DataFrame that is stored on disk as numbered parquet batches.

	Reads ``{batch_prefix}0.parquet`` up to ``{batch_prefix}{n_batches - 1}.parquet``
	and concatenates them in that order, discarding the per-file indexes in
	favour of a fresh 0..N-1 index.

	Args:
		batch_prefix: Path prefix shared by all batch files.
		n_batches: Number of batch files to read. Defaults to 5, the batch
			count used throughout this notebook.

	Returns:
		pandas.DataFrame: the rows of every batch, in batch order.

	Raises:
		ValueError: from ``pd.concat`` when ``n_batches`` is 0 (nothing to
			concatenate).
	"""
	batch_frames = [pd.read_parquet(f'{batch_prefix}{i}.parquet') for i in range(n_batches)]
	return pd.concat(batch_frames, ignore_index=True)

def store_batch_parquet(df, batch_prefix, n_batches=5):
	"""Write ``df`` to disk as ``n_batches`` numbered parquet files.

	The rows are cut into consecutive slices of ``len(df) // n_batches`` rows
	and written to ``{batch_prefix}0.parquet``, ``{batch_prefix}1.parquet``, ...
	The last file additionally receives the remainder rows, so no row is lost.
	When ``df`` has fewer rows than ``n_batches`` the slice size is 0: every
	file but the last is written empty and the last one holds all rows.

	Args:
		df: DataFrame to store.
		batch_prefix: Path prefix shared by all batch files.
		n_batches: Number of files to write. Defaults to 5, the batch count
			used throughout this notebook.

	Raises:
		ValueError: if ``n_batches`` is smaller than 1.
	"""
	if n_batches < 1:
		raise ValueError('n_batches must be at least 1')
	chunk_size = len(df) // n_batches
	last_batch = n_batches - 1
	for i in range(n_batches):
		start = i * chunk_size
		# the final batch runs to the end of the frame to absorb the remainder
		end = start + chunk_size if i < last_batch else len(df)
		df.iloc[start:end].to_parquet(f"{batch_prefix}{i}.parquet", engine='pyarrow')

# set up the library

### import data

In [2]:
import pandas as pd
import ast

# list of tweets, read from the CSV file of the dataset
df = pd.read_csv('notebooks/twitter-news/news_tweets.csv', lineterminator='\n')
# keep only the tweets that have hashtags; reset_index() keeps the old row
# labels in an 'index' column and renumbers the rows 0..N-1, so that row
# positions in df line up with positions in sample_list
df = df.dropna(subset=['hashtags'])
df = df.reset_index()
# the hashtags column is read as text; literal_eval parses each cell back
# into a Python object (the cells are iterated as lists of tags below)
df['hashtags'] = df['hashtags'].apply(ast.literal_eval)
sample_list = df['hashtags'].tolist()

# extract hashtags: sorted list of the distinct tags over all tweets
hashtags_list = df['hashtags'].dropna().tolist()
hashtags = sorted({tag for tags in hashtags_list for tag in tags})

### process data

In [3]:
# initiate recommender: the engine is built over the hashtag list of every tweet
# NOTE(review): 384 is presumably the embedding size of the MiniLM model named
# below -- confirm before swapping in another model.
engine_settings = {
    'sample_list': sample_list,
    'covariate_vector_length': 384,
    'model_name': 'sentence-transformers/all-MiniLM-L6-v2',
}
engine = simtag_filter(**engine_settings)

In [4]:
# M is cached on disk as five parquet batches (M_0.parquet ... M_4.parquet).
# It is computed and stored only when one of those files is missing, so the
# cell runs on a fresh checkout without editing commented-out code.
M_BATCH_PREFIX = 'notebooks/twitter-news/M_'
if not all(os.path.exists(f'{M_BATCH_PREFIX}{i}.parquet') for i in range(5)):
    M, df_M = engine.compute_M(method='encoding')
    # store as batches, you can store as a unique file if you wish
    store_batch_parquet(df_M, M_BATCH_PREFIX)

# always load from disk, so load_M receives the same parquet round-tripped
# frame whether or not M had to be computed in this run
df_M = import_batch_parquet(batch_prefix=M_BATCH_PREFIX)

engine.load_M(df_M)

In [5]:
# The encoded samples are cached on disk as five parquet batches
# (samples_encoded_0.parquet ... samples_encoded_4.parquet). They are encoded
# and stored only when one of those files is missing.
SAMPLES_BATCH_PREFIX = 'notebooks/twitter-news/samples_encoded_'
if not all(os.path.exists(f'{SAMPLES_BATCH_PREFIX}{i}.parquet') for i in range(5)):
    samples_vectors = engine.encode_samples(sample_list)
    # one row per sample, single 'vector' column holding the first element
    # of each encode_samples entry
    samples_encoded = pd.DataFrame([[x[0] for x in samples_vectors]], index=['vector']).T
    store_batch_parquet(samples_encoded, SAMPLES_BATCH_PREFIX)

# always load from disk, so the vectors used below went through the same
# parquet round trip whether or not they were encoded in this run
samples_encoded = import_batch_parquet(batch_prefix=SAMPLES_BATCH_PREFIX)
sample_vectors = samples_encoded['vector'].tolist()
nbrs = engine.compute_nbrs(sample_vectors, k=4)

# semantic tag search

### naive

In [6]:
# unweighted query: a plain list of tags
query_tag_list = ['trump', 'News', 'democracy']

# perform search: encode the tags into a query vector, then run the soft
# (semantic) tag filter against the encoded samples
query_vector = engine.encode_query(
    list_tags=query_tag_list,
    allow_new_tags=True,
)
indices, search_results = engine.soft_tag_filtering(nbrs, sample_list, query_vector)
search_results

[['news', 'Trump'],
 ['trump', 'news'],
 ['trump', 'news'],
 ['Politics', 'Trump', 'News']]

### weighted

In [16]:
# weighted query: tag -> number (presumably the weight of the tag in the query)
# 'democrats.' is written with its trailing period on purpose: with
# print_new_tags=True the cell prints how that tag gets resolved.
query_tag_dict = {'trump': 1, 'democracy': 1, 'democrats.': 1}

# perform search
query_vector = engine.encode_query(
    dict_tags=query_tag_dict,
    allow_new_tags=True,
    print_new_tags=True,
)
indices, search_results = engine.soft_tag_filtering(nbrs, sample_list, query_vector)
search_results[:5]

democrats. -> Democrats


[['trump', 'democrats', 'news'],
 ['democrats'],
 ['Democrats'],
 ['DemocracyNotAutocracy', 'Democrat']]

In [8]:
# weighted query with unequal numbers per tag (presumably weights: 'republican'
# highest, 'democracy' lowest)
query_tag_dict = {'trump': 1, 'democracy': 0.4, 'republican': 2}

# perform search
query_vector = engine.encode_query(dict_tags=query_tag_dict)
indices, search_results = engine.soft_tag_filtering(
    nbrs,
    sample_list,
    query_vector,
)
search_results[:5]

[['Republicans', 'Democrat'],
 ['trump', 'gop', 'republicans'],
 ['republicans', 'Trump', 'Biden'],
 ['conservative', 'conservativenews', 'trump', 'republican']]

# validation

In [9]:
query_tag_list = ['democracy', 'trump']
# position, within each result list, of the result that gets scored
result_index = 0

# semantic search
query_vector = engine.encode_query(list_tags=query_tag_list)
soft_indices, soft_filter_results = engine.soft_tag_filtering(nbrs, sample_list, query_vector)
soft_selected = soft_filter_results[result_index]
soft_raw_scores, soft_mean_scores = engine.compute_neighbor_scores(
    soft_selected,
    query_tag_list,
    remove_max=False,
)

# traditional search
hard_indices, hard_filter_results = engine.hard_tag_filtering(sample_list, query_tag_list)
hard_selected = hard_filter_results[result_index]
hard_raw_scores, hard_mean_scores = engine.compute_neighbor_scores(
    hard_selected,
    query_tag_list,
    remove_max=False,
)

### lookup data

In [10]:
# semantic tag filtering: show the tweet rows at the positions returned by the
# soft filter. df was renumbered 0..N-1 before sample_list was built from it,
# so positions in sample_list and row positions in df refer to the same tweets
# (assumes soft_indices are positions into sample_list -- TODO confirm).
df.iloc[soft_indices]

Unnamed: 0,index,content,date,hashtags,likeCount,quoteCount,replyCount,retweetCount,retweetedTweet,sourceLabel,username,country,country_cd,quoted_content
32270,156660,The World Bank are opressing Fox News viewers ...,2022-08-13T22:16:28+00:00,[Trump],0,0,0,0,,ArseniKarp,ArseniKarp,,,
70021,354702,Hannity you owe President Obama an apology for...,2022-08-13T04:10:15+00:00,[Trump],1,0,0,1,,Twitter for iPhone,WheresPercy,,,
32020,154630,Una situazione complessa e inedita BBC News - ...,2022-08-13T22:29:54+00:00,[Trump],3,0,0,1,,Twitter for Android,paoloigna1,,,
36704,186879,@newsmax @foxnews\n\nThe real news.\n\nNO ONE ...,2022-08-13T19:12:59+00:00,[TRUMP],0,0,0,0,,Twitter for Android,RRobocaller,,,The National Archives maintains millions of un...


In [15]:
# traditional tag filtering: show the tweet rows at the positions returned by
# the hard filter. df was renumbered 0..N-1 before sample_list was built from
# it, so positions in sample_list and row positions in df refer to the same
# tweets (assumes hard_indices are positions into sample_list -- TODO confirm).
df.iloc[hard_indices]

Unnamed: 0,index,content,date,hashtags,likeCount,quoteCount,replyCount,retweetCount,retweetedTweet,sourceLabel,username,country,country_cd,quoted_content
1,2,@davidmweissman @RonFilipkowski I just hope th...,2022-08-14T15:51:17+00:00,"[trump, truth, RuleOfLaw, democracy]",0,0,0,0,,Twitter for iPhone,Deirdre62823246,,,


### visualize flattened results

In [12]:
# Same display options for both renderings, so the semantic result (first)
# and the traditional result (second) can be compared directly.
mean_view_opts = dict(
    visualization_type='mean',
    power=0.4,
    title=f'{query_tag_list}',
    visualize=True,
    return_html=False,
)
engine.show_results(query_tag_list, soft_raw_scores, soft_filter_results[result_index], **mean_view_opts)
engine.show_results(query_tag_list, hard_raw_scores, hard_filter_results[result_index], **mean_view_opts)

### visualize granular results

In [14]:
# Same display options for both renderings, so the semantic result (first)
# and the traditional result (second) can be compared directly.
raw_view_opts = dict(
    visualization_type='raw',
    power=0.4,
    title=f'{query_tag_list}',
    visualize=True,
    return_html=False,
)
engine.show_results(query_tag_list, soft_raw_scores, soft_filter_results[result_index], **raw_view_opts)
# blank line between the two renderings
print()
engine.show_results(query_tag_list, hard_raw_scores, hard_filter_results[result_index], **raw_view_opts)


