Data source (Kaggle): https://www.kaggle.com/datasets/deeguy/twitter-news

In [1]:
import os
from pathlib import Path

# Move the working directory to the root of this repo so that the relative
# 'notebooks/twitter-news/...' data paths used by later cells resolve.
# pathlib replaces the manual split/join on '\\', which only works on Windows:
# on POSIX the split never matches, the joined path is '' and os.chdir('') raises.
# The guard keeps the cell idempotent, so re-running it does not climb two
# more directories each time.
# NOTE(review): assumes the kernel starts two levels below the repo root
# (e.g. <repo>/notebooks/twitter-news) — confirm.
# This has to run before the simtag import below, presumably so the package
# is resolved from the repo root.
if not Path('notebooks/twitter-news').is_dir():
    os.chdir(Path.cwd().parents[1])

import pandas as pd
import numpy as np
from simtag.filter import simtag_filter

  from tqdm.autonotebook import tqdm, trange


# set up the library

### import data

In [2]:
import pandas as pd
import ast

# tweets
df = pd.read_csv('notebooks/twitter-news/news_tweets.csv', lineterminator='\n')
# keep only tweets that carry hashtags
df = df.dropna(subset=['hashtags'])
# reset_index() keeps the old row labels as an 'index' column
df = df.reset_index()
# hashtags are read as strings; literal_eval turns each one into the Python
# literal it spells (presumably a list of tag strings)
df['hashtags'] = df['hashtags'].apply(ast.literal_eval)
sample_list = df['hashtags'].tolist()

# extract hashtags
# rows without hashtags were dropped above, so no second dropna is needed here
hashtags_list = list(sample_list)
# flatten the per-tweet lists, de-duplicate with a set, and sort for a stable order
hashtags = sorted({tag for tags in hashtags_list for tag in tags})

### process data

In [3]:
# create the simtag engine from the per-tweet tag lists and the sorted unique tags
engine = simtag_filter(
    sample_list=sample_list,
    tag_list=hashtags,
)

In [None]:
# Run this cell only if the tag vectors have not been encoded yet
# (i.e. there is no M.parquet to load in the next cell).
# NOTE(review): the snippet below writes 'M.parquet' relative to the working
# directory, while the loading cell reads 'notebooks/twitter-news/M.parquet' —
# move the file or adjust the path after generating it.
# import pandas as pd
# from tqdm import tqdm
# from sentence_transformers import SentenceTransformer
# tqdm.pandas()

# model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu') #all-MiniLM-L6-v2 #all-mpnet-base-v2

# #creates a new column with the encoded text
# df_tags = pd.DataFrame(hashtags)
# df_tags.columns = ['tags']
# df_tags['vector_tags'] = df_tags['tags'].progress_apply(lambda x : model.encode(x).tolist())
# df_tags.to_parquet('M.parquet', index=None)

In [4]:
# load the previously saved M.parquet (if it exists) and hand it to the engine
M_PATH = 'notebooks/twitter-news/M.parquet'
df_M = pd.read_parquet(M_PATH)
engine.load_M(df_M)
engine.apply_PCA(
    pca_vector_length=384,
)

In [5]:
# prepare search
# sample_vectors = engine.encode_samples(sample_list)
# load the cached sample vectors instead of re-encoding (see the commented line above)
sample_vectors = pd.read_parquet('notebooks/twitter-news/df_encoded.parquet')['vector'].tolist()
# The neighbor structure is built over sample_vectors, while search results are
# later looked up in sample_list / df by position, so the two must line up
# one-to-one. Fail loudly if the cached parquet is stale.
# NOTE(review): assumes soft_tag_filtering indexes sample_list by neighbor position — confirm.
assert len(sample_vectors) == len(sample_list), (
    f'df_encoded.parquet holds {len(sample_vectors)} vectors '
    f'but sample_list has {len(sample_list)} entries'
)
nbrs = engine.compute_nbrs(sample_vectors, k=5)

# semantic tag search

### naive

In [6]:
# tags to search for (unweighted)
query_tag_list = ['trump', 'News', 'democracy']

# encode the tag list into a query vector, then run the soft (semantic) filter
query_vector = engine.encode_query(list_tags=query_tag_list)
indices, search_results = engine.soft_tag_filtering(
    nbrs,
    sample_list,
    query_vector,
)
search_results

[['news', 'Trump'],
 ['trump', 'news'],
 ['trump', 'news'],
 ['Politics', 'Trump', 'News'],
 ['trump', 'democrats', 'news']]

### weighted

In [7]:
# tag -> weight mapping for the weighted query
query_tag_dict = {'trump': 0.3, 'democracy': 0.7, 'putin': 0.5}

# encode the weighted tags into a query vector, then run the soft (semantic) filter
query_vector = engine.encode_query(dict_tags=query_tag_dict)
indices, search_results = engine.soft_tag_filtering(
    nbrs,
    sample_list,
    query_vector,
)
search_results[:5]

[['Democracy'], ['Democracy'], ['Putin'], ['putin'], ['Democratic']]

# validation

In [8]:
query_tag_list = ['democracy', 'trump']
# position of the hit (within each result list) that gets scored below
result_index = 0

# semantic search: soft filtering over the neighbor structure
query_vector = engine.encode_query(list_tags=query_tag_list)
soft_indices, soft_filter_results = engine.soft_tag_filtering(nbrs, sample_list, query_vector)
soft_raw_scores, soft_mean_scores = engine.compute_neighbor_scores(
    soft_filter_results[result_index],
    query_tag_list,
    remove_max=False,
)

# traditional search: hard filtering directly on the tag lists
hard_indices, hard_filter_results = engine.hard_tag_filtering(sample_list, query_tag_list)
hard_raw_scores, hard_mean_scores = engine.compute_neighbor_scores(
    hard_filter_results[result_index],
    query_tag_list,
    remove_max=False,
)

### lookup data

In [9]:
# semantic tag filtering
# Show the tweet rows at the positions returned by soft_tag_filtering.
# sample_list was built from df['hashtags'] in row order, so — assuming the
# returned indices are positions into sample_list (TODO confirm) — .iloc
# selects the matching tweets.
df.iloc[soft_indices]

Unnamed: 0,index,content,date,hashtags,likeCount,quoteCount,replyCount,retweetCount,retweetedTweet,sourceLabel,username,country,country_cd,quoted_content
32270,156660,The World Bank are opressing Fox News viewers ...,2022-08-13T22:16:28+00:00,[Trump],0,0,0,0,,ArseniKarp,ArseniKarp,,,
70021,354702,Hannity you owe President Obama an apology for...,2022-08-13T04:10:15+00:00,[Trump],1,0,0,1,,Twitter for iPhone,WheresPercy,,,
32020,154630,Una situazione complessa e inedita BBC News - ...,2022-08-13T22:29:54+00:00,[Trump],3,0,0,1,,Twitter for Android,paoloigna1,,,
36704,186879,@newsmax @foxnews\n\nThe real news.\n\nNO ONE ...,2022-08-13T19:12:59+00:00,[TRUMP],0,0,0,0,,Twitter for Android,RRobocaller,,,The National Archives maintains millions of un...
45375,240066,This is the @MerriamWebster definition of 'sta...,2022-08-13T15:01:50+00:00,[Trump],0,0,0,0,,Twitter Web App,RonnieLouise2,,,


In [10]:
# traditional tag filtering
# Show the tweet rows at the positions returned by hard_tag_filtering.
# sample_list was built from df['hashtags'] in row order, so — assuming the
# returned indices are positions into sample_list (TODO confirm) — .iloc
# selects the matching tweets.
df.iloc[hard_indices]

Unnamed: 0,index,content,date,hashtags,likeCount,quoteCount,replyCount,retweetCount,retweetedTweet,sourceLabel,username,country,country_cd,quoted_content
1,2,@davidmweissman @RonFilipkowski I just hope th...,2022-08-14T15:51:17+00:00,"[trump, truth, RuleOfLaw, democracy]",0,0,0,0,,Twitter for iPhone,Deirdre62823246,,,


### visualize flattened results

In [11]:
# keyword arguments shared by the soft and hard 'mean' visualizations
mean_view_kwargs = dict(
    visualization_type='mean',
    power=0.4,
    title=f'{query_tag_list}',
    visualize=True,
    return_html=False,
)
# soft (semantic) result first, then the hard (traditional) result
engine.show_results(query_tag_list, soft_raw_scores, soft_filter_results[result_index], **mean_view_kwargs)
engine.show_results(query_tag_list, hard_raw_scores, hard_filter_results[result_index], **mean_view_kwargs)

### visualize granular results

In [13]:
# keyword arguments shared by the soft and hard 'raw' visualizations
raw_view_kwargs = dict(
    visualization_type='raw',
    power=0.4,
    title=f'{query_tag_list}',
    visualize=True,
    return_html=False,
)
# soft (semantic) result, a blank separator line, then the hard (traditional) result
engine.show_results(query_tag_list, soft_raw_scores, soft_filter_results[result_index], **raw_view_kwargs)
print()
engine.show_results(query_tag_list, hard_raw_scores, hard_filter_results[result_index], **raw_view_kwargs)


