In [1]:
import pandas as pd
from pathlib import Path

import json

# Root directory of the Twitter dataset; every path below is resolved against it.
TWITTER_BASE = Path('/data/blockchain-interoperability/blockchain-social-media/twitter-data/')


def _load_json(path):
    """Return the parsed contents of the JSON file at ``path``.

    The file is opened in a ``with`` block so its handle is closed as soon as
    parsing finishes, instead of being left open until garbage collection.
    """
    with open(path) as handle:
        return json.load(handle)


# NOTE(review): unpickling can execute arbitrary code -- only load snapshots
# that come from a trusted source.
ids = pd.read_pickle(TWITTER_BASE/'snapshots/id.pkl')
# timestamps = pd.to_datetime(pd.read_pickle(TWITTER_BASE/'snapshots/timestamp_ms.pkl'),unit='ms')
whole_text = pd.read_pickle(TWITTER_BASE/'snapshots/whole_text.pkl')

# Put the two snapshots side by side; later cells address the resulting
# columns as 'id' and 'whole_text'.
df = pd.concat([ids,whole_text],axis=1)

kmeans_clusters = _load_json(TWITTER_BASE/'kmeans_clusters/kmeans_init_clusters.json')
# Keys are the 0-based positions of the files in sorted path order, not
# anything parsed from the file names.
# NOTE(review): presumably each file holds the sampled tweet ids of one cluster
# and sorted order matches the cluster numbering -- confirm against the
# resampling step.
kmeans_clusters_sampled = {
    i: _load_json(file)
    for i, file in enumerate(sorted((TWITTER_BASE/'kmeans_clusters_resampled').glob('*.json')))
}



In [70]:
# Show how many members the initial k-means run put into each cluster.
for label in kmeans_clusters:
    print(label, len(kmeans_clusters[label]))

0 5478862
1 1118990
2 2543961
3 160048
4 5075115
5 596521


In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer,ENGLISH_STOP_WORDS
import numpy as np

# scikit-learn's English stop words plus URL scheme tokens.
custom_stop_words = [*ENGLISH_STOP_WORDS, 'http', 'https']

import re

# Substrings that mark a tweet as spam. They are compared against lower-cased
# tweet text, so each snippet is lower-cased as well.
spam_snippet = [
    'uniswap is being exploited',
    '200k',
    '@nftssaga'
]
# The joined string is used as a regular expression by `str.contains`, so every
# snippet is escaped to guarantee it is matched literally; only the `|`
# separators act as regex syntax.
spam_filter = '|'.join(re.escape(s.lower()) for s in spam_snippet)

def get_cluster(cluster_id = 0):
    """Return the rows of ``df`` whose 'id' is listed for ``cluster_id``.

    The id list is looked up in ``kmeans_clusters_sampled``; ``cluster_id``
    defaults to 0.
    """
    sampled_ids = kmeans_clusters_sampled[cluster_id]
    in_cluster = df['id'].isin(sampled_ids)
    return df[in_cluster]

def get_topics(cluster_id = 0):
    """Rank the vocabulary of one resampled cluster by mean TF-IDF weight.

    Tweets whose lower-cased text matches ``spam_filter`` are removed, a
    ``TfidfVectorizer`` (at most 1000 features, ``custom_stop_words`` excluded)
    is fitted on the remaining text, and every term is scored by its mean
    TF-IDF weight across those tweets. The row counts before and after
    filtering are printed.

    Parameters
    ----------
    cluster_id
        Key into ``kmeans_clusters_sampled`` (passed to ``get_cluster``).
        Defaults to 0.

    Returns
    -------
    numpy.ndarray
        The fitted vocabulary, ordered from highest to lowest mean score.
    """
    cluster_df = get_cluster(cluster_id)
    print(f'{len(cluster_df)} poitns in cluster')
    # Discard rows without text first: a missing value would turn the match
    # mask into a non-boolean Series (breaking `~`) and is not a valid
    # document for the vectorizer.
    texts = cluster_df['whole_text'].dropna()
    is_spam = texts.str.lower().str.contains(spam_filter, na=False)
    filtered_text = texts[~is_spam].values
    print(f'{len(filtered_text)} after filter')
    tfidf = TfidfVectorizer(
        stop_words=custom_stop_words,
        max_features=1000,
    )
    tfidf_vecs = tfidf.fit_transform(filtered_text)
    tfidf_vocab = np.array(tfidf.get_feature_names_out())
    # Column-wise mean, flattened to 1-D. np.asarray works whether the sparse
    # mean comes back as an np.matrix or a plain ndarray.
    vocab_scores = np.asarray(tfidf_vecs.mean(axis=0)).ravel()
    # argsort is ascending, so reverse it to list the highest score first.
    return tfidf_vocab[vocab_scores.argsort()[::-1]]




In [68]:
# For cluster ids 0-5, print a header followed by the 100 top-ranked terms.
for i in range(6):
    print(f'CLUSTER {i}')
    print('---------------------')
    # get_topics also prints the cluster's row counts before and after the
    # spam filter, and returns the vocabulary ordered by descending mean score.
    topics = get_topics(i)
    print(topics[:100])

CLUSTER 0
---------------------
1000000 poitns in cluster
698730 after filter
['pump' 'just' 'signal' 'happen' 'event' 'crypto' 'wallstreetbets'
 'binance' 'kucoin' 'big' 'announced' 'massive' 'hours' 'whales' 'group'
 'project' 'owners' 'really' 'token' 'security' 'airdrop' 'days' 'planned'
 'join' 'awesome' 'check' 'telegram' 'blockchain' 'week' 'place' 'say'
 'amp' 'nft' 'decided' 'like' 'twitter' 'going' 'money' 'time' 'best'
 'bitcoin' 'don' 'roll' 'people' 'live' 'bsc' 'community' 'lol' 'ftx'
 'world' 'bnb' 'eth' 'good' 'bridge' 'market' 'follow' 'got' 'coming'
 'team' 'finally' 'organized' 'paid' 'make' 'btc' 'new' 'cz_binance' 'ive'
 'wallet' 'profited' 'ass' 'mentorship' 'know' 'hi' 'free' 'need'
 'uniswap' 'rent' 'months' 'great' 'moon' 'collective' 'thanks' 'program'
 'elonmusk' 'holding' 'trade' 'future' 'sea' 'declared' 'soon' 'solid'
 'exchange' 'love' '12h' 'definitely' 'maybe' 'couple' 'pumpers' 'channel'
 'status']
CLUSTER 1
---------------------
1000000 poitns in clus

KuCoin pump


In [69]:
len(df)

14973497

In [66]:
cluster_id = 5

# Rows of df that belong to the chosen resampled cluster.
cluster_df = df[df['id'].isin(kmeans_clusters_sampled[cluster_id])]

# Distinct tweet texts in this cluster that mention 'nftssaga'.
mentions_nftssaga = cluster_df['whole_text'].str.contains('nftssaga')
set(cluster_df.loc[mentions_nftssaga, 'whole_text'].tolist())

{'Promote it on @nftssaga',
 'Promote it on @nftssaga in',
 'Promote it on @nftssagaL',
 'Promote it on @nftssagaP',
 'Promote it on @nftssagal',
 'Promote it on @nftssagap'}

In [4]:
# Print, per cluster id, the first 100 entries of the second element of each
# value in `topics`.
# NOTE(review): this requires `topics` to be a mapping of cluster id -> 2-item
# sequence. The per-cluster loop earlier in this notebook leaves `topics` bound
# to the array returned by get_topics, which has no `.items()`. Presumably this
# cell ran against a `topics` dict built in another kernel session -- confirm
# where that dict comes from before re-running.
for cluster_id,(_,by_score) in topics.items():
    print(cluster_id, by_score[:100])

0 ['free', 'youre', 'great', 'event', 'exchange', 'ftx', 'fund', 'future', 'game', 'going', 'good', 'got', 'group', 'year', 'guy', 'ha', 'hack', 'happen', 'help', 'id', 'im', 'job', 'join', 'just', 'dont', 'doesnt', 'doe', 'didnt', 'airdrop', 'amp', 'asset', 'bad', 'bank', 'best', 'better', 'big', 'binance', 'bitcoin', 'blockchain', 'bridge', 'buy', 'check', 'coin', 'come', 'community', 'company', 'crypto', 'day', 'did', 'know', 'let', 'life', 'security', 'social', 'space', 'support', 'team', 'thats', 'thing', 'think', 'time', 'today', 'token', 'twitter', 'use', 'user', 'wa', 'wallet', 'want', 'way', 'week', 'whale', 'work', 'world', 'signal', 'say', 'like', 'roll', 'long', 'look', 'lot', 'love', 'make', 'market', 'money', 'national', 'need', 'new', 'news', 'nft', 'people', 'place', 'platform', 'price', 'project', 'pump', 'real', 'really', 'right', 'account']
1 ['binance', 'year', 'insight', 'ftx', 'future', 'going', 'good', 'got', 'guy', 'ha', 'hack', 'hour', 'im', 'just', 'eth', 'kno

**Cluster 0**: 
Keywords: FTX, exchange, going, good (possibly positive sentiment?)

**Cluster 1**: 
Keywords: Binance, FTX, Liquidity

**Cluster 2**: 
Keywords: ZK, FTX, Game

**Cluster 3**: 
Keywords: Security, Doge, XRP, indicators (investing-related)

**Cluster 4**: 
Keywords: Group, Event, Community

**Cluster 5**: 
Keywords: Prediction, Price, Insight, Profit



In [1]:
import pandas as pd

# Load the pickled timestamp snapshot.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
ts_snapshot_path = '/data/blockchain-interoperability/blockchain-social-media/twitter-data/snapshots/timestamp_ms.pkl'
ts = pd.read_pickle(ts_snapshot_path)

In [8]:
# Convert the entry at label 0 from epoch milliseconds and show it as text.
first_timestamp = pd.to_datetime(ts[0], unit='ms')
str(first_timestamp)

'2022-11-11 12:03:53.965000'