In [50]:
import json
import os
import time
from random import shuffle
from sseclient import SSEClient as EventSource

In [47]:
def generate_wiki_changes_dataset(maxEvents=5000):
    print('Will generate records for wiki changes dataset ...')
    url = 'https://stream.wikimedia.org/v2/stream/recentchange'
    counter = 0 
    data = []
    t1 = time.time()

    for event in EventSource(url):
        if event.event == 'message':
            try:
                change = json.loads(event.data)
            except ValueError:
                continue
                
            counter += 1 
            if counter > maxEvents:
                break
            
            data.append(change)

    t2 = time.time()
    print(f'Took {(t2-t1)//60} mins to generate {maxEvents} records.')
    return data

def generate_or_load_black_list(force=False, dataset=None, n=100):
    black_list = None
    black_list_temp_file = 'black_list_temp.json'

    if not os.path.exists(black_list_temp_file) or force:
        dataset = dataset.copy() or generate_wiki_changes_dataset(n)
        black_list = list(set([c['user'] for c in dataset if c['bot']]))
        with open(black_list_temp_file, 'w') as output_file:
            output_file.write(json.dumps(black_list))
    else:
        print('File exists, loading...')
        with open(black_list_temp_file) as input_file:
            black_list = json.load(input_file)
    return black_list

In [52]:
# dataset = generate_wiki_changes_dataset()
black_list = generate_or_load_black_list(force=True, dataset=dataset)

In [53]:
dataset[0]

{'$schema': '/mediawiki/recentchange/1.0.0',
 'meta': {'uri': 'https://ceb.wikipedia.org/wiki/Kategoriya:Abungawg-uhong',
  'request_id': 'e444560e-95da-4be4-901d-29cfc9d26dea',
  'id': 'ab536fa3-5820-4e8e-88ae-3f2c7715ec8c',
  'dt': '2021-10-05T18:20:56Z',
  'domain': 'ceb.wikipedia.org',
  'stream': 'mediawiki.recentchange',
  'topic': 'eqiad.mediawiki.recentchange',
  'partition': 0,
  'offset': 3346060393},
 'id': 109444152,
 'type': 'categorize',
 'namespace': 14,
 'title': 'Kategoriya:Abungawg-uhong',
 'comment': '[[:Septoria exotica]] added to category',
 'timestamp': 1633458056,
 'user': 'Lsjbot',
 'bot': True,
 'server_url': 'https://ceb.wikipedia.org',
 'server_name': 'ceb.wikipedia.org',
 'server_script_path': '/w',
 'wiki': 'cebwiki',
 'parsedcomment': '<a href="/wiki/Septoria_exotica" title="Septoria exotica">Septoria exotica</a> added to category'}

In [54]:
len(dataset), len(black_list)

(5001, 43)

In [55]:
black_list[:10]

['Dexbot',
 '*Treker',
 'GeographBot',
 'Citation bot',
 'BjornNbot',
 'SDZeroBot',
 'Edoderoobot',
 'DeltaBot',
 'DerbethBot',
 'RobokoBot']

In [58]:
from bloom_filter import BloomFilter
from random import shuffle
 
n = len(black_list)  # no of items to add
p = 0.1  # false positive probability
 
bloom_filter = BloomFilter(n, p)

for item in black_list:
    bloom_filter.add(item)

bloom_filter.size, bloom_filter.hash_count

(206, 3)

In [66]:
white_list = list(set([c['user'] for c in dataset[:20] if not c['bot']]))
test_words = black_list[:10] + white_list
shuffle(test_words)
black_list[:10]

['Dexbot',
 '*Treker',
 'GeographBot',
 'Citation bot',
 'BjornNbot',
 'SDZeroBot',
 'Edoderoobot',
 'DeltaBot',
 'DerbethBot',
 'RobokoBot']

In [69]:
for word in test_words:
    if bloom_filter.check(word):
        if word in black_list:
            print(f"+ '{word}' is probably present!")
        else:
            print(f"? '{word}' is a false positive!")
    else:
        print(f"- '{word}' is definitely NOT present!")

- 'Joshua06' is definitely NOT present!
- 'Rwzi' is definitely NOT present!
+ 'DerbethBot' is probably present!
+ 'SDZeroBot' is probably present!
- 'Slava Sahakyan70' is definitely NOT present!
+ 'GeographBot' is probably present!
+ 'Dexbot' is probably present!
- 'Ixfd64' is definitely NOT present!
+ '*Treker' is probably present!
- 'GrawLIN' is definitely NOT present!
- 'RoBri' is definitely NOT present!
+ 'DeltaBot' is probably present!
- 'Ukulelevillain' is definitely NOT present!
+ 'Citation bot' is probably present!
- 'Lombroso' is definitely NOT present!
+ 'BjornNbot' is probably present!
+ 'Edoderoobot' is probably present!
- 'Steenth' is definitely NOT present!
- 'Skim' is definitely NOT present!
+ 'RobokoBot' is probably present!
