In [1]:
import requests
import json
from bitarray import bitarray
from decimal import Decimal #To avoid overflowing for very large numbers
import numpy as np
from time import time

In [2]:
def count_trail_zeroes(d):
    """Count the trailing zero bits in the binary representation of an integer.

    Parameters
    ----------
    d : int
        Value to inspect. It is rendered as a binary string zero-padded to
        at least 64 digits.

    Returns
    -------
    int
        Number of consecutive '0' bits at the least-significant end of ``d``.
        Returns 64 when no '1' bit is found (``d == 0``), or when ``d``
        cannot be formatted as binary (the resulting ValueError is treated
        the same way).
    """
    b = "{:064b}".format(d)
    try:
        # Measure from the real end of the string: values wider than 64 bits
        # yield a longer string, so a fixed offset of 63 would be wrong.
        return len(b) - 1 - b.rindex("1")
    except ValueError:  # "substring not found" error - did not find any '1' in the string
        return 64

In [5]:
#Initialization
# Tunable parameters and random hash coefficients shared by the streaming cell.
largePrime = 1003162753  # modulus of the linear hashes (h*a + b) % largePrime

# Flajolet-Martin approach
users_N_zeros = 0  # largest number of trailing zero bits seen in any user hash so far
# Multiplier (a) and offset (b) of the Flajolet-Martin hash.
# The multiplier is drawn from [1, largePrime): a == 0 would hash every user
# to the same value and make the estimate meaningless.
usr_r1 = np.random.randint(1, largePrime)
usr_r2 = np.random.randint(largePrime)

#Bloom filter
N_bloom = 10000  # number of bits in the filter
usr_hashes = 4  # number of hash functions to be used
# One (multiplier, offset) row per hash function, shape (usr_hashes, 2).
# Multipliers come from [1, N_bloom) so that no hash function is constant;
# offsets come from [0, N_bloom).
usr_rs = np.column_stack((
    np.random.randint(1, N_bloom, size=usr_hashes),
    np.random.randint(N_bloom, size=usr_hashes),
))


#Filtering the most popular topics with exponentially decaying windows
topics_c = 1 - 1E-2  # 1 - c: factor every stored count is multiplied by per topic event
topics_count = {}  # topic name -> decayed count
topic_threshold = 0.9  # counts that decay below this threshold are dropped
# NOTE(review): 0.99**11 < 0.9, so a topic seen once is dropped after 11 further
# topic events; confirm that such a short memory is intended.

running_time = 60 * 60  # in seconds (compared against differences of time())

iter_report = 500  # report progress every iter_report processed RSVPs

In [6]:
# Pass 1: read the live RSVP stream for up to `running_time` seconds and update
# three streaming summaries per RSVP: the Flajolet-Martin trailing-zero maximum,
# the Bloom filter of member ids, and the exponentially decayed topic counts.
B = bitarray(N_bloom)
B.setall(0)

rq = requests.get('http://stream.meetup.com/2/rsvps', stream=True)
start_time = time()
passed_time = 0.0
iters = 0
skipped_records = 0  # lines that could not be parsed or processed; inspect after the run
try:
    for raw_rsvp in rq.iter_lines():
        try:
            rsvp = json.loads(raw_rsvp)
            member_id = rsvp[u'member'][u'member_id']
            member_hash = hash(member_id)

            #Flajolet-Martin: remember the longest run of trailing zero bits
            user_hash = (member_hash * usr_r1 + usr_r2) % largePrime
            users_N_zeros = max(users_N_zeros, count_trail_zeroes(user_hash))

            #Bloom filter for identifying recurring users
            for r in range(usr_hashes):
                # Decimal keeps the product out of fixed-width numpy integer arithmetic.
                hash_i = int(((Decimal(member_hash)*usr_rs[r, 0] + usr_rs[r, 1]) % largePrime)) % N_bloom
                B[hash_i] = 1

            #Finding the most popular topics
            topics = [tp[u'topic_name'] for tp in rsvp[u'group'][u'group_topics']]
            for topic in topics:
                # Decay every stored count, dropping those that fall below the
                # threshold. Iterate over a copy of the keys so entries can be
                # deleted on the fly.
                for k in list(topics_count):
                    topics_count[k] *= topics_c
                    if topics_count[k] < topic_threshold:
                        del topics_count[k]
                topics_count[topic] = topics_count.get(topic, 0) + 1

            iters += 1
            if iters % iter_report == 0:
                # Refresh the elapsed time so the report is not one iteration stale.
                passed_time = time() - start_time
                hours, remainder = divmod(passed_time, 3600)
                minutes, seconds = divmod(remainder, 60)
                print("{} items have been received in {:.0f}h {:.0f}m {:.0f}s".format(iters, hours, minutes, seconds))
        except (ValueError, LookupError, TypeError, ArithmeticError):
            # Malformed or incomplete record (e.g. an empty keep-alive line or a
            # missing field): skip it. Unlike a bare `except`, this lets
            # KeyboardInterrupt and programming errors propagate.
            skipped_records += 1
        passed_time = time() - start_time
        if passed_time > running_time:
            break
finally:
    # Always release the streaming connection, even when interrupted.
    rq.close()
print("stream has been stopped")


#Report
TopN = 10
# Rank the surviving topics by decayed weight, heaviest first, and keep TopN.
ranked_topics = sorted(topics_count.items(), key=lambda entry: entry[1], reverse=True)
most_frequent_TopN = ranked_topics[:TopN]
for topic_name, weight in most_frequent_TopN:
    print("{} (weight) for: {}".format(weight, topic_name))


# Flajolet-Martin estimate: 2 to the largest trailing-zero count observed.
unique_users_estimate = 2 ** users_N_zeros
print("Approximately {} (2**{}) unique users have responded an RSVP".format(unique_users_estimate, users_N_zeros))


# Check recurring users
# Pass 2: read the stream again for up to `running_time` seconds and test each
# member against the Bloom filter B. A member counts as recurring when all
# usr_hashes bit positions are set; a Bloom filter can report false positives,
# so the count is an upper-bound style estimate.
rq = requests.get('http://stream.meetup.com/2/rsvps', stream=True)
start_time = time()
recurring_users = 0
users_count = 0
try:
    for raw_rsvp in rq.iter_lines():
        try:
            rsvp = json.loads(raw_rsvp)
            member_id = rsvp[u'member'][u'member_id']
            member_hash = hash(member_id)
            users_count += 1

            matched_hashes = 0
            for r in range(usr_hashes):
                # Must stay identical to the index expression used when B was filled.
                hash_i = int(((Decimal(member_hash)*usr_rs[r, 0] + usr_rs[r, 1]) % largePrime)) % N_bloom
                if B[hash_i]:
                    matched_hashes += 1
            if matched_hashes == usr_hashes:
                recurring_users += 1
        except (ValueError, LookupError, TypeError, ArithmeticError):
            # Malformed or incomplete record: skip it. Unlike a bare `except`,
            # this lets KeyboardInterrupt and programming errors propagate.
            pass
        passed_time = time() - start_time
        if passed_time > running_time:
            break
finally:
    # Always release the streaming connection, even when interrupted.
    rq.close()
print("stream has been stopped")

#Report recurring users
# Guard against an empty pass (no record parsed) to avoid a ZeroDivisionError.
recurring_ratio = recurring_users / users_count if users_count else 0.0
print("{} out of {} users have been made RSVPs in a row ({:.4f})".format(recurring_users, users_count, recurring_ratio))

500 items have been received in 0h 6m 3s
1000 items have been received in 0h 10m 32s
1500 items have been received in 0h 16m 18s
2000 items have been received in 0h 22m 3s
2500 items have been received in 0h 28m 0s
3000 items have been received in 0h 34m 12s
3500 items have been received in 0h 38m 46s
4000 items have been received in 0h 44m 47s
4500 items have been received in 0h 50m 47s
5000 items have been received in 0h 56m 23s
stream has been stopped
1 (weight) for: Latino Culture
0.99 (weight) for: Salsa Dance Lessons
0.9801 (weight) for: Dance and Movement
0.9702989999999999 (weight) for: Exercise
0.96059601 (weight) for: Cultural Activities
0.9509900498999999 (weight) for: Dance Lessons
0.9414801494009999 (weight) for: Dance Fitness
0.9320653479069899 (weight) for: Latin Dance
0.92274469442792 (weight) for: Dancing
0.9135172474836407 (weight) for: Women's Fitness
Approximately 1024 (2**10) unique users have responded an RSVP
stream has been stopped
1950 out of 5837 users have be