### Enter full names of group members:

##### Name A: Aaryan Neupane
##### Name B: Anne Torgersen

In [2]:
import math
import numpy as np
from sympy import prime
from pathlib import Path  # for paths of files
import csv
import copy
import random
from sklearn.metrics.pairwise import cosine_similarity

# ANSI escape codes for colors
class colors:
    red = '\033[91m'
    green = '\033[92m'
    blue = '\033[94m'
    end = '\033[0m'  

### 1. DGIM

#### 1.1. DGIM algorithm

In [3]:
# Default DGIM parameters

stream_path = 'data/my_stream.txt'

# The window size
N = 500 

In [4]:
# Helper functions

def shift_buckets(bucket_list):
    for i in range(len(bucket_list) - 1):
        if len(bucket_list[i]) > 2:
            prev_time = bucket_list[i].pop(0)
            penultimate_time = bucket_list[i].pop(0)
            bucket_list[i+1].append(penultimate_time)

def remove_expired(bucket_list, end_time, N):
    for bucket in bucket_list:
        for time_stamp in bucket:
            if (end_time - time_stamp) > N:
                bucket.remove(time_stamp)

In [5]:
def dgim_algorithm(stream_path, N):
    num_buckets = int(N).bit_length()
    bucket_list = [[] for _ in range(num_buckets)]
    time = 0
    with open(stream_path) as file:
        while True:
            bit = file.read(1)

            if not bit:
                break
                
            time += 1

            if bit == '1':
                bucket_list[0].append(time)
                shift_buckets(bucket_list)
                remove_expired(bucket_list, time, N)

    bucket_list = [[elem % N for elem in bucket] for bucket in bucket_list]
            
    end_time = max(bucket_list[0])
    
    return bucket_list, end_time


In [6]:
bucket = dgim_algorithm(stream_path, N)
print(f"The updated list of timestamps buckets from DGIM algorithm: \n {bucket[0]}")
print(f"The end timestamp: {bucket[1]}") 

The updated list of timestamps buckets from DGIM algorithm: 
 [[99], [91, 96], [83, 89], [63, 75], [44], [6], [321, 446], [188], []]
The end timestamp: 99


#### 1.2. Query the Bucket 

In [7]:
def actual_count(stream_path, k):
    stream_list = []
    with open(stream_path, 'r') as file:
        for line in file:
            stream_list.extend(list(map(int, line.strip())))

    # Convert the list into a numpy array
    stream_array = np.array(stream_list)
    
    return int(np.sum(stream_array[-k:]))

In [8]:
def dgim_query(bucket_list, N, k):  
    bucket_list, end_time_stamp = bucket_list
    
    boxes_to_check = 1
    prev_time = 0
    for bucket in bucket_list:
        if len(bucket) == 0:
            break
        if max(bucket) == end_time_stamp:
            continue
            
        time = abs(end_time_stamp - min(bucket))
        if time >= k:
            break
 
        prev_time = abs(end_time_stamp - min(bucket)) + prev_time
        
        if prev_time >= k:
            boxes_to_check += 1
            break
            
        else:
            boxes_to_check += 1
            continue
            
    one_count = 0
    for i in range(boxes_to_check):    
        if i == boxes_to_check:
            if len(bucket_list[i]) == 2:
                one_count += (1 * 2**i) + (1 * 2**i)//2
            elif len(bucket_list[i]) == 1:
                one_count += (1 * 2**i) //2
        one_count += len(bucket_list[i]) * 2**i

    return math.ceil(one_count)

In [9]:
# List of queries
K = [10, 50, 100, 200, 300, 400, 500] 

In [10]:
print("---------------------------------------------------------------")
for k in K:
    dgim_count = dgim_query(bucket, 500, k)
    true_count = actual_count(stream_path, k)
    
    print(f"The total 1s in the last {k} bits by DGIM: {dgim_count}")
    print(f"The true count of 1s in the last {k} bits: {true_count}")
    print(f"The DGIM error for predicted 1s in the last {k} bits: \
    {round(abs(100*(dgim_count-true_count))/true_count,2)} %")
    print("---------------------------------------------------------------")

---------------------------------------------------------------
The total 1s in the last 10 bits by DGIM: 5
The true count of 1s in the last 10 bits: 5
The DGIM error for predicted 1s in the last 10 bits:     0.0 %
---------------------------------------------------------------
The total 1s in the last 50 bits by DGIM: 29
The true count of 1s in the last 50 bits: 26
The DGIM error for predicted 1s in the last 50 bits:     11.54 %
---------------------------------------------------------------
The total 1s in the last 100 bits by DGIM: 45
The true count of 1s in the last 100 bits: 51
The DGIM error for predicted 1s in the last 100 bits:     11.76 %
---------------------------------------------------------------
The total 1s in the last 200 bits by DGIM: 77
The true count of 1s in the last 200 bits: 105
The DGIM error for predicted 1s in the last 200 bits:     26.67 %
---------------------------------------------------------------
The total 1s in the last 300 bits by DGIM: 205
The true c

### 2. Bloom filters

In [None]:
# Username data for the creation of bloom filters - B
data_file = (Path("data/bloom_username").with_suffix('.csv'))

# Test data to check the functionality and false positive rate
test1_file = (Path("data/test1_username").with_suffix('.csv'))
test2_file = (Path("data/test2_username").with_suffix('.csv'))

# Default bloom filter parameters
bloom_size = 1500000 # parameter N
h = 3 # number of hash functions

In [None]:
# create an array of bloom filter with zeros
B = np.zeros(bloom_size)

In [None]:
B

#### 2.1. Create Bloom filter

In [None]:
def generate_hash(h, N):
    hash_list = []
    
    # To-do! generate a list of hash functions
        
    return hash_list

In [None]:
hashes = generate_hash(h, bloom_size)

In [None]:
def create_bloom_filter(B, hashes, data):
    with data.open() as f:
        for name in f:
            
            # To-do! update the hash index of the bloom filter with 1s
            
    return B

In [None]:
bloom_array = create_bloom_filter(B, hashes, data_file)

In [None]:
bloom_array

#### 2.2. Verify usernames

In [None]:
def single_verify_username(bloom_array, hashes, new_user):
    
    # To-do! verify username and return a code of 0 or 1 (1 - username taken and 0 - username available)
        
    return code
    

In [None]:
# Feel free to test different usernames here

new_username = "KazeemTDT4305"

# new_username = "ShambaTDT4305"

In [None]:
user_code = single_verify_username(bloom_array, hashes, new_username)

In [None]:
if user_code == 1:
    print(colors.red + f"Username {new_username} has been taken. Try again!" + colors.end)
elif user_code == 0:
    print(colors.green + f"Username {new_username} is available. Congrats!" + colors.end)
else:
    print(colors.blue + f"Wrong pass code. Please reverify!" + colors.end)  

In [None]:
def group_verify_username(bloom_array, hashes, data):
    # Initialize counts
    total_name = 0
    taken_name = 0
    
    with data.open() as f:
        for name in f:
            # To-do! similar to the single verify, but returns a percentage of usernames taken...
            # ...(In other words seen already by the bloom filter during its creation)
            
    return round(taken_name/total_name*100,2)   

In [None]:
print("----------------------------------------------------------")
user_total = group_verify_username(bloom_array, hashes, test1_file)
print(f"Percentage of username seen before from test 1: {user_total}%")
print("----------------------------------------------------------")
user_total = group_verify_username(bloom_array, hashes, test2_file)
print(f"Percentage of username seen before from test 2: {user_total}%")
print("----------------------------------------------------------")

### 3. Flajolet-Martin

In [None]:
def flajolet_martin(input_stream):
    R = 0  # Initialize maximum rightmost zero bit position to 0

    # To-do! Define hash function h(x) = 6x + 1 mod 5
    

    # To-do! Iterate over the input stream and update maximum rightmost zero bit position
    

    # Estimate the number of distinct elements
    distinct_estimate = 2 ** R

    return distinct_estimate

In [None]:
# Input stream
input_stream1 = [1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1]
input_stream2 = [1, 3, 2, 1, 2, 3, 4, 3, 1, 2, 3, 1]

# Run the Flajolet-Martin algorithm
distinct_estimate1 = flajolet_martin(input_stream1)
distinct_estimate2 = flajolet_martin(input_stream2)

# Print the estimated number of distinct elements
print("-----------------------------------------------------")
print(f"Distinct elements (estimated) in input stream 1:", distinct_estimate1)
print("-----------------------------------------------------")
print(f"Distinct elements (estimated) in input stream 2:", distinct_estimate2)
print("-----------------------------------------------------")

### 4. Adword 

#### 4.1. Greedy Algorithm

In [None]:
# User queries
queries = ["big data", "big data", "big data","bloom filters", "bloom filters", "bloom filters",
           "flajolet martin", "flajolet martin", "flajolet martin", "dgim algorithm", "dgim algorithm", "dgim algorithm"]

In [None]:
# Company A B C and D keywords and budget $$$
global_companies = {
        'A': ["big data", "bloom filters", 3],
        'B': ["flajolet martin", 3],
        'C': ["flajolet martin", "dgim algorithm", 3],
        'D': ["big data", 3],
    }

In [None]:
def greedy_algorithm(local_companies, queries):
    # Initial revenue
    revenue = 0
    
    # To-do! update revenue using greedy algorithm
    
    return revenue

In [None]:
total_revenue = 0
total_trials = 10
print("Starting trials using Greedy Algorithm...")
print("------------------------------------------------")
for i in range(total_trials):
    local_companies = copy.deepcopy(global_companies)
    revenue = greedy_algorithm(local_companies, queries)
    total_revenue = total_revenue + revenue
    print(f"Trial {i+1} - Revenue generated: {revenue}")
print("------------------------------------------------")   
print("Average revenue generated for all trials: ",total_revenue/total_trials)

#### 4.2. Balance Algorithm

In [None]:
def balance_algorithm(local_companies, queries):
    # Initial revenue
    revenue = 0
    
    # To-do! update revenue using balance algorithm
    
    return revenue

In [None]:
total_revenue = 0
total_trials = 10
print("Starting trials using Balance Algorithm...")
print("-------------------------------------------")
for i in range(total_trials):
    local_companies = copy.deepcopy(global_companies)
    revenue = balance_algorithm(local_companies, queries)
    total_revenue = total_revenue + revenue
    print(f"Trial {i+1} - Revenue generated: {revenue}")
print("-------------------------------------------")   
print("Average revenue generated for all trials: ",total_revenue/total_trials)

### 5. Recommender System

In [11]:
# Ratings matrix (each row corresponds to a movie, and each column corresponds to a user)
ratings_matrix = np.array([
    [1, 0, 3, 0, 0, 5, 0, 0, 5, 0, 4, 0],
    [0, 0, 5, 4, 0, 0, 4, 0, 0, 2, 1, 3],
    [2, 4, 0, 1, 2, 0, 3, 0, 4, 3, 5, 0],
    [0, 2, 4, 0, 5, 0, 0, 4, 0, 0, 2, 0],
    [0, 0, 4, 3, 4, 2, 0, 0, 0, 0, 2, 5],
    [1, 0, 3, 0, 3, 0, 0, 2, 0, 0, 4, 0]
])

#### 5.1. User-User Collaborative Filtering

In [12]:
# Helper functions

def cos_sim(user1_v, user2_v):
    return np.dot(user1_v, user2_v) / (np.linalg.norm(user1_v) * np.linalg.norm(user2_v))
    
def get_top_indices(arr, x):
    sorted_indices = np.argsort(arr)
    # Find the highest indices, ignoring the first
    top_indices = sorted_indices[-(x+1):-1]
    return top_indices

In [13]:
def user_cf(rate_m, tup_mu, neigh):
    movie_id, user_id = tup_mu
    
    user_v = rate_m.T[user_id - 1]

    similarities = []

    for user in rate_m.T:
        similarities.append(cos_sim(user_v, user))

    N_users = get_top_indices(similarities, neigh)

    pred_numer = 0
    pred_denom = 0

    for user in N_users:
        # Weighted average
        pred_numer += similarities[user] * rate_m[movie_id - 1][user]
        pred_denom += similarities[user]

    prediction = round(pred_numer / pred_denom, 2)    
    
    return prediction

In [14]:
# List of tuple of movie rating by users to be predicted e.g (1, 5) refers to the rating of movie 1 by user 5
list_mu_query = [(1, 5), (3, 3)]

# Neighbor selection (|N|)
neigh = 2

In [15]:
print("-----------------------------------------------------------------")   
for mu_query in list_mu_query:
    predicted_rating = user_cf(ratings_matrix, mu_query, neigh)
    print(f"The predicted rating of movie {mu_query[0]} by user {mu_query[1]}: {predicted_rating} (User-User CF)")
    print("-----------------------------------------------------------------")   

-----------------------------------------------------------------
The predicted rating of movie 1 by user 5: 1.42 (User-User CF)
-----------------------------------------------------------------
The predicted rating of movie 3 by user 3: 1.49 (User-User CF)
-----------------------------------------------------------------


#### 5.2. Item-Item Collaborative Filtering

In [16]:
def item_cf(rate_m, tup_mu, neigh):
    movie_id, user_id = tup_mu

    movie_v = rate_m[movie_id - 1]
    user_v = rate_m.T[user_id - 1]

    similarities = []

    for movie in rate_m:
        similarities.append(cos_sim(movie_v, movie))

    N_movies = get_top_indices(similarities, neigh)

    pred_numer = 0
    pred_denom = 0

    for movie in N_movies:
        # Weighted average
        pred_numer += similarities[movie] * user_v[movie]
        pred_denom += similarities[movie]
        
    prediction = round(pred_numer / pred_denom, 2)
    
    return prediction

In [17]:
print("-----------------------------------------------------------------")   
for mu_query in list_mu_query:
    predicted_rating = item_cf(ratings_matrix, mu_query, neigh)
    print(f"The predicted rating of movie {mu_query[0]} by user {mu_query[1]}: {predicted_rating} (Item-Item CF)")
    print("-----------------------------------------------------------------")   

-----------------------------------------------------------------
The predicted rating of movie 1 by user 5: 2.48 (Item-Item CF)
-----------------------------------------------------------------
The predicted rating of movie 3 by user 3: 3.0 (Item-Item CF)
-----------------------------------------------------------------


### Provide concise answers to all 5 cases in the Project 3 description below

#### Case 1

The space complexity of DGIM is $O(\log^2 N)$ because each bucket can represent up to $\log N$ timestamps, resulting in a total of $log^2 N$ timestamps across all buckets. 


#### Case 2

In [None]:
# Enter answer here

#### Case 3

In [None]:
# Enter answer here

#### Case 4

In [None]:
# Enter answer here

#### Case 5

By analyzing the ratings matrix, it's clear that users 3 and 5 share similar tastes in the provided movies. Since user 3 rated movie 1 a 3, it is natural to assume that a prediction closer to 3 would be more accurate. Based on this intuition, item-item collaborative filtering seems like the better prediction.