### Loading Data

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from operator import add

In [2]:
spark = SparkSession \
    .builder \
    .appName("Spark_Processor") \
    .master("local[*]") \
    .getOrCreate()

sc=spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/10 03:44:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
data_df = spark.read.option('header', 'true').csv('MDA2022ProjectData.csv').sample(fraction=0.03)

In [4]:
# a preview of the data
data_df.show()

+-----------+---------+---------------+-------------+----------------+----------+-------------------+
|DEVICE_CODE|SYSTEM_ID|ORIGINE_CAR_KEY|FINAL_CAR_KEY|CHECK_STATUS_KEY|COMPANY_ID|      PASS_DAY_TIME|
+-----------+---------+---------------+-------------+----------------+----------+-------------------+
|   22010039|      284|       63562975|     64111706|               7|       161|2021-12-22 00:27:32|
|   22010078|      284|       63474042|     64111706|               7|       161|2021-12-22 01:31:40|
|   22010057|      284|       63069449|     63069449|               6|       161|2021-12-22 01:07:24|
|   22010059|      284|       35168430|     35168430|               6|       161|2021-12-22 01:23:28|
|   22010054|      284|       35299555|     64111706|               7|       161|2021-12-22 00:46:34|
|   22010139|      284|       46152557|     46152557|               6|       264|2021-12-22 00:16:44|
|   22010055|      284|       46352457|     46352457|               6|       161|2

In [5]:
# deleting outliers
data_df = data_df.filter(data_df['ORIGINE_CAR_KEY'] == data_df['FINAL_CAR_KEY'])
data_df = data_df.select(['DEVICE_CODE', 'ORIGINE_CAR_KEY', 'PASS_DAY_TIME'])
data = data_df.rdd

In [6]:
# a function to generate the required rdd
def create_key_value(row):
    key = (row['ORIGINE_CAR_KEY'], row['PASS_DAY_TIME'])
    value = row['DEVICE_CODE']
    return (key, value)

In [7]:
# creating the required rdd
data = data.map(create_key_value)
data.take(20)

[Stage 2:>                                                          (0 + 1) / 1]                                                                                

[(('63069449', '2021-12-22 01:07:24'), '22010057'),
 (('35168430', '2021-12-22 01:23:28'), '22010059'),
 (('46152557', '2021-12-22 00:16:44'), '22010139'),
 (('46352457', '2021-12-22 00:18:20'), '22010055'),
 (('47778574', '2021-12-22 01:39:29'), '22010040'),
 (('47689247', '2021-12-22 01:35:23'), '22010039'),
 (('63691913', '2021-12-22 00:49:15'), '22010052'),
 (('63213660', '2021-12-22 01:23:46'), '100701144'),
 (('62838210', '2021-12-22 00:47:26'), '22010039'),
 (('64965848', '2021-12-22 01:30:32'), '22010054'),
 (('35887274', '2021-12-22 01:03:58'), '22010079'),
 (('36067843', '2021-12-22 01:40:28'), '22010039'),
 (('51891894', '2021-12-22 00:54:00'), '22010048'),
 (('49402571', '2021-12-22 01:15:49'), '22010053'),
 (('51643265', '2021-12-22 00:34:05'), '22010047'),
 (('64414455', '2021-12-22 00:37:39'), '22010054'),
 (('52102190', '2021-12-22 00:47:03'), '100701119'),
 (('51734068', '2021-12-22 00:59:22'), '22010039'),
 (('49552551', '2021-12-22 00:23:18'), '22010053'),
 (('500150

In [8]:
# cleaning the data: for some cameras, a car had been submited mutiple time in a single moment
# which is not rational. so, we will remove this redundancy
def day_time(basket):
    plate = basket[0][0]
    day_time = basket[0][1]
    camera_code = basket[1]
    day, time = day_time.split()
    time = time[:-3]
    return ((plate, day + " " + time), camera_code)

data_modified = data.map(day_time)
clean_data = data_modified.groupByKey().mapValues(lambda x: tuple(set(tuple(x))))

In [9]:
# removing time, leaving only day
def day(basket):
    plate = basket[0][0]
    day_time = basket[0][1]
    camera_codes = basket[1]
    day = day_time.split()[0]
    return ((plate, day), camera_codes)

def flatten(x):
    if isinstance(x, str):
        yield x
    else:
        for item in x:
            yield from flatten(item)

clean_data = clean_data.map(day)
clean_data = clean_data.groupByKey().mapValues(tuple).mapValues(lambda x: tuple(flatten(x)))
camera_baskets = clean_data.values()

## A-Priori Algorithm

In [10]:
# counting the visits to each camera
def extract_cameras(basket):
    items = tuple((camera, 1) for camera in basket)
    return items

camera_count = camera_baskets.flatMap(extract_cameras).reduceByKey(add)

In [11]:
# calculating support threshold. we set it equal to mean + 0.4 * std
mean_camera_count = camera_count.values().mean()
std_camera_count = camera_count.values().stdev()
SUPPORT_S =  mean_camera_count +  0.4 * std_camera_count
print(f'mean camera count:    {mean_camera_count:.2f}')
print(f'std of camera count:  {std_camera_count:.2f}')
print(f'support threshold:    {SUPPORT_S:0.2f}')



mean camera count:    3858.59
std of camera count:  8506.71
support threshold:    7261.27


                                                                                

In [12]:
# finding the frequently visited cameras
frequent_cameras = camera_count.filter(lambda x: x[1] >= SUPPORT_S)

In [13]:
# broadcasting the frequent cameras
frequent_cameras_list = sc.broadcast(frozenset(frequent_cameras.map(lambda x: x[0]).collect()))

                                                                                

In [14]:
# remove cameras which are not frequents
def remove_infrequent_cameras(basket):
    return tuple(item for item in basket if item in frequent_cameras_list.value)

baskets_with_frequent_cameras = camera_baskets.map(remove_infrequent_cameras)    
baskets_with_frequent_cameras = baskets_with_frequent_cameras.filter(lambda x: len(x)<350)

In [15]:
# creating two tuples
def create_two_tuple(basket):
    n = len(basket)
    two_tuples = []
    
    for i in range(n):
        for j in range(i+1, n):
            two_tuple = tuple(sorted([basket[i], basket[j]]))
            if len(set(two_tuple)) == 2:
                two_tuples.append((two_tuple, 1))
    return two_tuples

frequent_two_tuples = baskets_with_frequent_cameras.flatMap(create_two_tuple).reduceByKey(add).filter(lambda x: x[1] >= SUPPORT_S)

In [16]:
# results
top_two_tuples = frequent_two_tuples.sortBy(lambda x: x[1], ascending=False).take(10)
print(f'number of two tuples is: {frequent_two_tuples.count()}')
print('===================================')
print('===== top frequent two tuples =====')
for two_tuple in top_two_tuples:
    print(two_tuple)




number of two tuples is: 33
===== top frequent two tuples =====
(('22010039', '22010061'), 42802)
(('22010040', '22010061'), 38219)
(('22010053', '22010061'), 33258)
(('22010047', '22010061'), 29518)
(('22010039', '22010040'), 29389)
(('22010039', '22010053'), 24383)
(('22010044', '22010061'), 22361)
(('22010040', '22010053'), 22024)
(('22010039', '22010047'), 21202)
(('22010048', '22010061'), 20349)


                                                                                

In [17]:
# creating three tuples
def create_three_tuples(basket):
    n = len(basket)
    three_tuples = []
    
    for i in range(n):
        for j in range(i+1, n):
            for k in range(j+1, n):
                three_tuple = tuple(sorted([basket[i], basket[j], basket[k]]))
                if len(set(three_tuple)) == 3:
                    three_tuples.append((three_tuple, 1))
    return three_tuples

frequent_three_tuples = baskets_with_frequent_cameras.flatMap(create_three_tuples).reduceByKey(add).filter(lambda x: x[1] >= SUPPORT_S)

In [18]:
# results
top_three_tuples = frequent_three_tuples.sortBy(lambda x: x[1], ascending=False).take(10)
print(f'number of three tuples is: {frequent_three_tuples.count()}')
print('=================================')
print('===== top frequent three tuples =====')
for three_tuple in top_three_tuples:
    print(three_tuple)

                                                                                

number of three tuples is: 267
===== top frequent three tuples =====
(('22010039', '22010040', '22010061'), 1312097)
(('22010039', '22010053', '22010061'), 1027128)
(('22010040', '22010053', '22010061'), 950212)
(('22010039', '22010047', '22010061'), 917659)
(('22010040', '22010047', '22010061'), 854912)
(('22010039', '22010040', '22010053'), 772502)
(('22010047', '22010053', '22010061'), 760622)
(('22010039', '22010044', '22010061'), 714274)
(('22010039', '22010040', '22010047'), 672416)
(('22010039', '22010048', '22010061'), 666648)




## SON Algorithm

In [11]:
# setting the hyperparameters and spliting the data into chunks
NUM_OF_CHUNKS = 3
RELAXING_FACTOR = 1.2
SUPPORT_S = 3000
SON_SUPPORT_S = SUPPORT_S/(NUM_OF_CHUNKS * RELAXING_FACTOR)

son_final_frequent_two_tuples = sc.parallelize([])
son_final_frequent_three_tuples = sc.parallelize([])

def hash_data(basket):
    x = sum(int(camera_code) for camera_code in basket) % NUM_OF_CHUNKS
    return (x, basket)

hashed_baskets = camera_baskets.map(hash_data)

rdd1 = hashed_baskets.filter(lambda x: x[0] == 0).values()
rdd2 = hashed_baskets.filter(lambda x: x[0] == 1).values()
rdd3 = hashed_baskets.filter(lambda x: x[0] == 2).values()

data_chunks = [rdd1, rdd2, rdd3]

In [12]:
# creating candidates

def son_create_two_tuples(basket):
    n = len(basket)
    two_tuples = []
    
    for i in range(n):
        for j in range(i+1, n):
            if basket[i] in son_frequent_cameras_list.value and basket[j] in son_frequent_cameras_list.value:
                two_tuple = tuple(sorted([basket[i], basket[j]]))
                if len(set(two_tuple)) == 2:
                    two_tuples.append((two_tuple, 1))
    return two_tuples

def son_create_three_tuples(basket):
    n = len(basket)
    three_tuples = []
    
    for i in range(n):
        for j in range(i+1, n):
            for k in range(j+1, n):
                if basket[i] in son_frequent_cameras_list.value and basket[j] in son_frequent_cameras_list.value and basket[k] in son_frequent_cameras_list.value:
                    three_tuple = tuple(sorted([basket[i], basket[j], basket[k]]))
                    if len(set(three_tuple)) == 3:
                        three_tuples.append((three_tuple, 1))
    return three_tuples
                    
def son_extract_cameras(basket):
    return [(camera_code, 1) for camera_code in basket]

def son_remove_infrequent_cameras(basket):
    return tuple(camera_code for camera_code in basket if camera_code in son_frequent_cameras_list.value)

for chunk in data_chunks:
    camera_count = chunk.flatMap(son_extract_cameras).reduceByKey(add)
    
    son_frequent_cameras = camera_count.filter(lambda x: x[1] >= SON_SUPPORT_S)
    son_frequent_cameras_list = sc.broadcast(frozenset(son_frequent_cameras.keys().collect()))
    
    son_baskets_with_frequent_cameras = chunk.map(son_remove_infrequent_cameras)
    son_baskets_with_frequent_cameras = son_baskets_with_frequent_cameras.filter(lambda x: len(x) < 350)
    
    son_frequent_two_tuples = son_baskets_with_frequent_cameras.flatMap(son_create_two_tuples).reduceByKey(add).filter(lambda x: x[1] >= SON_SUPPORT_S)
    son_frequent_three_tuples = son_baskets_with_frequent_cameras.flatMap(son_create_three_tuples).reduceByKey(add).filter(lambda x: x[1] >= SON_SUPPORT_S)
    
    son_final_frequent_two_tuples = son_final_frequent_two_tuples.union(son_frequent_two_tuples)
    son_final_frequent_three_tuples = son_final_frequent_three_tuples.union(son_frequent_three_tuples)


                                                                                

In [13]:
# broadcasting the candidates
son_final_frequent_two_tuples_list = sc.broadcast(frozenset(son_final_frequent_two_tuples.keys().collect()))
son_final_frequent_three_tuples_list = sc.broadcast(frozenset(son_final_frequent_three_tuples.keys().collect()))

                                                                                

In [20]:
# verifying the candidates
def verify_two_tuple(basket):
    n = len(basket)
    two_tuples = []
    
    for i in range(n):
        for j in range(i+1, n):
            two_tuple = tuple(sorted([basket[i], basket[j]]))
            if two_tuple in son_final_frequent_two_tuples_list.value and len(set(two_tuple)) == 2:
                two_tuples.append((two_tuple, 1))
    return two_tuples

def verify_three_tuple(basket):
    n = len(basket)
    three_tuples = []
    
    for i in range(n):
        for j in range(i+1, n):
            for k in range(j+1, n):
                three_tuple = tuple(sorted([basket[i], basket[j], basket[k]]))
                if three_tuple in son_final_frequent_three_tuples_list.value and len(set(three_tuple)) == 3:
                    three_tuples.append((three_tuple, 1))
    return three_tuples

son_camera_baskets = baskets_with_frequent_cameras

son_final_frequent_two_tuples = son_camera_baskets.flatMap(verify_two_tuple).reduceByKey(add).filter(lambda x: x[1] >= SUPPORT_S)
son_final_frequent_three_tuples = son_camera_baskets.flatMap(verify_three_tuple).reduceByKey(add).filter(lambda x: x[1]>= SUPPORT_S)


In [19]:
# results
top_two_tuples = son_final_frequent_two_tuples.sortBy(lambda x: x[1], ascending=False).take(10)
print(f'number of two tuples is: {son_final_frequent_two_tuples.count()}')
print('===================================')
print('===== top frequent two tuples =====')
for two_tuple in top_two_tuples:
    print(two_tuple)


                                                                                

number of two tuples is: 33
===== top frequent two tuples =====
(('22010039', '22010061'), 42802)
(('22010040', '22010061'), 38219)
(('22010053', '22010061'), 33258)
(('22010047', '22010061'), 29518)
(('22010039', '22010040'), 29389)
(('22010039', '22010053'), 24383)
(('22010044', '22010061'), 22361)
(('22010040', '22010053'), 22024)
(('22010039', '22010047'), 21202)
(('22010048', '22010061'), 20349)




In [20]:
# results
top_three_tuples = son_final_frequent_three_tuples.sortBy(lambda x: x[1], ascending=False).take(10)
print(f'number of three tuples is: {son_final_frequent_three_tuples.count()}')
print('=================================')
print('===== top frequent three tuples =====')
for three_tuple in top_three_tuples:
    print(three_tuple)
    

                                                                                

number of three tuples is: 267
===== top frequent three tuples =====
(('22010039', '22010040', '22010061'), 1312097)
(('22010039', '22010053', '22010061'), 1027128)
(('22010040', '22010053', '22010061'), 950212)
(('22010039', '22010047', '22010061'), 917659)
(('22010040', '22010047', '22010061'), 854912)
(('22010039', '22010040', '22010053'), 772502)
(('22010047', '22010053', '22010061'), 760622)
(('22010039', '22010044', '22010061'), 714274)
(('22010039', '22010040', '22010047'), 672416)
(('22010039', '22010048', '22010061'), 666648)


