In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from operator import add
from tqdm import tqdm
import random
import numpy as np

### Loading data

In [2]:
spark = SparkSession \
    .builder \
    .appName("Spark_Processor") \
    .master("local[*]") \
    .getOrCreate()

sc=spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/09 23:38:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
data_df = spark.read.option('header', 'true').csv('MDA2022ProjectData.csv').sample(fraction=0.01)

In [4]:
data_df = data_df.filter(data_df['ORIGINE_CAR_KEY'] == data_df['FINAL_CAR_KEY'])
data_df = data_df.select(['DEVICE_CODE', 'ORIGINE_CAR_KEY'])
data_rdd = data_df.rdd.map(lambda x: (x['DEVICE_CODE'], x['ORIGINE_CAR_KEY']))

### Defining class for pin node and board node

In [5]:
# Node class for creating our bipartite graph

class Node:
    def __init__(self, code):
        self.neighbours = []
        self.count = 0
        self.code = code
    
    def add_neighbour(self, neighbour):
        self.neighbours.append(neighbour)
    
    def get_neighbours(self):
        return self.neighbours
    
    def add_count(self):
        self.count = self.count + 1 
    
    def get_random_neighbour(self):
        return random.choice(self.get_neighbours())


### Assigning an index to each camera and creating node list

In [6]:
camera_list = data_rdd.map(lambda x: x[0]).distinct().collect()
camera_node_list = tuple(Node(camera) for camera in camera_list)
camera_name2index = {camera_name: index for index, camera_name in enumerate(camera_list)}

                                                                                

### Assigning an index to each car and creating node list

In [7]:
car_list = data_rdd.map(lambda x: x[1]).distinct().collect()
car_node_list = tuple(Node(car) for car  in car_list)
car_name2index = {car_name: index for index, car_name in enumerate(car_list)}

                                                                                

### Constructing the grapgh

In [8]:
data_list = data_rdd.collect()
for record in tqdm(data_list):
    camera_name = record[0]
    car_name = record[1]
    pin_node = camera_node_list[camera_name2index[camera_name]]
    board_node = car_node_list[car_name2index[car_name]]
    pin_node.add_neighbour(board_node)
    board_node.add_neighbour(pin_node)


100%|█████████████████████████████| 1244995/1244995 [00:01<00:00, 888079.28it/s]


### Pixie algorithm

In [9]:
def pixie_algorithm(query_node, steps, alpha):
    pin_node = query_node
    for i in tqdm(range(steps)):
        pin_node.add_count()
        board_node = pin_node.get_random_neighbour()
        pin_node = board_node.get_random_neighbour()
        if np.random.rand() <= alpha:
            pin_node = query_node


### Querying a sample camera
Here a choose a random camera and find similar cameras to it using pixie's algorithm

In [10]:
query_camera = random.choice(camera_node_list)
pixie_algorithm(query_camera, 500000, 0.008)

100%|███████████████████████████████| 500000/500000 [00:01<00:00, 457519.74it/s]


In [11]:
proximity_list = sorted(camera_node_list, key= lambda x: x.count, reverse=True)

In [13]:
print(f'code of query camera:   {query_camera.code}')
print('------- Top 10 similar cameras -------')
for i in range(10):
    camera = proximity_list[i]
    print(f'{i+1}- camera code: {camera.code}         visited: {camera.count}')

code of query camera:   22009928
------- Top 10 similar cameras -------
1- camera code: 22009928         visited: 36005
2- camera code: 900212         visited: 9593
3- camera code: 900269         visited: 9363
4- camera code: 900244         visited: 9128
5- camera code: 100700853         visited: 8560
6- camera code: 900142         visited: 7983
7- camera code: 631634         visited: 7874
8- camera code: 900222         visited: 6548
9- camera code: 900101         visited: 5890
10- camera code: 900218         visited: 5726
