In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import random
from operator import add

In [2]:
# Build (or reuse) a Spark session running on the local master, then keep a
# handle on its SparkContext for the RDD and broadcast calls used later.
spark = (
    SparkSession.builder
    .appName("Spark Processor")
    .master("local[*]")
    .getOrCreate()
)

sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/09 09:30:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Pre-Processing

In [3]:
# Fixed seed so the 0.1% sample -- and every statistic, index and matrix
# derived from it below -- is reproducible across Restart & Run All.
SAMPLE_SEED = 42

# Read the CSV using its header row for column names and keep a small random sample.
data_df = (
    spark.read.csv('MDA2022ProjectData.csv', header=True)
    .sample(fraction=0.001, seed=SAMPLE_SEED)
)
# Keep only the rows whose original and final car keys agree.
data_df = data_df.filter(data_df['ORIGINE_CAR_KEY'] == data_df['FINAL_CAR_KEY'])
data_df.show(10)

+-----------+---------+---------------+-------------+----------------+----------+-------------------+
|DEVICE_CODE|SYSTEM_ID|ORIGINE_CAR_KEY|FINAL_CAR_KEY|CHECK_STATUS_KEY|COMPANY_ID|      PASS_DAY_TIME|
+-----------+---------+---------------+-------------+----------------+----------+-------------------+
|   22010039|      284|       41640633|     41640633|               6|       161|2021-12-22 00:28:42|
|     900212|      283|       77694805|     77694805|               5|       264|2021-12-22 00:09:58|
|     900223|      283|       86389948|     86389948|               5|       264|2021-12-22 00:06:00|
|     900149|      283|       27582946|     27582946|               5|       264|2021-12-22 00:12:13|
|     900149|      283|       21418836|     21418836|               5|       264|2021-12-22 00:04:28|
|     900164|      283|       19187915|     19187915|               5|       264|2021-12-22 00:06:15|
|     631357|       81|       19857059|     19857059|               5|       161|2

In [4]:
# Only the camera code and the car key are needed from here on.
data_df = data_df.select('DEVICE_CODE', 'ORIGINE_CAR_KEY')
data_df.show()

+-----------+---------------+
|DEVICE_CODE|ORIGINE_CAR_KEY|
+-----------+---------------+
|   22010039|       41640633|
|     900212|       77694805|
|     900223|       86389948|
|     900149|       27582946|
|     900149|       21418836|
|     900164|       19187915|
|     631357|       19857059|
|   22010031|       20292220|
|     900149|       17853250|
|     230107|       23330831|
|     900236|       17970376|
|  100701092|        8250637|
|   22009972|        7652860|
|   22010053|      104926289|
|     900246|        7634794|
|     900101|       52326830|
|     900155|       44543349|
|     205201|       32610813|
|     900174|       16642481|
|     211001|       15650532|
+-----------+---------------+
only showing top 20 rows



In [5]:
# Switch to the RDD API: one (camera code, car key) tuple per record.
data_rdd = data_df.rdd.map(
    lambda record: (record['DEVICE_CODE'], record['ORIGINE_CAR_KEY'])
)

### Calculating Camera Count

In [6]:
# camera count: number of times each camera has detected a car
# Emit (camera, 1) per record and sum per key. Unlike groupBy + len, this adds
# up partial counts per key instead of gathering every record of a camera
# into one collection just to measure its length.
camera_count = data_rdd.map(lambda x: (x[0], 1)).reduceByKey(add)

In [7]:
# One pass over the counts yields both moments; calling mean() and variance()
# separately would evaluate the RDD twice.
camera_count_stats = camera_count.values().stats()
camera_count_mean = camera_count_stats.mean()
# Population standard deviation (square root of the population variance).
camera_count_sigma = np.sqrt(camera_count_stats.variance())

                                                                                

In [8]:
# Summary of the per-camera detection counts, one item per line.
camera_report = [
    '==== statistics of camera counts ====',
    f'mean: {camera_count_mean:.2f}',
    f'standard deviation: {camera_count_sigma:.2f}',
]
print('\n'.join(camera_report))

==== statistics of camera counts ====
mean: 155.28
standard deviation: 304.72


In [9]:
# Frequently visited cameras: a camera qualifies when its detection count is
# at least one standard deviation above the mean (count >= mean + std).
camera_count_threshold = camera_count_mean + camera_count_sigma
frequent_camera = camera_count.filter(lambda pair: pair[1] >= camera_count_threshold)

In [10]:
# count() triggers the Spark job that evaluates the filter above.
frequent_camera_total = frequent_camera.count()
print(f'number of frequent cameras: {frequent_camera_total}')



number of frequent cameras: 77


                                                                                

### Calculating Car Count

In [12]:
# car count: number of times each car has passed by a camera
# Emit (car, 1) per record and sum per key. Unlike groupBy + len, this adds
# up partial counts per key instead of gathering every record of a car into
# one collection just to measure its length.
car_count = data_rdd.map(lambda x: (x[1], 1)).reduceByKey(add)

In [13]:
# One pass over the counts yields both moments; calling mean() and variance()
# separately would evaluate the RDD twice.
car_count_stats = car_count.values().stats()
car_count_mean = car_count_stats.mean()
# Population standard deviation (square root of the population variance).
car_count_sigma = np.sqrt(car_count_stats.variance())

                                                                                

In [14]:
# Summary of the per-car pass counts, one item per line.
car_report = [
    '==== statistics of ca counts ====',
    f'mean: {car_count_mean:.2f}',
    f'standard deviation: {car_count_sigma:.2f}',
]
print('\n'.join(car_report))

==== statistics of ca counts ====
mean: 1.05
standard deviation: 1.04


In [15]:
# Frequent cars (cars that pass cameras often): a car qualifies when its pass
# count is at least half a standard deviation above the mean
# (count >= mean + 0.5 * std).
car_count_threshold = car_count_mean + 0.5 * car_count_sigma
frequent_cars = car_count.filter(lambda pair: pair[1] >= car_count_threshold)

In [16]:
# count() triggers the Spark job that evaluates the filter above.
frequent_cars_total = frequent_cars.count()
print(f'number of frequent cars: {frequent_cars_total}')



number of frequent cars: 5385


                                                                                

### Assigning an index to each camera 

In [17]:
# Bring the frequent camera codes to the driver; a camera's position in this
# list is its column index in the utility matrix.
frequent_camera_list = frequent_camera.keys().collect()
# Broadcast both lookup directions (code -> index and index -> code).
camera2index = sc.broadcast(
    {code: position for position, code in enumerate(frequent_camera_list)}
)
index2camera = sc.broadcast(dict(enumerate(frequent_camera_list)))

                                                                                

### Assigning an index to each car

In [18]:
# Bring the frequent car keys to the driver; a car's position in this list is
# its row index in the utility matrix.
frequent_cars_list = frequent_cars.keys().collect()
# Broadcast both lookup directions (car key -> index and index -> car key).
car2index = sc.broadcast(
    {car_key: position for position, car_key in enumerate(frequent_cars_list)}
)
index2car = sc.broadcast(dict(enumerate(frequent_cars_list)))



### Creating Utility Matrix

In [19]:
def utility_count(row):
    """Turn one (camera, car) record into a unit count at its matrix position.

    Args:
        row: indexable pair whose element 0 is the camera code and element 1
            is the car key.

    Returns:
        ((car_index, camera_index), 1), with both indices looked up in the
        broadcast ``car2index`` / ``camera2index`` dictionaries.

    Raises:
        KeyError: if the car or the camera has no entry in its dictionary.
    """
    camera, car = row[0], row[1]
    position = (car2index.value[car], camera2index.value[camera])
    return (position, 1)

# Keep only records whose camera AND car are both frequent, then map each to
# ((car_index, camera_index), 1). Membership is tested against the broadcast
# dictionaries (hash lookups) instead of the collected Python lists, which
# would be scanned linearly for every record.
car_camera_counts = data_rdd.filter(
    lambda x: x[0] in camera2index.value and x[1] in car2index.value
).map(utility_count)

In [20]:
# Sparse utility matrix: sum the unit counts that share a (car_index, camera_index) key.
utility_matrix = car_camera_counts.reduceByKey(lambda left, right: left + right)

In [23]:
# Rows are frequent cars, columns are frequent cameras.
utility_shape = (len(frequent_cars_list), len(frequent_camera_list))
print(f'shape of uitlity matrix:    {utility_shape}')

shape of uitlity matrix:    (5385, 77)


In [22]:
# Show the 20 largest entries of the utility matrix as ((row, column), value),
# ordered by value from largest to smallest.
utility_matrix.sortBy(keyfunc=lambda entry: entry[1], ascending=False).take(20)

                                                                                

[((5276, 6), 31),
 ((5276, 55), 29),
 ((5276, 26), 21),
 ((5276, 14), 20),
 ((2203, 1), 15),
 ((2203, 55), 12),
 ((2203, 18), 9),
 ((5276, 1), 8),
 ((2203, 20), 5),
 ((433, 11), 3),
 ((732, 66), 3),
 ((4075, 66), 3),
 ((212, 75), 3),
 ((2850, 46), 3),
 ((4196, 7), 3),
 ((1878, 12), 3),
 ((4679, 11), 2),
 ((4989, 8), 2),
 ((794, 30), 2),
 ((4390, 75), 2)]

### Finding SVD of utility matrix with CUR algorithm

#### Calculating norm of each column

In [29]:
def norm_of_column(item):
    """Return (column_index, value squared) for one utility-matrix entry.

    Args:
        item: an entry shaped ((row_index, column_index), value).

    Returns:
        A pair keyed by the column index carrying the squared value, i.e. this
        entry's contribution to the squared norm of its column.
    """
    coordinates, value = item[0], item[1]
    return coordinates[1], value ** 2

# Sum the squared entries per column index (squared column norms).
column_norm = utility_matrix.map(norm_of_column).reduceByKey(lambda left, right: left + right)

In [30]:
# Dense list of squared column norms; columns with no entries keep 0.
column_norms = [0] * len(frequent_camera_list)
for column_index, squared_norm in column_norm.collect():
    column_norms[column_index] = squared_norm
# Sum of all squared entries of the matrix (the squared Frobenius norm).
f_norm = sum(column_norms)
# Sampling probability of each column: its share of that sum.
column_probs = [squared_norm / f_norm for squared_norm in column_norms]

                                                                                

In [31]:
# f_norm here is the sum of the per-column sums of squared entries.
column_norm_message = f'Frobenius norm by summing column norms:  {f_norm}'
print(column_norm_message)

Frobenius norm by summing column norms:  10354


#### Calculating norm of each row

In [32]:
def norm_of_row(item):
    """Return (row_index, value squared) for one utility-matrix entry.

    Args:
        item: an entry shaped ((row_index, column_index), value).

    Returns:
        A pair keyed by the row index carrying the squared value, i.e. this
        entry's contribution to the squared norm of its row.
    """
    coordinates, value = item[0], item[1]
    return coordinates[0], value ** 2

# Sum the squared entries per row index (squared row norms).
row_norm = utility_matrix.map(norm_of_row).reduceByKey(lambda left, right: left + right)

In [33]:
# Dense list of squared row norms; rows with no entries keep 0.
row_norms = [0] * len(frequent_cars_list)
for row_index, squared_norm in row_norm.collect():
    row_norms[row_index] = squared_norm
# Sum of all squared entries of the matrix (the squared Frobenius norm).
f_norm = sum(row_norms)
# Sampling probability of each row: its share of that sum.
row_probs = [squared_norm / f_norm for squared_norm in row_norms]

                                                                                

In [35]:
# f_norm here is the sum of the per-row sums of squared entries.
row_norm_message = f'Frobenius norm by summing row norms:  {f_norm}'
print(row_norm_message)

Frobenius norm by summing row norms:  10354


### Selecting random rows

In [36]:
# Number of draws (with replacement) used when sampling rows and when
# sampling columns for the CUR decomposition below.
r = 10

In [37]:
# Draw r row indices with replacement, each row weighted by its share of the
# squared Frobenius norm, then collapse repeated draws.
# A row drawn k times gets the scaling factor sqrt(k / (r * p_row)).
row_population = range(len(frequent_cars_list))
random_row = random.choices(row_population, weights=row_probs, k=r)
unique_row = list(set(random_row))
row_count = [random_row.count(drawn_row) for drawn_row in unique_row]
row_factor = [
    np.sqrt(times_drawn / (r * row_probs[drawn_row]))
    for drawn_row, times_drawn in zip(unique_row, row_count)
]

### Selecting random columns

In [39]:
# Draw r column indices with replacement, each column weighted by its share of
# the squared Frobenius norm, then collapse repeated draws.
# A column drawn k times gets the scaling factor sqrt(k / (r * p_column)).
random_column = random.choices(range(len(frequent_camera_list)), weights=column_probs, k=r)
unique_column = [*set(random_column)]
column_count = [random_column.count(column) for column in unique_column]
# Pair each count with its own column and use the COLUMN probabilities; using
# the row indices/probabilities here gives wrong factors and can truncate the
# list when the numbers of unique rows and unique columns differ.
column_factor = [np.sqrt(k/(r*column_probs[index])) for k, index in zip(column_count, unique_column)]

### Forming W matrix

In [42]:
# W has one row per unique sampled row and one column per unique sampled column.
w_dimensions = (len(unique_row), len(unique_column))
print(f'shape of W:  ({w_dimensions})')

shape of W:  ((6, 7))


In [55]:
# W is the intersection of the sampled rows and sampled columns of the
# utility matrix; positions with no stored entry stay 0.
W_shape = (len(unique_row), len(unique_column))
W = np.zeros(W_shape)

# Position of every sampled matrix index inside W.
row_position = {row: i for i, row in enumerate(unique_row)}
column_position = {column: j for j, column in enumerate(unique_column)}

# Fetch every needed entry with ONE Spark job instead of one job per
# (row, column) pair.
selected_entries = utility_matrix.filter(
    lambda entry: entry[0][0] in row_position and entry[0][1] in column_position
).collect()

for (row, column), value in selected_entries:
    W[row_position[row], column_position[column]] = value

print('the calculated W matrix')
print(W)

                                                                                

the calculated W matrix
[[ 0.  0.  0.  0.  2.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 8.  0.  0.  0.  0. 29.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]]




### Constructing U matrix

In [62]:
# Full SVD of W: W = X @ Sigma @ YT, with S holding the singular values.
X, S, YT = np.linalg.svd(W, full_matrices=True)

In [64]:
# Rectangular "diagonal" matrix with W's shape, carrying the singular values
# on its main diagonal.
Sigma = np.zeros(W.shape)
diagonal_positions = np.arange(len(S))
Sigma[diagonal_positions, diagonal_positions] = S

In [77]:
# Constructing the U matrix: U = Y @ (Sigma^+)^2 @ X^T
Sigma_pinv = np.linalg.pinv(Sigma)
# Elementwise square (``**`` binds tighter than ``@``): the diagonal holds the
# squared reciprocal singular values.
Sigma_pinv_squared = Sigma_pinv ** 2
U = YT.T @ Sigma_pinv_squared @ X.T

print('The calculated U Matrix:')
print(U)

The calculated U Matrix:
[[0.         0.         0.         0.         0.00029384 0.        ]
 [0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.        ]
 [0.25       0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.00106519 0.        ]
 [0.         0.         0.         0.         0.         0.        ]]
