# SVD

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,TimestampType

# Create (or reuse) a Spark session named "Spark_Processor" running in
# local mode; "local[*]" is the master URL passed to the builder.
spark = (
    SparkSession.builder
    .appName("Spark_Processor")
    .master("local[*]")
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

# Explicit schema of seven columns; every field is declared nullable
# (third StructField argument is True). Five columns are integers,
# COMPANY_ID is a string and PASS_DAY_TIME is a timestamp.
_int = IntegerType()
_str = StringType()
_ts = TimestampType()

schema = StructType([
    StructField("DEVICE_CODE", _int, True),
    StructField("SYSTEM_ID", _int, True),
    StructField("ORIGINE_CAR_KEY", _int, True),
    StructField("FINAL_CAR_KEY", _int, True),
    StructField("CHECK_STATUS_KEY", _int, True),
    StructField("COMPANY_ID", _str, True),
    StructField("PASS_DAY_TIME", _ts, True),
])

22/02/08 11:26:48 WARN Utils: Your hostname, jupyter resolves to a loopback address: 127.0.1.1; using 172.16.199.17 instead (on interface eth0)
22/02/08 11:26:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/08 11:26:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/02/08 11:26:51 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


---- [('spark.sql.warehouse.dir', 'file:/home/user1/jupyter/MassiveDataAnalytics/project/spark-warehouse'), ('spark.executor.id', 'driver'), ('spark.app.name', 'Spark_Processor'), ('spark.driver.host', '172.16.199.17'), ('spark.executor.memory', '64g'), ('spark.app.startTime', '1644319609594'), ('spark.driver.memory', '64g'), ('spark.rdd.compress', 'True'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.submit.pyFiles', ''), ('spark.driver.port', '46397'), ('spark.submit.deployMode', 'client'), ('spark.app.id', 'local-1644319611673'), ('spark.ui.showConsoleProgress', 'true')]


In [3]:
# Read Traffic.csv with the explicit schema; the first line of the file
# is treated as a header rather than as data.
df = (
    spark.read
    .schema(schema)
    .option('header', True)
    .csv('Traffic.csv')
)
df.show(1)

[Stage 0:>                                                          (0 + 1) / 1]

+-----------+---------+---------------+-------------+----------------+----------+-------------------+
|DEVICE_CODE|SYSTEM_ID|ORIGINE_CAR_KEY|FINAL_CAR_KEY|CHECK_STATUS_KEY|COMPANY_ID|      PASS_DAY_TIME|
+-----------+---------+---------------+-------------+----------------+----------+-------------------+
|     200501|       81|       10477885|     10477885|               5|       161|2021-06-01 03:54:39|
|        155|       81|       87625017|     87625017|               5|       161|2021-06-01 04:14:21|
|     631757|       81|        8652928|      8652928|               5|       161|2021-06-01 03:58:57|
|     631757|       81|        8548123|      8548123|               5|       161|2021-06-01 04:01:38|
|     631757|       81|       24715264|     24715264|               5|       161|2021-06-01 03:56:57|
+-----------+---------+---------------+-------------+----------------+----------+-------------------+
only showing top 5 rows



                                                                                

In [None]:
from pyspark.sql.functions import dayofyear
# Add a 'day' column holding the day-of-year of each PASS_DAY_TIME timestamp.
df = df.withColumn('day', dayofyear(df['PASS_DAY_TIME']))
# Keep every record except those on day-of-year 159 (8 June in a non-leap year).
# NOTE(review): the reason for excluding this day is not recorded here — confirm.
df = df.where(df['day'] != 159)
df.show(1)

In [3]:
# Basic size figures: total records, distinct car keys, distinct camera codes.
# Each .count() is a separate Spark action over df.
DATA_COUNT = df.count()
CAR_COUNT = df.select('FINAL_CAR_KEY').dropDuplicates().count()
CAMERA_COUNT = df.select('DEVICE_CODE').dropDuplicates().count()

for label, value in (
    ('all count: ', DATA_COUNT),
    ('car count: ', CAR_COUNT),
    ('camera count: ', CAMERA_COUNT),
):
    print(label, value)



all count:  34989160
car count:  5487645
camera count:  1035


                                                                                

In [10]:
# Number of records per car key; the result has columns FINAL_CAR_KEY and count.
car_counts = df.groupBy('FINAL_CAR_KEY').count()
# Show the ten cars with the most records, largest first.
car_counts.orderBy(car_counts['count'].desc()).show(10)



+-------------+-------+
|FINAL_CAR_KEY|  count|
+-------------+-------+
|     64111706|1891912|
|     69177480|  80818|
|      8073331|  30194|
|     67682391|   6227|
|      7633319|   1579|
|     14919817|   1513|
|     19516092|   1385|
|      8396536|    804|
|     73138295|    730|
|     23975824|    550|
+-------------+-------+
only showing top 10 rows



                                                                                

In [None]:
# Pick the most frequently seen cars among those with fewer than
# MAX_PASSES_PER_CAR records, then keep only the records of those cars.
# NOTE(review): the cutoff presumably removes outlier keys with extremely
# high counts — confirm the intended threshold.
MAX_PASSES_PER_CAR = 1000   # cars with this many records or more are skipped
TOP_CAR_COUNT = 100         # how many cars to keep

top_car_rows = (
    car_counts
    .filter(car_counts['count'] < MAX_PASSES_PER_CAR)
    # Sorting on 'count' alone leaves the order of tied cars unspecified, so
    # take() could return a different set of cars on every run. The car key
    # is added as a secondary sort key to make the selection reproducible.
    .sort(['count', 'FINAL_CAR_KEY'], ascending=[False, True])
    .select('FINAL_CAR_KEY')
    .take(TOP_CAR_COUNT)
)
# Plain Python list of the selected car keys (collected on the driver).
traffic_cars = [row.FINAL_CAR_KEY for row in top_car_rows]
top_cars_df = df.filter(df.FINAL_CAR_KEY.isin(traffic_cars))
top_cars_df.show(5)

In [6]:
# Total number of records in top_cars_df; the bare name on the last line
# makes the notebook display the value.
TOP_CARS_RECORDS = top_cars_df.count()
TOP_CARS_RECORDS

                                                                                

36907

In [7]:
# Number of records for every (camera, car) pair among the selected cars;
# the result has columns DEVICE_CODE, FINAL_CAR_KEY and count.
camera_car_df = top_cars_df.groupBy(['DEVICE_CODE', 'FINAL_CAR_KEY']).count()
camera_car_df.show(1)



+-----------+-------------+-----+
|DEVICE_CODE|FINAL_CAR_KEY|count|
+-----------+-------------+-----+
|     631367|      8642668|    6|
|     631357|     13565906|   16|
|     900249|     11054045|   14|
|     900237|     17890990|   21|
|     631829|     11054045|    6|
+-----------+-------------+-----+
only showing top 5 rows



                                                                                

In [9]:
from pyspark.ml.feature import StringIndexer

# Replace the raw car keys and camera codes with StringIndexer-assigned
# indices, stored in the new columns CAR_INDEX and CAMERA_INDEX.
car_indexer = StringIndexer(inputCol='FINAL_CAR_KEY', outputCol='CAR_INDEX')
car_indexed_df = car_indexer.fit(camera_car_df).transform(camera_car_df)

# The camera indexer is fitted on the frame that already carries CAR_INDEX,
# so the final frame holds both index columns.
camera_indexer = StringIndexer(inputCol='DEVICE_CODE', outputCol='CAMERA_INDEX')
camera_car_indexed_df = camera_indexer.fit(car_indexed_df).transform(car_indexed_df)
camera_car_indexed_df.show(2)



+-----------+-------------+-----+---------+------------+
|DEVICE_CODE|FINAL_CAR_KEY|count|CAR_INDEX|CAMERA_INDEX|
+-----------+-------------+-----+---------+------------+
|        135|     11086409|    1|      3.0|       139.0|
|     213301|     25826200|    1|      6.0|       226.0|
|        144|      8556436|    2|     40.0|        90.0|
|     202601|      8137760|    2|     37.0|        45.0|
|   22010087|      7633319|    7|      0.0|       234.0|
+-----------+-------------+-----+---------+------------+
only showing top 5 rows



                                                                                

In [11]:
from pyspark.mllib.linalg.distributed import CoordinateMatrix
# Build the sparse utility matrix from (row, column, value) triples:
# row = CAR_INDEX, column = CAMERA_INDEX, value = record count of the pair.
# The index columns are cast with int() before being used as coordinates.
#
# The entries RDD is cached because everything downstream (computeSVD,
# numRows, numCols) makes one or more passes over it; without a cached
# ancestor each pass would recompute the full lineage back to the CSV read.
# Spark may still log its "input data is not directly cached" warning for
# the derived RowMatrix, but the expensive part of the lineage is cut here.
utility_entries = camera_car_indexed_df.rdd.map(
    lambda row: (int(row['CAR_INDEX']), int(row['CAMERA_INDEX']), row['count'])
).cache()
utility_matrix = CoordinateMatrix(utility_entries)

                                                                                

In [15]:
# Convert the coordinate matrix to a RowMatrix and request the top 100
# singular values; computeU=True asks for the U factor as well.
# NOTE(review): a RowMatrix does not carry row indices, so the rows of
# svd.U cannot be matched back to CAR_INDEX by position with certainty.
# Consider toIndexedRowMatrix() if U is used per car — confirm with
# downstream usage before changing, since it alters the type of svd.U.
row_matrix = utility_matrix.toRowMatrix()
svd = row_matrix.computeSVD(100, computeU=True)
# Print the vector of singular values.
print(svd.s)

22/02/06 08:58:56 WARN RowMatrix: The input data is not directly cached, which may hurt performance if its parent RDDs are also uncached.


[505.0079430191521,448.4295807359056,234.00214006863334,179.1701725920204,164.08997913184254,128.8791269976936,89.96735321905756,85.39518265895747,81.27830846566535,73.68298955156429,71.16152478824884,63.00000000000001,54.67770373301821,50.08082000595019,46.65059215674066,40.75131283041665,39.49700737544456,38.81535112797081,37.883517887572744,37.477208085831876,36.48712513043299,34.10351087756282,32.95191581964757,31.668219074999598,29.382353059741757,28.007321050538476,27.81619066292238,27.173428229890284,25.9063839257756,25.51413614576888,24.916241572216165,24.226667981846077,24.030657094981326,23.365738603258077,19.73522502525707,18.44050839391655,17.983228407353685,17.882637049599147,17.512933811166867,16.650162052423365,16.219922944230152,15.917728847396386,15.32284427560125,14.385998218746812,13.55772093864274,12.77531198470423,12.033550152122068,11.946000054777999,11.341697289372854,11.021451077725494,10.16109555211842,9.735261830759898,9.507578905362777,9.188825793668242,8.823

22/02/06 08:59:06 WARN RowMatrix: Requested 100 singular values but only found 98 nonzeros.
22/02/06 08:59:06 WARN RowMatrix: The input data was not directly cached, which may hurt performance if its parent RDDs are also uncached.


In [13]:
# Print the matrix dimensions, one per line: column count first, then row count.
print(utility_matrix.numCols(), utility_matrix.numRows(), sep='\n')

331
100
