In [2]:
# install java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# install spark (change the version number if needed)
!wget -q https://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop2.tgz

# unzip the spark file to the current folder
!tar xf spark-3.3.0-bin-hadoop2.tgz

# set your spark folder to your system path environment. 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.0-bin-hadoop2"

In [3]:
# install findspark using pip
!pip install findspark

# install pyspark using pip
!pip install pyspark

import findspark
findspark.init("/content/spark-3.3.0-bin-hadoop2")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
# upload files to work with
from google.colab import files

# in our case, upload data.csv
files.upload()

Saving data.csv to data.csv


{'data.csv': b'1,I,VXIO456XLBB630221,Nissan,Altima,2003,2002-05-08,Initial sales from TechMotors\n2,I,INU45KIOOPA343980,Mercedes,C300,2015,2014-01-01,Sold from EuroMotors\n3,A,VXIO456XLBB630221,,,,2014-07-02,Head on collision\n4,R,VXIO456XLBB630221,,,,2014-08-05,Repair transmission\n5,I,VOME254OOXW344325,Mercedes,E350,2015,2014-02-01,Sold from Carmax\n6,R,VOME254OOXW344325,,,,2015-02-06,Wheel allignment service\n7,R,VXIO456XLBB630221,,,,2015-01-01,Replace right head light\n8,I,EXOA00341AB123456,Mercedes,SL550,2016,2015-01-01,Sold from AceCars\n9,A,VOME254OOXW344325,,,,2015-10-01,Side collision\n10,R,VOME254OOXW344325,,,,2015-09-01,Changed tires\n11,R,EXOA00341AB123456,,,,2015-05-01,Repair engine\n12,A,EXOA00341AB123456,,,,2015-05-03,Vehicle rollover\n13,R,VOME254OOXW344325,,,,2015-09-01,Replace passenger side door\n14,I,UXIA769ABCC447906,Toyota,Camery,2017,2016-05-08,Initial sales from Carmax\n15,R,UXIA769ABCC447906,,,,2020-01-02,Initial sales from Carmax\n16,A,INU45KIOOPA343980,,,,202

In [1]:
#!/usr/bin/env python

from pyspark.sql import *
from pyspark import SparkContext

def extract_vin_key_value(line: str):
    values = line.split(",") # reads in data from data.csv
    type = values[1]
    vin_num = values[2]
    make = values[3]
    year = values[5]
    PairRDD = (make, year, type)
    return (vin_num, PairRDD)


sc = SparkContext("local", "My Application")
raw_rdd = sc.textFile("data.csv")

vin_kv = raw_rdd.map(lambda x: extract_vin_key_value(x))

enhance_make = vin_kv.groupByKey()\
                     .flatMap(lambda kv: kv[1])\
                     .filter(lambda x: len(x[1]) > 0 and len(x[2]) > 0)

make_kv = enhance_make.map(lambda x: x[0] + '-' + x[1])

make_kv_count = make_kv.map(lambda x: (x, 1))\
                       .reduceByKey(lambda x, y: x+y)


from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('autoinc_spark').getOrCreate()
column = ['make_year','count']
df = make_kv_count.toDF(column)

df.printSchema()
df.show(truncate=False)

root
 |-- make_year: string (nullable = true)
 |-- count: long (nullable = true)

+-------------+-----+
|make_year    |count|
+-------------+-----+
|Nissan-2003  |1    |
|Mercedes-2015|2    |
|Mercedes-2016|1    |
|Toyota-2017  |1    |
+-------------+-----+

