In [1]:
from pyspark.sql import SparkSession
import seaborn as sns
import matplotlib as plt
import altair as alt
import plotly.express as px

In [2]:
df_path = r"F:\Datasets\CSV datasets\vehicles.csv"

In [3]:
spark = SparkSession.builder.appName('Vehicles-Dataset').getOrCreate()

In [4]:
from pyspark.sql.types import StringType, StructType, IntegerType, FloatType, DoubleType, StructField, LongType, ShortType

# PySpark dtypes conversion cant throw error if conversion is incorrect

In [5]:
df = spark.read.csv(df_path, header=True, inferSchema=True)

In [6]:
df.count()

441802

In [7]:
df.schema

StructType([StructField('id', StringType(), True), StructField('url', StringType(), True), StructField('region', StringType(), True), StructField('region_url', StringType(), True), StructField('price', StringType(), True), StructField('year', StringType(), True), StructField('manufacturer', StringType(), True), StructField('model', StringType(), True), StructField('condition', StringType(), True), StructField('cylinders', StringType(), True), StructField('fuel', StringType(), True), StructField('odometer', StringType(), True), StructField('title_status', StringType(), True), StructField('transmission', StringType(), True), StructField('VIN', StringType(), True), StructField('drive', StringType(), True), StructField('size', StringType(), True), StructField('type', StringType(), True), StructField('paint_color', StringType(), True), StructField('image_url', StringType(), True), StructField('description', StringType(), True), StructField('county', StringType(), True), StructField('state',

In [8]:
print(df.schema)

StructType([StructField('id', StringType(), True), StructField('url', StringType(), True), StructField('region', StringType(), True), StructField('region_url', StringType(), True), StructField('price', StringType(), True), StructField('year', StringType(), True), StructField('manufacturer', StringType(), True), StructField('model', StringType(), True), StructField('condition', StringType(), True), StructField('cylinders', StringType(), True), StructField('fuel', StringType(), True), StructField('odometer', StringType(), True), StructField('title_status', StringType(), True), StructField('transmission', StringType(), True), StructField('VIN', StringType(), True), StructField('drive', StringType(), True), StructField('size', StringType(), True), StructField('type', StringType(), True), StructField('paint_color', StringType(), True), StructField('image_url', StringType(), True), StructField('description', StringType(), True), StructField('county', StringType(), True), StructField('state',

In [9]:
df.show(5)

+----------+--------------------+--------------------+--------------------+-----+----+------------+-----+---------+---------+----+--------+------------+------------+----+-----+----+----+-----------+---------+-----------+------+-----+----+----+------------+
|        id|                 url|              region|          region_url|price|year|manufacturer|model|condition|cylinders|fuel|odometer|title_status|transmission| VIN|drive|size|type|paint_color|image_url|description|county|state| lat|long|posting_date|
+----------+--------------------+--------------------+--------------------+-----+----+------------+-----+---------+---------+----+--------+------------+------------+----+-----+----+----+-----------+---------+-----------+------+-----+----+----+------------+
|7222695916|https://prescott....|            prescott|https://prescott....| 6000|NULL|        NULL| NULL|     NULL|     NULL|NULL|    NULL|        NULL|        NULL|NULL| NULL|NULL|NULL|       NULL|     NULL|       NULL|  NULL|  

In [10]:
from pyspark.sql.functions import col, when, sum

In [11]:
df.select([
    sum(when(col(column).isNull(), 1).otherwise(0)).alias(column) for column in df.columns
]).show()

+---+----+------+----------+-----+----+------------+-----+---------+---------+-----+--------+------------+------------+------+------+------+------+-----------+---------+-----------+------+-----+-----+-----+------------+
| id| url|region|region_url|price|year|manufacturer|model|condition|cylinders| fuel|odometer|title_status|transmission|   VIN| drive|  size|  type|paint_color|image_url|description|county|state|  lat| long|posting_date|
+---+----+------+----------+-----+----+------------+-----+---------+---------+-----+--------+------------+------------+------+------+------+------+-----------+---------+-----------+------+-----+-----+-----+------------+
|  0|9884|  6901|      6533| 6446|7890|       28937|17506|   187143|   190798|16344|   17801|       21618|       15932|174486|144178|319997|106892|     143839|    13732|      13733|382283|23077|25397|24715|       22477|
+---+----+------+----------+-----+----+------------+-----+---------+---------+-----+--------+------------+------------+-

In [12]:
df.filter(df['id'].isNull()).count()

0

In [13]:
df.filter(
    (col('price') > 6_000) &
    (col('region') != 'bellingham') &
    col('manufacturer').isNotNull()
).orderBy('price', ascending=False).show()

+----------+--------------------+--------------------+--------------------+--------+----+-------------+--------------------+---------+-----------+------+--------+------------+------------+-----------------+-----+---------+-----------+-----------+--------------------+--------------------+-------+--------------------+--------------------+--------------------+--------------------+
|        id|                 url|              region|          region_url|   price|year| manufacturer|               model|condition|  cylinders|  fuel|odometer|title_status|transmission|              VIN|drive|     size|       type|paint_color|           image_url|         description| county|               state|                 lat|                long|        posting_date|
+----------+--------------------+--------------------+--------------------+--------+----+-------------+--------------------+---------+-----------+------+--------+------------+------------+-----------------+-----+---------+-----------+----

In [14]:
df.withColumn(
    'price_above_6000', col('price') > 6_000
).show()

+----------+--------------------+--------------------+--------------------+-----+----+------------+-----+---------+---------+----+--------+------------+------------+----+-----+----+----+-----------+---------+-----------+------+-----+----+----+------------+----------------+
|        id|                 url|              region|          region_url|price|year|manufacturer|model|condition|cylinders|fuel|odometer|title_status|transmission| VIN|drive|size|type|paint_color|image_url|description|county|state| lat|long|posting_date|price_above_6000|
+----------+--------------------+--------------------+--------------------+-----+----+------------+-----+---------+---------+----+--------+------------+------------+----+-----+----+----+-----------+---------+-----------+------+-----+----+----+------------+----------------+
|7222695916|https://prescott....|            prescott|https://prescott....| 6000|NULL|        NULL| NULL|     NULL|     NULL|NULL|    NULL|        NULL|        NULL|NULL| NULL|NU

In [15]:
df.groupBy('region').count().orderBy('count', ascending=False)

DataFrame[region: string, count: bigint]

In [16]:
print(df.select([
    sum(when(col(column).isNull(), 1).otherwise(0)).alias(column) for column in df.columns
]).show())

+---+----+------+----------+-----+----+------------+-----+---------+---------+-----+--------+------------+------------+------+------+------+------+-----------+---------+-----------+------+-----+-----+-----+------------+
| id| url|region|region_url|price|year|manufacturer|model|condition|cylinders| fuel|odometer|title_status|transmission|   VIN| drive|  size|  type|paint_color|image_url|description|county|state|  lat| long|posting_date|
+---+----+------+----------+-----+----+------------+-----+---------+---------+-----+--------+------------+------------+------+------+------+------+-----------+---------+-----------+------+-----+-----+-----+------------+
|  0|9884|  6901|      6533| 6446|7890|       28937|17506|   187143|   190798|16344|   17801|       21618|       15932|174486|144178|319997|106892|     143839|    13732|      13733|382283|23077|25397|24715|       22477|
+---+----+------+----------+-----+----+------------+-----+---------+---------+-----+--------+------------+------------+-

In [17]:
df.show(5)

+----------+--------------------+--------------------+--------------------+-----+----+------------+-----+---------+---------+----+--------+------------+------------+----+-----+----+----+-----------+---------+-----------+------+-----+----+----+------------+
|        id|                 url|              region|          region_url|price|year|manufacturer|model|condition|cylinders|fuel|odometer|title_status|transmission| VIN|drive|size|type|paint_color|image_url|description|county|state| lat|long|posting_date|
+----------+--------------------+--------------------+--------------------+-----+----+------------+-----+---------+---------+----+--------+------------+------------+----+-----+----+----+-----------+---------+-----------+------+-----+----+----+------------+
|7222695916|https://prescott....|            prescott|https://prescott....| 6000|NULL|        NULL| NULL|     NULL|     NULL|NULL|    NULL|        NULL|        NULL|NULL| NULL|NULL|NULL|       NULL|     NULL|       NULL|  NULL|  

In [18]:
df.groupBy(col('year')).agg({'price': 'sum'}).orderBy('year', ascending=False).show()

+--------------------+-----------+
|                year| sum(price)|
+--------------------+-----------+
|                  wa|       NULL|
|                  dc|       NULL|
|           47.656773|       NULL|
|           45.382346|       NULL|
|           40.624962|       NULL|
|             34.1847|       NULL|
|                2022|  3639818.0|
|2021-05-04T20:41:...| -73.829146|
|2021-05-04T20:33:...|-122.605132|
|2021-05-04T20:31:...|-122.605132|
|2021-05-04T20:29:...|-122.605132|
|2021-05-04T20:21:...| -74.042702|
|2021-05-04T19:56:...| -73.829146|
|2021-05-04T18:56:...| -73.829146|
|2021-05-04T18:56:...| -74.042702|
|2021-05-04T18:46:...| -73.829146|
|2021-05-04T18:31:...|  -97.13379|
|2021-05-04T18:06:...| -122.57822|
|2021-05-04T18:01:...| -106.38551|
|2021-05-04T17:46:...| -73.829146|
+--------------------+-----------+
only showing top 20 rows



In [19]:
from pyspark.sql.functions import year

In [20]:
df.select(year('year')).collect()

[Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=None),
 Row(year(year)=2014),
 Row(year(year)=2010),
 Row(year(year)=2020),
 Row(year(year)=2017),
 Row(year(year)=2013),
 Row(year(year)=2012),
 Row(year(year)=2016),
 Row(year(year)=2019),
 Row(year(year)=2016),
 Row(year(year)=2011),
 Row(year(year)=1992),
 Row(year(year)=2017),
 Row(year(year)=2017),
 Row(year(year)=2016),
 Row(year(year)=2014),
 Row(year(year)=2016),
 Row(year(y

In [21]:
df.show(5)

+----------+--------------------+--------------------+--------------------+-----+----+------------+-----+---------+---------+----+--------+------------+------------+----+-----+----+----+-----------+---------+-----------+------+-----+----+----+------------+
|        id|                 url|              region|          region_url|price|year|manufacturer|model|condition|cylinders|fuel|odometer|title_status|transmission| VIN|drive|size|type|paint_color|image_url|description|county|state| lat|long|posting_date|
+----------+--------------------+--------------------+--------------------+-----+----+------------+-----+---------+---------+----+--------+------------+------------+----+-----+----+----+-----------+---------+-----------+------+-----+----+----+------------+
|7222695916|https://prescott....|            prescott|https://prescott....| 6000|NULL|        NULL| NULL|     NULL|     NULL|NULL|    NULL|        NULL|        NULL|NULL| NULL|NULL|NULL|       NULL|     NULL|       NULL|  NULL|  

In [22]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- region: string (nullable = true)
 |-- region_url: string (nullable = true)
 |-- price: string (nullable = true)
 |-- year: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- condition: string (nullable = true)
 |-- cylinders: string (nullable = true)
 |-- fuel: string (nullable = true)
 |-- odometer: string (nullable = true)
 |-- title_status: string (nullable = true)
 |-- transmission: string (nullable = true)
 |-- VIN: string (nullable = true)
 |-- drive: string (nullable = true)
 |-- size: string (nullable = true)
 |-- type: string (nullable = true)
 |-- paint_color: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- description: string (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- long: string (nullable = true)
 |-- posting_date: string (nu

In [23]:
df.count()

441802

In [24]:
df.select(['price', 'year', 'manufacturer']).show(5)

+-----+----+------------+
|price|year|manufacturer|
+-----+----+------------+
| 6000|NULL|        NULL|
|11900|NULL|        NULL|
|21000|NULL|        NULL|
| 1500|NULL|        NULL|
| 4900|NULL|        NULL|
+-----+----+------------+
only showing top 5 rows



In [25]:
df.select(col('year'), col('price'), col('manufacturer')).show(5)

+----+-----+------------+
|year|price|manufacturer|
+----+-----+------------+
|NULL| 6000|        NULL|
|NULL|11900|        NULL|
|NULL|21000|        NULL|
|NULL| 1500|        NULL|
|NULL| 4900|        NULL|
+----+-----+------------+
only showing top 5 rows



In [26]:
from pyspark.sql.types import (StringType, StructType, IntegerType, FloatType, DoubleType, StructField, LongType,
                               ShortType, YearMonthIntervalType, DateType, CharType)

In [27]:
year_not_null = df.filter(col('year').isNotNull())

year_not_null.withColumns({
    'id': col('id').cast(LongType()),
    'url': col('url').cast(StringType()),
    'region': col('region').cast(StringType()),
    'price': col('price').cast(IntegerType()),
    'year': col('year').cast(ShortType()),
    'manufacturer': col('manufacturer').cast(StringType()),
}).show(5, truncate=False)

+----------+------------------------------------------------------------------------------------------+------+-----------------------------+-----+----+------------+------------------------+---------+-----------+----+--------+------------+------------+-----------------+-----+---------+------+-----------+-------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [28]:
from pyspark.sql.functions import min, max

In [29]:
df.agg(min(col('price')).alias('min_price'), max(col('price')).alias('max_price')).show()

+---------+---------+
|min_price|max_price|
+---------+---------+
|         |       wa|
+---------+---------+



In [30]:
spark.stop()