# Convert CSV to parquet files
While the CSV files can be loaded directly into PostgreSQL (with pgAdmin) and MySQL (with Workbench), this notebook uses Spark to read each CSV file and save it as Parquet files, which can then be used on cloud services such as Databricks or AWS SageMaker.

In [3]:
import findspark
# Initialise findspark before importing pyspark: findspark.init() makes the
# local Spark installation importable, so the pyspark imports below must stay
# after this call.
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkFiles

# Start (or reuse) the Spark session shared by every cell in this notebook.
# getOrCreate() returns the already-running session if one exists.
spark = SparkSession.builder.appName("Used Cars ETL").getOrCreate()


In [11]:
def write_csv_to_parquet(filename, csv_dir='cleaned_csv', parquet_dir='parquet'):
    """Load one cleaned CSV file with Spark and save it as parquet.

    Reads ``{csv_dir}/{filename}.csv`` (comma-separated, first row used as the
    header) into a DataFrame and writes it to ``{parquet_dir}/{filename}``,
    replacing any existing output at that location.

    Args:
        filename: Base name of the CSV file, without the ``.csv`` extension.
            It is also used as the name of the parquet output directory.
        csv_dir: Directory containing the cleaned CSV files.
        parquet_dir: Directory under which the parquet output is written.

    Returns:
        None. Progress is reported on stdout.

    Note:
        No schema is supplied and schema inference is not enabled, so every
        column is read (and therefore written) as a string.
    """
    csv_path = f'{csv_dir}/{filename}.csv'

    # Read the CSV straight from its path instead of going through
    # SparkContext.addFile / SparkFiles.get: addFile registers a given path
    # only once per session, so re-running this function after the CSV was
    # regenerated would keep reading the stale copy from the first call.
    # NOTE(review): assumes the session runs in local mode, as set up in this
    # notebook, so the relative path is visible to Spark -- confirm before
    # running against a cluster.
    print("Loading file...", end='\r')
    df = spark.read.csv(csv_path, sep=",", header=True)
    print("Loading file... Done.")

    # Export DataFrame to parquet file
    print("Write parquet files...", end='\r')
    df.write.parquet(f'{parquet_dir}/{filename}', mode='overwrite')
    print("Write parquet files... Done.")

In [12]:
# Convert cleaned_csv/location.csv into parquet files under parquet/location.
write_csv_to_parquet('location')

Loading file... Done.
Write parquet files... Done.


In [13]:
# Convert cleaned_csv/listing.csv into parquet files under parquet/listing.
write_csv_to_parquet('listing')

Loading file... Done.
Write parquet files... Done.


In [14]:
# Convert cleaned_csv/usage.csv into parquet files under parquet/usage.
write_csv_to_parquet('usage')

Loading file... Done.
Write parquet files... Done.


In [15]:
# Convert cleaned_csv/vehicle.csv into parquet files under parquet/vehicle.
write_csv_to_parquet('vehicle')

Loading file... Done.
Write parquet files... Done.
