In [1]:
from pyspark.sql import Row, SparkSession

**Creating the Spark Session (to work with Spark)**

In [2]:
# Build (or reuse) a local Spark session named 'appSparkSql'.
spSession = (
    SparkSession.builder
    .master('local')
    .appName('appSparkSql')
    .getOrCreate()
)

**Loading the CSV file into an RDD and caching it in memory, for better performance**

In [3]:
# `sc` is only predefined when the notebook runs on a PySpark shell/kernel.
# Derive it from the session created above so the notebook also survives
# "Restart & Run All" on a plain Python kernel.
sc = spSession.sparkContext

# Read the CSV as an RDD of raw text lines (header line included).
rddCarsCsv = sc.textFile('aux/datasets/cars.csv')
rddCarsCsv.count()

198

In [4]:
# Mark the raw-lines RDD for caching so that later actions can reuse it.
# Note: this runs after the count() in the previous cell.
rddCarsCsv.cache()

aux/datasets/cars.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

**Removing the header**

In [5]:
# The first line of the file is treated as the header.
header = rddCarsCsv.first()

# Keep every line that differs from the header (any line equal to it is dropped).
rddCars = rddCarsCsv.filter(lambda row: row != header)

# Number of remaining data lines.
rddCars.count()

197

In [6]:
def toNumber(line):
    """Convert one comma-separated text line into a numeric ``Row``.

    Fields are read by position after splitting on ','.

    Args:
        line: A single comma-separated line of text.

    Returns:
        A ``Row`` with the float fields DOORS, BODY, HP, RPM and MPG:
          * DOORS - 1.0 when field 3 equals 'two', otherwise 2.0
          * BODY  - 1.0 when field 4 equals 'sedan', otherwise 2.0
          * HP, RPM, MPG - fields 7, 8 and 9 parsed with ``float``
    """
    fields = line.split(',')

    # Encode the two categorical fields as 1.0 / 2.0.
    if fields[3] == 'two':
        doorsCode = 1.0
    else:
        doorsCode = 2.0

    if fields[4] == 'sedan':
        bodyCode = 1.0
    else:
        bodyCode = 2.0

    # Parse the numeric fields in positional order.
    horsePower = float(fields[7])
    rpm = float(fields[8])
    mpg = float(fields[9])

    return Row(DOORS=doorsCode, BODY=bodyCode, HP=horsePower, RPM=rpm, MPG=mpg)

**Applying the function to the data**

In [7]:
# Apply toNumber to every data line, producing an RDD of Row objects.
carsMap = rddCars.map(toNumber)

**Persisting the result in memory, for better performance**

In [8]:
# Mark the mapped RDD for persistence so that later actions can reuse it.
carsMap.persist()

PythonRDD[5] at RDD at PythonRDD.scala:53

**Listing the results**

In [9]:
# Return every Row as a Python list for display.
# NOTE(review): acceptable for this small data set (197 data lines counted above);
# for larger data prefer a bounded preview such as take(n).
carsMap.collect()

[Row(DOORS=1.0, BODY=2.0, HP=69.0, RPM=4900.0, MPG=31.0),
 Row(DOORS=1.0, BODY=2.0, HP=48.0, RPM=5100.0, MPG=47.0),
 Row(DOORS=1.0, BODY=2.0, HP=68.0, RPM=5000.0, MPG=30.0),
 Row(DOORS=1.0, BODY=2.0, HP=62.0, RPM=4800.0, MPG=35.0),
 Row(DOORS=1.0, BODY=2.0, HP=68.0, RPM=5500.0, MPG=37.0),
 Row(DOORS=1.0, BODY=2.0, HP=60.0, RPM=5500.0, MPG=38.0),
 Row(DOORS=1.0, BODY=1.0, HP=69.0, RPM=5200.0, MPG=31.0),
 Row(DOORS=1.0, BODY=2.0, HP=68.0, RPM=5500.0, MPG=37.0),
 Row(DOORS=1.0, BODY=2.0, HP=68.0, RPM=5500.0, MPG=37.0),
 Row(DOORS=1.0, BODY=2.0, HP=68.0, RPM=5000.0, MPG=31.0),
 Row(DOORS=1.0, BODY=2.0, HP=68.0, RPM=5500.0, MPG=31.0),
 Row(DOORS=2.0, BODY=2.0, HP=68.0, RPM=5500.0, MPG=31.0),
 Row(DOORS=2.0, BODY=2.0, HP=68.0, RPM=5500.0, MPG=31.0),
 Row(DOORS=1.0, BODY=2.0, HP=70.0, RPM=5400.0, MPG=38.0),
 Row(DOORS=1.0, BODY=2.0, HP=62.0, RPM=4800.0, MPG=31.0),
 Row(DOORS=1.0, BODY=2.0, HP=68.0, RPM=5500.0, MPG=31.0),
 Row(DOORS=1.0, BODY=2.0, HP=58.0, RPM=4800.0, MPG=49.0),
 Row(DOORS=2.0