In [1]:
import findspark
# Run this before the pyspark imports in the next cell: per the findspark
# documentation, init() locates the Spark installation and adds pyspark to sys.path.
findspark.init()

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [3]:
# Reuse an already-running context/session when this cell is re-executed;
# calling SparkContext() a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
# The builder is the documented entry point; it attaches to the active context `sc`.
spark = SparkSession.builder.getOrCreate()

## DataFrame

In [4]:
# Load the CSV into a DataFrame: the first line supplies the column names and
# Spark infers each column's type from the data.
mtcars = spark.read.csv(
    '../../data/mtcars.csv',
    header=True,
    inferSchema=True,
    sep=',',
    encoding='UTF-8',
    comment=None,
)

In [5]:
# Preview the first five rows.
mtcars.show(5)

+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|              _c0| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|        Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
|    Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
|       Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|   Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|
|Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 5 rows



## DataFrame to RDD

In [6]:
# `.rdd` exposes the DataFrame as an RDD; each element is a Row keyed by column name.
mtcars.rdd.take(2)

[Row(_c0='Mazda RX4', mpg=21.0, cyl=6, disp=160.0, hp=110, drat=3.9, wt=2.62, qsec=16.46, vs=0, am=1, gear=4, carb=4),
 Row(_c0='Mazda RX4 Wag', mpg=21.0, cyl=6, disp=160.0, hp=110, drat=3.9, wt=2.875, qsec=17.02, vs=0, am=1, gear=4, carb=4)]

In [7]:
# Rows support lookup by column name, so project each one to a (model, mpg) tuple.
mtcars.rdd.map(lambda row: (row['_c0'], row['mpg'])).take(4)

[('Mazda RX4', 21.0),
 ('Mazda RX4 Wag', 21.0),
 ('Datsun 710', 22.8),
 ('Hornet 4 Drive', 21.4)]

## RDD to DataFrame

In [8]:
# Read the CSV as plain text: every RDD element is one unparsed line of the
# file, and the header line is included as the first element.
rdd_raw = sc.textFile("../../data/mtcars.csv")
rdd_raw.take(5)

[',mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb',
 'Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4',
 'Mazda RX4 Wag,21,6,160,110,3.9,2.875,17.02,0,1,4,4',
 'Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1',
 'Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1']

### Extract the header

In [14]:
# The header is the first line of the file, so fetch just that line instead of
# splitting, filtering and collecting the entire RDD (which also raised
# IndexError on any blank or comma-less line).
header = rdd_raw.first().split(',')

In [15]:
# The first header field is empty in the raw file, so give that column an explicit name.
header[0] = 'model'

In [16]:
# Display the column names after renaming the first field.
header

['model',
 'mpg',
 'cyl',
 'disp',
 'hp',
 'drat',
 'wt',
 'qsec',
 'vs',
 'am',
 'gear',
 'carb']

### Save the observations

In [17]:
# Split every line into fields and drop the header row. The length guard keeps
# blank or comma-less lines from raising IndexError on fields[1].
rdd = (rdd_raw
       .map(lambda line: line.split(','))
       .filter(lambda fields: len(fields) > 1 and fields[1] != 'mpg'))
rdd.take(2)

[['Mazda RX4',
  '21',
  '6',
  '160',
  '110',
  '3.9',
  '2.62',
  '16.46',
  '0',
  '1',
  '4',
  '4'],
 ['Mazda RX4 Wag',
  '21',
  '6',
  '160',
  '110',
  '3.9',
  '2.875',
  '17.02',
  '0',
  '1',
  '4',
  '4']]

### Convert RDD elements to Row objects

In [18]:
from pyspark.sql import Row

In [20]:
# Row takes its fields as keyword arguments, so a dict of name -> value can be
# unpacked straight into it.
field_names = ['a', 'b', 'c']
my_dict = {name: value for name, value in zip(field_names, range(1, 4))}
Row(**my_dict)

Row(a=1, b=2, c=3)

In [21]:
def list_to_row(keys, values):
    """Build a ``Row`` whose field names are ``keys`` and whose values are ``values``.

    Args:
        keys: Iterable of field names (strings).
        values: Iterable of field values, paired with ``keys`` by position.

    Returns:
        A ``pyspark.sql.Row`` with one field per key.

    Raises:
        ValueError: If ``keys`` and ``values`` differ in length. A plain ``zip``
            would silently drop the unmatched items and yield a Row with
            missing fields.

    Note:
        On Spark releases before 3.0, Rows built from keyword arguments order
        their fields alphabetically rather than in ``keys`` order.
    """
    keys = list(keys)
    values = list(values)
    if len(keys) != len(values):
        raise ValueError(
            'expected %d values for fields %r, got %d'
            % (len(keys), keys, len(values)))
    # Row takes its fields as keyword arguments, hence the dict unpacking.
    return Row(**dict(zip(keys, values)))

In [22]:
# Pair each record's fields with the header names to obtain an RDD of Row objects.
rdd_rows = rdd.map(lambda fields: list_to_row(header, fields))
rdd_rows.take(5)

[Row(am='1', carb='4', cyl='6', disp='160', drat='3.9', gear='4', hp='110', model='Mazda RX4', mpg='21', qsec='16.46', vs='0', wt='2.62'),
 Row(am='1', carb='4', cyl='6', disp='160', drat='3.9', gear='4', hp='110', model='Mazda RX4 Wag', mpg='21', qsec='17.02', vs='0', wt='2.875'),
 Row(am='1', carb='1', cyl='4', disp='108', drat='3.85', gear='4', hp='93', model='Datsun 710', mpg='22.8', qsec='18.61', vs='1', wt='2.32'),
 Row(am='0', carb='1', cyl='6', disp='258', drat='3.08', gear='3', hp='110', model='Hornet 4 Drive', mpg='21.4', qsec='19.44', vs='1', wt='3.215'),
 Row(am='0', carb='2', cyl='8', disp='360', drat='3.15', gear='3', hp='175', model='Hornet Sportabout', mpg='18.7', qsec='17.02', vs='0', wt='3.44')]

In [23]:
# Build a DataFrame from the Row RDD. The Row values are still the strings
# produced by str.split, so no numeric typing has been applied at this point.
df3 = spark.createDataFrame(rdd_rows)
df3.show(5)

+---+----+---+----+----+----+---+-----------------+----+-----+---+-----+
| am|carb|cyl|disp|drat|gear| hp|            model| mpg| qsec| vs|   wt|
+---+----+---+----+----+----+---+-----------------+----+-----+---+-----+
|  1|   4|  6| 160| 3.9|   4|110|        Mazda RX4|  21|16.46|  0| 2.62|
|  1|   4|  6| 160| 3.9|   4|110|    Mazda RX4 Wag|  21|17.02|  0|2.875|
|  1|   1|  4| 108|3.85|   4| 93|       Datsun 710|22.8|18.61|  1| 2.32|
|  0|   1|  6| 258|3.08|   3|110|   Hornet 4 Drive|21.4|19.44|  1|3.215|
|  0|   2|  8| 360|3.15|   3|175|Hornet Sportabout|18.7|17.02|  0| 3.44|
+---+----+---+----+----+----+---+-----------------+----+-----+---+-----+
only showing top 5 rows

