# Configuration

- https://towardsdatascience.com/how-to-get-started-with-pyspark-1adc142456ec

In [1]:
# Most straightforward method
import pyspark
import pandas as pd
from pyspark_common import *

spark = pyspark.sql.SparkSession.builder \
        .master('local') \
        .appName('Spark Datatype') \
        .getOrCreate()

# Import data

In [2]:
auto = spark.read.csv('auto-mpg.csv', header=True)

In [3]:
auto.printSchema()

root
 |-- mpg: string (nullable = true)
 |-- cyl: string (nullable = true)
 |-- displ: string (nullable = true)
 |-- hp: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- accel: string (nullable = true)
 |-- yr: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- name: string (nullable = true)
 |-- color: string (nullable = true)
 |-- size: string (nullable = true)
 |-- marker: string (nullable = true)



In [4]:
auto.limit(10).show()

+----+---+-----+---+------+-----+---+------+--------------------+-----+------------------+------+
| mpg|cyl|displ| hp|weight|accel| yr|origin|                name|color|              size|marker|
+----+---+-----+---+------+-----+---+------+--------------------+-----+------------------+------+
|18.0|  6|250.0| 88|  3139| 14.5| 71|    US|        ford mustang|  red|27.370336111111108|     o|
| 9.0|  8|304.0|193|  4732| 18.5| 70|    US|            hi 1200d|green|62.199511111111114|     o|
|36.1|  4| 91.0| 60|  1800| 16.4| 78|  Asia|    honda civic cvcc| blue|               9.0|     x|
|18.5|  6|250.0| 98|  3525| 19.0| 77|    US|        ford granada|  red|         34.515625|     o|
|34.3|  4| 97.0| 78|  2188| 15.8| 80|Europe|           audi 4000| blue|13.298177777777777|     s|
|32.9|  4|119.0|100|  2615| 14.8| 81|  Asia|        datsun 200sx| blue|18.995069444444447|     x|
|32.2|  4|108.0| 75|  2265| 15.2| 80|  Asia|      toyota corolla| blue|         14.250625|     x|
|22.0|  4|121.0| 76|

In [5]:
to_pandas(auto)

Unnamed: 0,mpg,cyl,displ,hp,weight,accel,yr,origin,name,color,size,marker
0,18.0,6,250.0,88,3139,14.5,71,US,ford mustang,red,27.370336111111108,o
1,9.0,8,304.0,193,4732,18.5,70,US,hi 1200d,green,62.19951111111112,o
2,36.1,4,91.0,60,1800,16.4,78,Asia,honda civic cvcc,blue,9.0,x
3,18.5,6,250.0,98,3525,19.0,77,US,ford granada,red,34.515625,o
4,34.3,4,97.0,78,2188,15.8,80,Europe,audi 4000,blue,13.298177777777775,s
5,32.9,4,119.0,100,2615,14.8,81,Asia,datsun 200sx,blue,18.995069444444447,x
6,32.2,4,108.0,75,2265,15.2,80,Asia,toyota corolla,blue,14.250625,x
7,22.0,4,121.0,76,2511,18.0,72,Europe,volkswagen 411 (sw),blue,17.514224999999996,s
8,15.0,8,302.0,130,4295,14.9,77,US,mercury cougar brougham,green,51.24173611111111,o
9,17.0,8,302.0,140,3449,10.5,70,US,ford torino,green,33.04333611111111,o


# Data type
## Convert Spark Dataframe to RDD

In [6]:
auto_rdd = auto.rdd
auto_rdd

MapPartitionsRDD[20] at javaToPython at NativeMethodAccessorImpl.java:0

In [7]:
auto_rdd.getNumPartitions()

1

In [8]:
type(auto_rdd)

pyspark.rdd.RDD

In [9]:
auto_rdd.count()

392

In [10]:
auto_rdd.collect()

[Row(mpg='18.0', cyl='6', displ='250.0', hp='88', weight='3139', accel='14.5', yr='71', origin='US', name='ford mustang', color='red', size='27.370336111111108', marker='o'),
 Row(mpg='9.0', cyl='8', displ='304.0', hp='193', weight='4732', accel='18.5', yr='70', origin='US', name='hi 1200d', color='green', size='62.199511111111114', marker='o'),
 Row(mpg='36.1', cyl='4', displ='91.0', hp='60', weight='1800', accel='16.4', yr='78', origin='Asia', name='honda civic cvcc', color='blue', size='9.0', marker='x'),
 Row(mpg='18.5', cyl='6', displ='250.0', hp='98', weight='3525', accel='19.0', yr='77', origin='US', name='ford granada', color='red', size='34.515625', marker='o'),
 Row(mpg='34.3', cyl='4', displ='97.0', hp='78', weight='2188', accel='15.8', yr='80', origin='Europe', name='audi 4000', color='blue', size='13.298177777777777', marker='s'),
 Row(mpg='32.9', cyl='4', displ='119.0', hp='100', weight='2615', accel='14.8', yr='81', origin='Asia', name='datsun 200sx', color='blue', size=

In [11]:
auto_rdd.take(10)

[Row(mpg='18.0', cyl='6', displ='250.0', hp='88', weight='3139', accel='14.5', yr='71', origin='US', name='ford mustang', color='red', size='27.370336111111108', marker='o'),
 Row(mpg='9.0', cyl='8', displ='304.0', hp='193', weight='4732', accel='18.5', yr='70', origin='US', name='hi 1200d', color='green', size='62.199511111111114', marker='o'),
 Row(mpg='36.1', cyl='4', displ='91.0', hp='60', weight='1800', accel='16.4', yr='78', origin='Asia', name='honda civic cvcc', color='blue', size='9.0', marker='x'),
 Row(mpg='18.5', cyl='6', displ='250.0', hp='98', weight='3525', accel='19.0', yr='77', origin='US', name='ford granada', color='red', size='34.515625', marker='o'),
 Row(mpg='34.3', cyl='4', displ='97.0', hp='78', weight='2188', accel='15.8', yr='80', origin='Europe', name='audi 4000', color='blue', size='13.298177777777777', marker='s'),
 Row(mpg='32.9', cyl='4', displ='119.0', hp='100', weight='2615', accel='14.8', yr='81', origin='Asia', name='datsun 200sx', color='blue', size=

In [12]:
auto_rdd.top(5)

[Row(mpg='9.0', cyl='8', displ='304.0', hp='193', weight='4732', accel='18.5', yr='70', origin='US', name='hi 1200d', color='green', size='62.199511111111114', marker='o'),
 Row(mpg='46.6', cyl='4', displ='86.0', hp='65', weight='2110', accel='17.9', yr='80', origin='Asia', name='mazda glc', color='blue', size='12.366944444444444', marker='x'),
 Row(mpg='44.6', cyl='4', displ='91.0', hp='67', weight='1850', accel='13.8', yr='80', origin='Asia', name='honda civic 1500 gl', color='blue', size='9.506944444444445', marker='x'),
 Row(mpg='44.3', cyl='4', displ='90.0', hp='48', weight='2085', accel='21.7', yr='80', origin='Europe', name='vw rabbit c (diesel)', color='blue', size='12.075625', marker='s'),
 Row(mpg='44.0', cyl='4', displ='97.0', hp='52', weight='2130', accel='24.6', yr='82', origin='Europe', name='vw pickup', color='blue', size='12.6025', marker='s')]

In [13]:
auto_rdd.countByKey()

defaultdict(int,
            {'18.0': 17,
             '9.0': 1,
             '36.1': 2,
             '18.5': 3,
             '34.3': 1,
             '32.9': 1,
             '32.2': 1,
             '22.0': 10,
             '15.0': 16,
             '17.0': 7,
             '44.0': 1,
             '24.5': 2,
             '32.0': 6,
             '14.0': 19,
             '13.0': 20,
             '36.0': 6,
             '31.0': 7,
             '21.5': 3,
             '19.0': 12,
             '16.0': 13,
             '23.0': 9,
             '26.0': 14,
             '24.0': 11,
             '21.0': 7,
             '31.3': 1,
             '32.7': 1,
             '17.6': 2,
             '28.0': 10,
             '18.1': 2,
             '29.0': 8,
             '35.1': 1,
             '16.5': 3,
             '29.9': 1,
             '27.2': 3,
             '32.1': 1,
             '12.0': 6,
             '25.0': 10,
             '28.4': 1,
             '30.9': 1,
             '20.0': 9,
             

In [14]:
auto_rdd.countByValue()

defaultdict(int,
            {Row(mpg='18.0', cyl='6', displ='250.0', hp='88', weight='3139', accel='14.5', yr='71', origin='US', name='ford mustang', color='red', size='27.370336111111108', marker='o'): 1,
             Row(mpg='9.0', cyl='8', displ='304.0', hp='193', weight='4732', accel='18.5', yr='70', origin='US', name='hi 1200d', color='green', size='62.199511111111114', marker='o'): 1,
             Row(mpg='36.1', cyl='4', displ='91.0', hp='60', weight='1800', accel='16.4', yr='78', origin='Asia', name='honda civic cvcc', color='blue', size='9.0', marker='x'): 1,
             Row(mpg='18.5', cyl='6', displ='250.0', hp='98', weight='3525', accel='19.0', yr='77', origin='US', name='ford granada', color='red', size='34.515625', marker='o'): 1,
             Row(mpg='34.3', cyl='4', displ='97.0', hp='78', weight='2188', accel='15.8', yr='80', origin='Europe', name='audi 4000', color='blue', size='13.298177777777777', marker='s'): 1,
             Row(mpg='32.9', cyl='4', displ='119.0',

In [15]:
auto_rdd.max()

Row(mpg='9.0', cyl='8', displ='304.0', hp='193', weight='4732', accel='18.5', yr='70', origin='US', name='hi 1200d', color='green', size='62.199511111111114', marker='o')

In [16]:
auto_rdd.min()

Row(mpg='10.0', cyl='8', displ='307.0', hp='200', weight='4376', accel='15.0', yr='70', origin='US', name='chevy c20', color='green', size='53.19271111111111', marker='o')

In [17]:
for i in auto_rdd.take(5):
    print(i)
    print(i['name'])
    print()

Row(mpg='18.0', cyl='6', displ='250.0', hp='88', weight='3139', accel='14.5', yr='71', origin='US', name='ford mustang', color='red', size='27.370336111111108', marker='o')
ford mustang

Row(mpg='9.0', cyl='8', displ='304.0', hp='193', weight='4732', accel='18.5', yr='70', origin='US', name='hi 1200d', color='green', size='62.199511111111114', marker='o')
hi 1200d

Row(mpg='36.1', cyl='4', displ='91.0', hp='60', weight='1800', accel='16.4', yr='78', origin='Asia', name='honda civic cvcc', color='blue', size='9.0', marker='x')
honda civic cvcc

Row(mpg='18.5', cyl='6', displ='250.0', hp='98', weight='3525', accel='19.0', yr='77', origin='US', name='ford granada', color='red', size='34.515625', marker='o')
ford granada

Row(mpg='34.3', cyl='4', displ='97.0', hp='78', weight='2188', accel='15.8', yr='80', origin='Europe', name='audi 4000', color='blue', size='13.298177777777777', marker='s')
audi 4000



In [18]:
auto_rdd.map(lambda x: x['name']).take(5)

['ford mustang', 'hi 1200d', 'honda civic cvcc', 'ford granada', 'audi 4000']

In [19]:
auto_rdd.flatMap(lambda x: x).take(15)

['18.0',
 '6',
 '250.0',
 '88',
 '3139',
 '14.5',
 '71',
 'US',
 'ford mustang',
 'red',
 '27.370336111111108',
 'o',
 '9.0',
 '8',
 '304.0']

## Dataframe sample will be List

In [20]:
auto_list = auto.take(1)
auto_list

[Row(mpg='18.0', cyl='6', displ='250.0', hp='88', weight='3139', accel='14.5', yr='71', origin='US', name='ford mustang', color='red', size='27.370336111111108', marker='o')]

In [21]:
type(auto_list)

list

## Convert List to Dataframe

In [22]:
auto_df = spark.sparkContext.parallelize(auto_list).toDF()
auto_df.show()

+----+---+-----+---+------+-----+---+------+------------+-----+------------------+------+
| mpg|cyl|displ| hp|weight|accel| yr|origin|        name|color|              size|marker|
+----+---+-----+---+------+-----+---+------+------------+-----+------------------+------+
|18.0|  6|250.0| 88|  3139| 14.5| 71|    US|ford mustang|  red|27.370336111111108|     o|
+----+---+-----+---+------+-----+---+------+------------+-----+------------------+------+



In [23]:
type(auto_df)

pyspark.sql.dataframe.DataFrame

## Convert List to RDD

In [24]:
auto_rdd = spark.sparkContext.parallelize(auto_list)
auto_rdd

ParallelCollectionRDD[44] at parallelize at PythonRDD.scala:195

In [25]:
auto_rdd.take(10)

[Row(mpg='18.0', cyl='6', displ='250.0', hp='88', weight='3139', accel='14.5', yr='71', origin='US', name='ford mustang', color='red', size='27.370336111111108', marker='o')]

In [26]:
type(auto_rdd)

pyspark.rdd.RDD

## Convert RDD to Dataframe

In [27]:
auto_df = auto_rdd.toDF()
auto_df.show()

+----+---+-----+---+------+-----+---+------+------------+-----+------------------+------+
| mpg|cyl|displ| hp|weight|accel| yr|origin|        name|color|              size|marker|
+----+---+-----+---+------+-----+---+------+------------+-----+------------------+------+
|18.0|  6|250.0| 88|  3139| 14.5| 71|    US|ford mustang|  red|27.370336111111108|     o|
+----+---+-----+---+------+-----+---+------+------------+-----+------------------+------+



In [28]:
type(auto_df)

pyspark.sql.dataframe.DataFrame

## Column

In [29]:
auto['marker']

Column<b'marker'>

In [30]:
auto['marker'].getField('0')

Column<b'marker[0]'>

In [31]:
auto['marker'].getItem(0)

Column<b'marker[0]'>