In [18]:
# findspark locates the local Spark installation and makes pyspark importable,
# so init() has to run before the pyspark imports below.
import findspark
findspark.init()

import pyspark as ps
from pyspark.sql import SparkSession

In [19]:
# Start (or reuse) a Spark session against the "local[4]" master and keep a
# handle on its SparkContext, which is the entry point for the RDD API.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("spark-intro")
    .getOrCreate()
)

sc = spark.sparkContext

In [20]:
%%writefile input.txt
hello world
another line
yet another line
yet another another line

Overwriting input.txt


In [21]:
# textFile() builds an RDD with one element per line of the file;
# collect() returns those elements as a plain list of strings.

(
    sc.textFile('input.txt')
    .collect()
)

['hello world', 'another line', 'yet another line', 'yet another another line']

In [22]:
# map() applies the function to every line, so each line becomes its own
# list of words; collect() then yields a list of lists of strings.

(
    sc.textFile('input.txt')
    .map(lambda line: line.split())
    .collect()
)

[['hello', 'world'],
 ['another', 'line'],
 ['yet', 'another', 'line'],
 ['yet', 'another', 'another', 'line']]

In [23]:
# Same pipeline as the chained version, but with every intermediate RDD
# bound to a name: `rdd` holds the lines, `rdd2` the per-line word lists.

rdd = sc.textFile('input.txt')
rdd2 = rdd.map(
    lambda line: line.split()
)
rdd2.collect()

[['hello', 'world'],
 ['another', 'line'],
 ['yet', 'another', 'line'],
 ['yet', 'another', 'another', 'line']]

In [24]:
# flatMap() splits every line like map() does, but concatenates the pieces,
# giving one flat sequence of words instead of one list per line.
# Despite its name, flatmapRDD holds the collected Python list, not an RDD.

flatmapRDD = (
    sc.textFile('input.txt')
    .flatMap(lambda line: line.split())
    .collect()
)
flatmapRDD

['hello',
 'world',
 'another',
 'line',
 'yet',
 'another',
 'line',
 'yet',
 'another',
 'another',
 'line']

In [25]:
%%writefile sales.txt
#ID    Date           Store   State  Product    Amount
101    11/13/2014     100     WA     331        300.00
104    11/18/2014     700     OR     329        450.00
102    11/15/2014     203     CA     321        200.00
106    11/19/2014     202     CA     331        330.00
103    11/17/2014     101     WA     373        750.00
105    11/19/2014     202     CA     321        200.00

Overwriting sales.txt


In [26]:
# top(n) returns the n LARGEST elements in descending order (here the lines are
# compared as plain strings) -- it does not return the first n lines of the
# file; use take(n) for that.
sc.textFile('sales.txt').top(2)

['106    11/19/2014     202     CA     331        330.00',
 '105    11/19/2014     202     CA     321        200.00']

In [27]:
# take(n) returns the first n elements in file order as a list of strings,
# so the '#'-prefixed header row is part of the result.
(
    sc.textFile('sales.txt')
    .take(4)
)

['#ID    Date           Store   State  Product    Amount',
 '101    11/13/2014     100     WA     331        300.00',
 '104    11/18/2014     700     OR     329        450.00',
 '102    11/15/2014     203     CA     321        200.00']

In [28]:
# Each element is still one unsplit string holding the whole row;
# index 1 is the first data row (index 0 is the header).
(
    sc.textFile('sales.txt')
    .take(4)[1]
)

'101    11/13/2014     100     WA     331        300.00'

In [29]:
# Split every row on whitespace first, so each element becomes a list of
# column values (a list of lists of strings after take()).
(
    sc.textFile('sales.txt')
    .map(lambda line: line.split())
    .take(3)
)

[['#ID', 'Date', 'Store', 'State', 'Product', 'Amount'],
 ['101', '11/13/2014', '100', 'WA', '331', '300.00'],
 ['104', '11/18/2014', '700', 'OR', '329', '450.00']]

In [30]:
# Total of the Amount column: split each line into fields, skip the
# '#'-prefixed header row, convert the last field to float and add them up.

(
    sc.textFile('sales.txt')
    .map(lambda line: line.split())
    .filter(lambda fields: not fields[0].startswith('#'))
    .map(lambda fields: float(fields[-1]))
    .sum()
)

2230.0

In [31]:
# Slice out columns with map(): the lambda picks State (third field from the
# end) and Amount (last field, as float) from every data row, and collect()
# returns the resulting (State, Amount) tuples as a list.

(
    sc.textFile('sales.txt')
    .map(lambda line: line.split())
    .filter(lambda fields: not fields[0].startswith('#'))
    .map(lambda fields: (fields[-3], float(fields[-1])))
    .collect()
)

[('WA', 300.0),
 ('OR', 450.0),
 ('CA', 200.0),
 ('CA', 330.0),
 ('WA', 750.0),
 ('CA', 200.0)]

In [32]:
# (State, Amount) pair RDD built from sales.txt with the '#'-prefixed header
# row dropped. Note: this rebinds the notebook-level name `rdd`.
rdd = (
    sc.textFile('sales.txt')
    .map(lambda line: line.split())
    .filter(lambda fields: not fields[0].startswith('#'))
    .map(lambda fields: (fields[-3], float(fields[-1])))
)

# Amount of the first record. first() fetches only that one element, whereas
# collect()[0] would pull the entire RDD into the driver just to index into it.
rdd.first()[1]

300.0

In [33]:
# reduceByKey() aggregates a pair RDD per key:
#   1. read sales.txt as an RDD of lines and split each line into fields
#   2. drop the '#'-prefixed header row
#   3. keep only (State, Amount) tuples -- the first tuple item is the key
#   4. merge the amounts that share a key pairwise with the given function

(
    sc.textFile('sales.txt')
    .map(lambda line: line.split())
    .filter(lambda fields: not fields[0].startswith('#'))
    .map(lambda fields: (fields[-3], float(fields[-1])))
    .reduceByKey(lambda total, amount: total + amount)
    .collect()
)

[('CA', 730.0), ('WA', 1050.0), ('OR', 450.0)]

In [38]:
# Rank states by total revenue: sum the amounts per state, then sort the
# (State, total) pairs by total in descending order, so the state with the
# highest revenue is the first entry of the collected list.

(
    sc.textFile('sales.txt')
    .map(lambda line: line.split())
    .filter(lambda fields: not fields[0].startswith('#'))
    .map(lambda fields: (fields[-3], float(fields[-1])))
    .reduceByKey(lambda total, amount: total + amount)
    .sortBy(lambda pair: pair[1], ascending=False)
    .collect()
)

[('WA', 1050.0), ('CA', 730.0), ('OR', 450.0)]

In [43]:
# Word count, step 1: flatten the file into a single list of words.
(
    sc.textFile('input.txt')
    .flatMap(lambda line: line.split())
    .collect()
)

['hello',
 'world',
 'another',
 'line',
 'yet',
 'another',
 'line',
 'yet',
 'another',
 'another',
 'line']

In [44]:
# Word count, step 2: pair every word with the count 1 so the words can act
# as keys in a later per-key reduction.
(
    sc.textFile('input.txt')
    .flatMap(lambda line: line.split())
    .map(lambda token: (token, 1))
    .collect()
)

[('hello', 1),
 ('world', 1),
 ('another', 1),
 ('line', 1),
 ('yet', 1),
 ('another', 1),
 ('line', 1),
 ('yet', 1),
 ('another', 1),
 ('another', 1),
 ('line', 1)]

In [46]:
# Word count, step 3: add up the 1s per word with reduceByKey() to get
# (word, occurrences) pairs.
(
    sc.textFile('input.txt')
    .flatMap(lambda line: line.split())
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

[('world', 1), ('line', 3), ('yet', 2), ('hello', 1), ('another', 4)]