In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=0e5062a432bee26003ba19eefb0f241e0af17b2a185c565cde163bbcfc97283f
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
import pyspark as ps

sc = ps.SparkContext.getOrCreate()

In [3]:
# Creating RDDs
numbers = list(range(15))
rdd = sc.parallelize(numbers)

print("List Count: ", rdd.count())
print("List Max: ", rdd.max())
print("List Min: ", rdd.min())
print("List Mean: ", rdd.mean())
print("\n")
# Squaring Original RDD values
rdd2 = rdd.map(lambda x : x**2 )
print("Squared List Count: ", rdd2.count())
print("Squared List Max: ", rdd2.max())
print("Squared List Min: ", rdd2.min())
print("Squared List Mean: ", rdd2.mean())
print("\n")
# Creating RDD with even values
even = rdd2.filter(lambda x : x % 2 == 0)
print("Even List Count: ", even.count())
print("Even List Max: ", even.max())
print("Even List Min: ", even.min())
print("Even List Mean: ", even.mean())


List Count:  15
List Max:  14
List Min:  0
List Mean:  7.0


Squared List Count:  15
Squared List Max:  196
Squared List Min:  0
Squared List Mean:  67.66666666666667


Even List Count:  8
Even List Max:  196
Even List Min:  0
Even List Mean:  70.0


In [9]:
# Dataframes

from pyspark.sql import SparkSession

# Creating Session Object
spark = SparkSession \
  .builder \
  .appName("Pyspark_Example") \
  .config("spark.some.config.option", "some-value") \
  .getOrCreate()

In [10]:
spark

In [15]:
# Reading csv with headers

accounts = spark.read.option("header", 'true').csv('Ecommerce_data.csv')

In [16]:
accounts

DataFrame[Text: string, label: string]

In [18]:
# Looking at schema

accounts.printSchema()

root
 |-- Text: string (nullable = true)
 |-- label: string (nullable = true)



In [19]:
# Number of Records

accounts.count()

24021

In [28]:
# Reading JSON file

anscombe = spark.read.option("header", 'true').json('./sample_data/anscombe.json')

In [29]:
print(anscombe.printSchema())

anscombe

root
 |-- Series: string (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- _corrupt_record: string (nullable = true)

None


DataFrame[Series: string, X: double, Y: double, _corrupt_record: string]

In [30]:
anscombe.columns

['Series', 'X', 'Y', '_corrupt_record']

In [31]:
# Group by Series
anscombe_series = anscombe.groupby('Series').sum()

In [32]:
anscombe_series

DataFrame[Series: string, sum(X): double, sum(Y): double]

In [33]:
# Joining sum dataset with original dataset

with_sum = anscombe.join(anscombe_series, 'Series', 'inner')

In [34]:
with_sum

DataFrame[Series: string, X: double, Y: double, _corrupt_record: string, sum(X): double, sum(Y): double]

In [38]:
with_sum.show(5)

+------+----+----+---------------+------+------+
|Series|   X|   Y|_corrupt_record|sum(X)|sum(Y)|
+------+----+----+---------------+------+------+
|     I|10.0|8.04|           NULL|  99.0|  82.5|
|     I| 8.0|6.95|           NULL|  99.0|  82.5|
|     I|13.0|7.58|           NULL|  99.0|  82.5|
|     I| 9.0|8.81|           NULL|  99.0|  82.5|
|     I|11.0|8.33|           NULL|  99.0|  82.5|
+------+----+----+---------------+------+------+
only showing top 5 rows



In [39]:
# applying filter method

series_I = with_sum.filter(with_sum.Series == 'I')

In [40]:
series_I.count()

11

In [42]:
series_I.show()

+------+----+-----+---------------+------+------+
|Series|   X|    Y|_corrupt_record|sum(X)|sum(Y)|
+------+----+-----+---------------+------+------+
|     I|10.0| 8.04|           NULL|  99.0|  82.5|
|     I| 8.0| 6.95|           NULL|  99.0|  82.5|
|     I|13.0| 7.58|           NULL|  99.0|  82.5|
|     I| 9.0| 8.81|           NULL|  99.0|  82.5|
|     I|11.0| 8.33|           NULL|  99.0|  82.5|
|     I|14.0| 9.96|           NULL|  99.0|  82.5|
|     I| 6.0| 7.24|           NULL|  99.0|  82.5|
|     I| 4.0| 4.26|           NULL|  99.0|  82.5|
|     I|12.0|10.84|           NULL|  99.0|  82.5|
|     I| 7.0| 4.81|           NULL|  99.0|  82.5|
|     I| 5.0| 5.68|           NULL|  99.0|  82.5|
+------+----+-----+---------------+------+------+

