In [43]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, spark_partition_id, count
from pyspark.sql.types import FloatType

In [3]:
# Create (or reuse) the notebook's SparkSession, named 'dataframe', with the
# bucketing setting explicitly set to True.
spark = (
    SparkSession.builder
    .appName('dataframe')
    .config('spark.sql.sources.bucketing.enabled', True)
    .getOrCreate()
)

In [4]:
# Actions trigger execution of the lazily built plan, for example:
# take
# collect
# show

In [5]:
# Root folder of the "Spark: The Definitive Guide" sample data.
# Set the SPARK_DATA_DIR environment variable to point at your own copy; the
# original author's absolute path is kept only as a fallback.
# os.path.join(..., '') guarantees the trailing separator that the later
# `datapath + 'relative/file'` concatenations rely on.
datapath = os.path.join(
    os.environ.get(
        'SPARK_DATA_DIR',
        '/home/amogh/Documents/spark_certification/Spark-The-Definitive-Guide-master/data/',
    ),
    '',
)

In [6]:
# Read the clustering Parquet part file from the data folder.
data = spark.read.parquet(
    datapath + 'clustering/part-r-00000-8891f92d-5542-4aec-a830-0d4ff6f5f871.gz.parquet'
)

In [7]:
# take(2) is an action: it runs the read and returns the rows as a list of Row
# objects. Two rows were requested, but only one appears in the output below.
data.take(2)

[Row(features=DenseVector([3.0, 10.1, 3.0]))]

In [8]:
# Print the schema tree; this dataset has a single vector column, `features`.
data.printSchema()

root
 |-- features: vector (nullable = true)



In [9]:
""" 
The collect action executes the pending transformations and returns their result
"""

' \nThe collect action executes the pending transformations and returns their result\n'

In [12]:
# limit(2) caps the result before collect() materialises it as a list of Row
# objects.
data.limit(2).collect()

[Row(features=DenseVector([3.0, 10.1, 3.0]))]

In [13]:
# show() is an action as well: it also triggers execution, and prints the rows



In [16]:
# Read the online-retail CSV (first line is the header) and keep a random
# sample with fraction 0.1.
# A fixed seed makes the sample, and every result derived from `df`, repeatable
# across kernel restarts; without one each run draws different rows, so the
# outputs shown below could never be reproduced.
df = (
    spark.read
    .option('header', True)
    .csv(datapath + 'retail-data/all/online-retail-dataset.csv')
    .sample(fraction=0.1, seed=42)
)

In [17]:
# Every column comes back as string: the CSV was read with only the header
# option, so no column types were supplied or inferred.
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)



In [18]:
# columns, rows and adding and renaming columns

In [21]:
# Project just the Description column (referenced by name) and fetch one row.
df.select('Description').take(1)

[Row(Description='RED COAT RACK PARIS FASHION')]

In [23]:
# Add a boolean `red_coat` column: True only where Description equals the
# red coat rack product name, False otherwise.
df.withColumn(
    'red_coat',
    when(col('Description') == 'RED COAT RACK PARIS FASHION', True).otherwise(False),
).take(3)

[Row(InvoiceNo='536368', StockCode='22913', Description='RED COAT RACK PARIS FASHION', Quantity='3', InvoiceDate='12/1/2010 8:34', UnitPrice='4.95', CustomerID='13047', Country='United Kingdom', red_coat=True),
 Row(InvoiceNo='536373', StockCode='20679', Description='EDWARDIAN PARASOL RED', Quantity='6', InvoiceDate='12/1/2010 9:02', UnitPrice='4.95', CustomerID='17850', Country='United Kingdom', red_coat=False),
 Row(InvoiceNo='536373', StockCode='37370', Description='RETRO COFFEE MUGS ASSORTED', Quantity='6', InvoiceDate='12/1/2010 9:02', UnitPrice='1.06', CustomerID='17850', Country='United Kingdom', red_coat=False)]

In [24]:
# Transformation

In [25]:
"""

    Selecting and filtering
    Casting into datatypes
    Repartitioning and Coalescing
    Aggregation


"""

'\n\n    Selecting and filtering\n    Casting into datatypes\n    Repartitioning and Coalescing\n    Aggregation\n\n\n'

In [33]:
# Select two columns by name; Quantity is still a string at this point.
df.select('InvoiceNo','Quantity').take(1)

[Row(InvoiceNo='536368', Quantity='3')]

In [36]:
# Cast the string Quantity column to a float while selecting it.
df.select(
    'InvoiceNo',
    col('Quantity').cast('float'),
).take(1)


[Row(InvoiceNo='536368', Quantity=3.0)]

In [37]:
# No action is called here, so the cell only displays the resulting DataFrame's
# column names and types rather than any rows (compare the take() cells above).
df.select('*').limit(3)

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: string, InvoiceDate: string, UnitPrice: string, CustomerID: string, Country: string]

In [38]:
# Repartition the sampled data by the Quantity column; the next cell shows the
# partition id each row was assigned.
df_repartition = df.repartition(col('Quantity'))

In [40]:
# Show each row's Quantity next to the id of the partition that holds it.
df_repartition.select(
    'Quantity',
    spark_partition_id().alias("pid"),
).show()

+--------+---+
|Quantity|pid|
+--------+---+
|      -4|  0|
|      -4|  0|
|      -4|  0|
|      -4|  0|
|      -4|  0|
|      -4|  0|
|      -4|  0|
|      -4|  0|
|      -4|  0|
|      -4|  0|
|      -4|  0|
|      -4|  0|
|      -4|  0|
|      -4|  0|
|      -4|  0|
|      -4|  0|
|      -4|  0|
|      -4|  0|
|      -4|  0|
|      -4|  0|
+--------+---+
only showing top 20 rows



In [41]:
# group by

In [44]:
# Count InvoiceNo values per CustomerID and return the first five groups in
# CustomerID order (the null CustomerID group comes first in the output).
(
    df.groupBy('CustomerID')
    .agg(count('InvoiceNo'))
    .sort('CustomerID')
    .take(5)
)

[Row(CustomerID=None, count(InvoiceNo)=13738),
 Row(CustomerID='12347', count(InvoiceNo)=24),
 Row(CustomerID='12348', count(InvoiceNo)=4),
 Row(CustomerID='12349', count(InvoiceNo)=10),
 Row(CustomerID='12350', count(InvoiceNo)=1)]