In [4]:
# Display the active SparkContext.
# NOTE(review): `sc` is not created in any visible cell — presumably it is injected
# by the PySpark kernel/shell; confirm this before relying on Restart & Run All.
sc

In [6]:
# Start from an ordinary in-memory Python list of color names.
colors = [
    'white',
    'green',
    'yellow',
    'red',
    'brown',
    'pink',
]

In [7]:
# Number of entries in the local Python list.
len(colors)

6

In [9]:
# Distribute the local collection to be an RDD; the data is stored on the workers.
color_rdd = sc.parallelize(colors)

In [10]:
# Check what kind of object parallelize() returned.
type(color_rdd)

pyspark.rdd.RDD

In [11]:
# collect() is an action: it returns all elements of the RDD to the driver as a Python list.
color_rdd.collect()

                                                                                

['white', 'green', 'yellow', 'red', 'brown', 'pink']

In [12]:
# Pair every color name with its character count, giving a new RDD of
# (color, length) tuples.
keyval_rdd = color_rdd.map(lambda color: (color, len(color)))

In [13]:
# Bring the (color, length) tuples back to the driver to inspect them.
keyval_rdd.collect()

                                                                                

[('white', 5),
 ('green', 5),
 ('yellow', 6),
 ('red', 3),
 ('brown', 5),
 ('pink', 4)]

In [15]:
# Count the elements with the count() action, which returns only the total to the
# driver instead of materializing every element there just to measure the list.
keyval_rdd.count()

6

In [16]:
# Inspect the Python type of one element; first() fetches a single element rather
# than collecting the whole RDD to the driver.
type(keyval_rdd.first())

tuple

In [18]:
# Convert the RDD of tuples to a DataFrame, naming the two columns, and print it.
keyval_rdd.toDF(['Color','Length']).show()

[Stage 6:>                                                          (0 + 1) / 1]

+------+------+
| Color|Length|
+------+------+
| white|     5|
| green|     5|
|yellow|     6|
|   red|     3|
| brown|     5|
|  pink|     4|
+------+------+



                                                                                

In [19]:
# Keep the DataFrame in a variable so the following cells can reuse it, then print it.
color_df = keyval_rdd.toDF(['Color','Length'])
color_df.show()

+------+------+
| Color|Length|
+------+------+
| white|     5|
| green|     5|
|yellow|     6|
|   red|     3|
| brown|     5|
|  pink|     4|
+------+------+



In [20]:
# Check the schema: column names, data types, and nullability.
color_df.printSchema()

root
 |-- Color: string (nullable = true)
 |-- Length: long (nullable = true)



In [21]:
# Check the row count.
color_df.count()

6

In [22]:
# Look at the table contents. Passing a number to show() limits how many rows are displayed.
color_df.show(2)

+-----+------+
|Color|Length|
+-----+------+
|white|     5|
|green|     5|
+-----+------+
only showing top 2 rows



In [25]:
# List the column names.
color_df.columns

['Color', 'Length']

In [24]:
# Drop a column. drop() returns a new DataFrame; the source DataFrame color_df remains the same.
color_df.drop('Length').show() 

+------+
| Color|
+------+
| white|
| green|
|yellow|
|   red|
| brown|
|  pink|
+------+



In [30]:
# Keep only the colors whose length is 4 or 5 (between() includes both bounds)
# and show them under the column alias "mid_length".
(
    color_df
    .filter(color_df.Length.between(4, 5))
    .select(color_df['Color'].alias("mid_length"))
    .show()
)

+----------+
|mid_length|
+----------+
|     white|
|     green|
|     brown|
|      pink|
+----------+



In [36]:
# Multiple filter criteria expressed as two chained filter() calls (filter applied twice).
# color_df[0] picks the first column by position, i.e. Column<'Color'>.
(
    color_df
    .filter(color_df['Length'] > 4)
    .filter(color_df[0] != "white")
    .show()
)

+------+------+
| Color|Length|
+------+------+
| green|     5|
|yellow|     6|
| brown|     5|
+------+------+



In [37]:
# Filter applied twice again, this time looking the Color column up by name;
# color_df['Color'] refers to the same column as color_df[0], i.e. Column<'Color'>.
(
    color_df
    .filter(color_df['Length'] > 4)
    .filter(color_df['Color'] != "white")
    .show()
)

+------+------+
| Color|Length|
+------+------+
| green|     5|
|yellow|     6|
| brown|     5|
+------+------+



In [38]:
# Another style that gives the same result as filtering twice: combine both
# conditions in a single filter() call with the & operator.
color_df.filter(
    (color_df['Length'] > 4) & (color_df[0] != "white")
).show()

+------+------+
| Color|Length|
+------+------+
| green|     5|
|yellow|     6|
| brown|     5|
+------+------+



In [49]:
# Keep rows with Length >= 4, then sort by Length and by Color, both in
# descending order (largest to smallest).
(
    color_df
    .filter(color_df.Length >= 4)
    .sort("Length", 'Color', ascending=False)
    .show()
)

+------+------+
| Color|Length|
+------+------+
|yellow|     6|
| white|     5|
| green|     5|
| brown|     5|
|  pink|     4|
+------+------+



In [57]:
# orderBy can be used in place of sort; here each column expression carries its
# own direction (both descending).
(
    color_df
    .filter(color_df.Length >= 4)
    .orderBy(color_df['Length'].desc(), color_df['Color'].desc())
    .show()
)

+------+------+
| Color|Length|
+------+------+
|yellow|     6|
| white|     5|
| green|     5|
| brown|     5|
|  pink|     4|
+------+------+



In [55]:
# Show only the Color column, ordered by Length (descending) and then by Color
# (ascending). The sort is applied before the projection so that orderBy only
# refers to columns still present in the DataFrame it is called on, instead of
# relying on Spark to resolve a column that select() has already removed.
(
    color_df
    .filter(color_df['Length'] >= 4)
    .orderBy(color_df.Length.desc(), color_df.Color.asc())
    .select(color_df.Color)
    .show()
)

+------+
| Color|
+------+
|yellow|
| brown|
| green|
| white|
|  pink|
+------+



In [54]:
# Alternative syntax: pass Column sort expressions to sort(); works for a single
# column or for several.
color_df.sort(
    color_df['Length'].desc(),
    color_df['Color'].asc(),
).show()

+------+------+
| Color|Length|
+------+------+
|yellow|     6|
| brown|     5|
| green|     5|
| white|     5|
|  pink|     4|
|   red|     3|
+------+------+



In [60]:
# GroupBy: count how many rows share each Length value.
color_df.groupBy('Length').count().show()

+------+-----+
|Length|count|
+------+-----+
|     6|    1|
|     5|    3|
|     3|    1|
|     4|    1|
+------+-----+



In [64]:
# Same grouped count after explicitly selecting the columns first; the result
# matches grouping color_df directly.
color_df.select('Length','Color').groupBy('Length').count().show()

+------+-----+
|Length|count|
+------+-----+
|     6|    1|
|     5|    3|
|     3|    1|
|     4|    1|
+------+-----+



In [65]:
# Basic statistics: describe() reports count, mean, stddev, min and max per column.
color_df.describe().show()



+-------+------+------------------+
|summary| Color|            Length|
+-------+------+------------------+
|  count|     6|                 6|
|   mean|  null| 4.666666666666667|
| stddev|  null|1.0327955589886444|
|    min| brown|                 3|
|    max|yellow|                 6|
+-------+------+------------------+



                                                                                