In [1]:
# Sample data: six colour names, distributed as an RDD via the SparkContext `sc`.
colors = [
    'white', 'green', 'yellow',
    'red', 'brown', 'pink',
]
color_rdd = sc.parallelize(colors)

In [2]:
# Pair every colour name with its character count: (name, len(name)).
keyval_rdd = color_rdd.map(lambda name: (name, len(name)))

In [3]:
# Turn the (name, length) pairs into a two-column DataFrame.
color_df = keyval_rdd.toDF(schema=['color', 'length'])

                                                                                

In [4]:
# Bare expression: displays the DataFrame's repr (column names and types), not its rows.
color_df

DataFrame[color: string, length: bigint]

In [5]:
# Print the schema as a tree: column name, type and nullability.
color_df.printSchema()

root
 |-- color: string (nullable = true)
 |-- length: long (nullable = true)



In [6]:
# Number of rows in the DataFrame.
color_df.count()

                                                                                

6

In [7]:
# Preview only the first two rows.
color_df.show(2)

+-----+------+
|color|length|
+-----+------+
|white|     5|
|green|     5|
+-----+------+
only showing top 2 rows



In [8]:
# Column names as a plain Python list.
color_df.columns

['color', 'length']

In [9]:
# Show the data without the 'length' column; the result is not assigned,
# so color_df itself keeps both columns.
color_df.drop('length').show()

+------+
| color|
+------+
| white|
| green|
|yellow|
|   red|
| brown|
|  pink|
+------+



In [10]:
# Colours whose length is between 4 and 5 (both bounds included, as the
# output shows), displayed under the column alias 'mid_length'.
(
    color_df
    .filter(color_df.length.between(4, 5))
    .select(color_df.color.alias('mid_length'))
    .show()
)

+----------+
|mid_length|
+----------+
|     white|
|     green|
|     brown|
|      pink|
+----------+



In [11]:
# Two chained filters: keep rows with length > 4, then drop "white".
# color_df[0] selects the first column by position, i.e. 'color'.
(
    color_df
    .filter(color_df.length > 4)
    .filter(color_df[0] != "white")
    .show()
)

+------+------+
| color|length|
+------+------+
| green|     5|
|yellow|     6|
| brown|     5|
+------+------+



In [12]:
# Same selection as two chained filters, expressed as a single predicate.
# Each comparison is parenthesised because `&` binds tighter than `>` / `!=`.
is_long = (color_df.length > 4)
not_white = (color_df[0] != "white")
color_df.filter(is_long & not_white).show()

+------+------+
| color|length|
+------+------+
| green|     5|
|yellow|     6|
| brown|     5|
+------+------+



In [13]:
# Rows with length >= 4, ordered by length and then colour;
# ascending=False applies to both sort keys.
(
    color_df
    .filter(color_df['length'] >= 4)
    .sort('length', 'color', ascending=False)
    .show()
)

+------+------+
| color|length|
+------+------+
|yellow|     6|
| white|     5|
| green|     5|
| brown|     5|
|  pink|     4|
+------+------+



In [14]:
# Order by length, then colour (both ascending) and bring the first four
# rows back to the driver as a list of Row objects.
color_df.orderBy('length','color').take(4)

[Row(color='red', length=3),
 Row(color='pink', length=4),
 Row(color='brown', length=5),
 Row(color='green', length=5)]

In [15]:
# Mixed sort directions: longest names first, ties broken alphabetically.
by_length_desc = color_df.length.desc()
by_color_asc = color_df.color.asc()
color_df.sort(by_length_desc, by_color_asc).show()

+------+------+
| color|length|
+------+------+
|yellow|     6|
| brown|     5|
| green|     5|
| white|     5|
|  pink|     4|
|   red|     3|
+------+------+



In [16]:
# How many colours share each name length. No ordering is requested,
# so the groups appear unsorted in the output.
color_df.groupBy('length').count().show()

+------+-----+
|length|count|
+------+-----+
|     6|    1|
|     5|    3|
|     3|    1|
|     4|    1|
+------+-----+



In [17]:
# Summary statistics (count, mean, stddev, min, max) for every column;
# mean/stddev are null for the string column 'color'.
color_df.describe().show()

+-------+------+------------------+
|summary| color|            length|
+-------+------+------------------+
|  count|     6|                 6|
|   mean|  null| 4.666666666666667|
| stddev|  null|1.0327955589886444|
|    min| brown|                 3|
|    max|yellow|                 6|
+-------+------+------------------+



# Max-min normalization: rescale `length` to [0, 1] via (x - min) / (max - min)

In [18]:
import pyspark.sql.functions as F

In [24]:
# One-row DataFrame holding the largest and smallest name length,
# in that column order: [max(length), min(length)].
df = color_df.agg(F.max(color_df.length), F.min(color_df.length))

In [26]:
# The aggregate has exactly one row: (max(length), min(length)).
# Collect it once and unpack -- every collect() call launches its own Spark
# job, so calling it per value ran the aggregation twice.
maxLen, minLen = df.collect()[0]
print(maxLen, minLen)

6 3


In [28]:
def maxMinNormalize(col):
    """Rescale a numeric column to [0, 1] as (x - minLen) / (maxLen - minLen).

    Built from native Column arithmetic rather than a Python UDF:
    ``F.udf`` without an explicit ``returnType`` yields a *string* column,
    a Python lambda raises TypeError on null inputs, and a UDF forces rows
    through the Python worker. The native expression returns a double,
    propagates nulls, and stays inside the JVM.

    Parameters
    ----------
    col : Column or str
        The column to normalize, or its name.

    Returns
    -------
    Column
        The normalized value as a double.

    Notes
    -----
    ``minLen`` and ``maxLen`` are read from the notebook namespace when this
    function is called. If they are equal, the divisor is zero and the result
    follows Spark's division-by-zero rules (null, or an error in ANSI mode).
    """
    if isinstance(col, str):
        col = F.col(col)
    return (col - minLen) / (maxLen - minLen)

In [32]:
(
    color_df
    .withColumn(
        'maxMinNormalized_legnth',
        maxMinNormalize(color_df.length)
    )
    .show()
)

+------+------+-----------------------+
| color|length|maxMinNormalized_legnth|
+------+------+-----------------------+
| white|     5|     0.6666666666666666|
| green|     5|     0.6666666666666666|
|yellow|     6|                    1.0|
|   red|     3|                    0.0|
| brown|     5|     0.6666666666666666|
|  pink|     4|     0.3333333333333333|
+------+------+-----------------------+

