In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Create (or reuse) the Spark session for this notebook: local master with
# 4 worker threads, application name "lesson_6".
# NOTE(review): with a local master the spark.executor.* settings below
# probably have no effect — confirm before relying on them.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("lesson_6")
    .config("spark.executor.instances", 2)
    # int(2000/4.4) evaluates to 454, so the value passed is "454mb".
    .config("spark.executor.memory", f'{int(2000/4.4)}mb')
    .config("spark.executor.cores", 2)
    .getOrCreate()
)

In [31]:
# Build the demo DataFrame of (group, word) pairs.
# Duplicates inside a group ('spark', 'look', 'pyspark') are intentional:
# the tasks below are about extracting the unique words per group.
words = spark.createDataFrame(
    [
        (group_id, word)
        for group_id, group_words in (
            (1, ['look', 'spark', 'tutorial', 'spark', 'look', 'python', 'geek']),
            (2, ['brain', 'homework', 'lesson', 'wheel', 'pyspark', 'session', 'pyspark']),
        )
        for word in group_words
    ],
    schema=['group', 'word'],
)
words.show()

+-----+--------+
|group|    word|
+-----+--------+
|    1|    look|
|    1|   spark|
|    1|tutorial|
|    1|   spark|
|    1|    look|
|    1|  python|
|    1|    geek|
|    2|   brain|
|    2|homework|
|    2|  lesson|
|    2|   wheel|
|    2| pyspark|
|    2| session|
|    2| pyspark|
+-----+--------+



#### Задание 1
Выведите все уникальные слова для каждой группы (используйте collect_set).

In [32]:
# Task 1: gather the distinct words of every group into one array column.
# collect_set drops duplicates; the order of elements in the array is not
# guaranteed.
collect = (
    words
    .groupBy("group")
    .agg(F.collect_set(F.col("word")).alias('uniq_words'))
)

collect.show()

+-----+--------------------+
|group|          uniq_words|
+-----+--------------------+
|    1|[tutorial, geek, ...|
|    2|[homework, wheel,...|
+-----+--------------------+



In [33]:
# Pull the untruncated set of unique words for group 1 to the driver.
collect.where(collect["group"] == 1).collect()

[Row(group=1, uniq_words=['tutorial', 'geek', 'spark', 'look', 'python'])]

In [34]:
# Pull the untruncated set of unique words for group 2 to the driver.
collect.where(collect["group"] == 2).collect()

[Row(group=2, uniq_words=['homework', 'wheel', 'session', 'brain', 'lesson', 'pyspark'])]

#### Задание 2
Выведите все уникальные слова для каждой группы (используйте pandas_udf: pyspark.sql.GroupedData.applyInPandas).

In [84]:
from pyspark.sql.types import StringType, ArrayType, StructType, LongType, StructField
import pandas as pd 

def uniq_(key, df_):
    """Collapse one group's rows into a single row of its unique words.

    Args:
        key: Tuple holding the grouping key value(s) of the current group.
        df_: pandas DataFrame with the rows of that group; must expose a
            ``word`` column.

    Returns:
        A one-row pandas DataFrame with default integer column labels: the
        key value(s) first, followed by a list of the distinct words in
        order of first appearance.
    """
    distinct_words = list(pd.unique(df_["word"]))
    # Append the list as a single cell after the key value(s).
    row = (*key, distinct_words)
    return pd.DataFrame([row])

# Output schema of the grouped pandas function: the group id plus an array
# of strings. uniq_ returns integer-labelled columns, so the fields are
# matched to the returned frame by position.
schema = (
    StructType()
    .add("group", LongType(), True)
    .add("word", ArrayType(StringType()), True)
)

# Task 2: run uniq_ once per group and assemble the per-group rows.
result = words.groupBy("group").applyInPandas(uniq_, schema=schema)
result.show()

+-----+--------------------+
|group|                word|
+-----+--------------------+
|    1|[look, spark, tut...|
|    2|[brain, homework,...|
+-----+--------------------+



In [85]:
# Pull the untruncated list of unique words for group 1 to the driver.
result.where(result["group"] == 1).collect()

[Row(group=1, word=['look', 'spark', 'tutorial', 'python', 'geek'])]

In [86]:
# Pull the untruncated list of unique words for group 2 to the driver.
result.where(result["group"] == 2).collect()

[Row(group=2, word=['brain', 'homework', 'lesson', 'wheel', 'pyspark', 'session'])]

#### Задание 3
Вы собрали уникальные слова для каждой группы. Теперь на основе полученной таблицы посчитайте количество слов. Есть несколько способов, один из них — Accumulator.

In [83]:
# Count how many times each word occurs in the source table.
# NOTE(review): the task statement asks to count words based on the table of
# unique words, while this cell aggregates the original `words` DataFrame —
# confirm which of the two is intended.
(
    words
    .groupBy("word")
    .count()
    .show()
)

+--------+-----+
|    word|count|
+--------+-----+
| session|    1|
|  lesson|    1|
|tutorial|    1|
|   wheel|    1|
|   spark|    2|
| pyspark|    2|
|homework|    1|
|    look|    2|
|    geek|    1|
|  python|    1|
|   brain|    1|
+--------+-----+

