## Boilerplate: Spark session

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructField, StructType,
                                IntegerType, StringType)

# Obtain the SparkSession for this notebook under the app name "friends_df_ex".
spark = (
    SparkSession.builder
    .appName("friends_df_ex")
    .getOrCreate()
)

### Boilerplate: schema

In [24]:
# Explicit schema for the friends data: one nullable StructField per column,
# built from (column name, Spark type class) pairs.
data_schema = [
    StructField(column_name, spark_type(), True)
    for column_name, spark_type in (
        ("ID", IntegerType),
        ("name", StringType),
        ("age", IntegerType),
        ("num_friends", IntegerType),
    )
]
# Wrap the field list in the StructType that the reader's .schema() accepts.
final_struct = StructType(data_schema)

In [25]:
# Load the friends CSV into a DataFrame using the explicit schema
# (final_struct), with the reader's "header" option set to "true".
people_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(final_struct)
    .load("file:///var/lib/spark/jupyter/data/fakefriends.csv")
)

### It's important to select only the relevant columns at the start for efficiency

In [45]:
# Average number of friends per age.
# Keep only the two columns the aggregation needs before grouping.
(
    people_df
    .select("age", "num_friends")
    .groupBy("age")
    .avg("num_friends")
    .show()
)

+---+------------------+
|age|  avg(num_friends)|
+---+------------------+
| 31|            267.25|
| 65|             298.2|
| 53|222.85714285714286|
| 34|             245.5|
| 28|             209.1|
| 26|242.05882352941177|
| 27|           228.125|
| 44| 282.1666666666667|
| 22|206.42857142857142|
| 47|233.22222222222223|
| 52| 340.6363636363636|
| 40| 250.8235294117647|
| 20|             165.0|
| 57| 258.8333333333333|
| 54| 278.0769230769231|
| 48|             281.4|
| 19|213.27272727272728|
| 64| 281.3333333333333|
| 41|268.55555555555554|
| 43|230.57142857142858|
+---+------------------+
only showing top 20 rows



### Combining multiple operations using `agg`

In [47]:
from pyspark.sql import functions as f
# Same per-age average as before, but expressed through agg() so the average
# can be rounded to 2 decimal places and given the name "friends_avg".
(
    people_df
    .select("age", "num_friends")
    .groupBy("age")
    .agg(f.round(f.avg("num_friends"), 2).alias("friends_avg"))
    .show()
)

+---+-----------+
|age|friends_avg|
+---+-----------+
| 31|     267.25|
| 65|      298.2|
| 53|     222.86|
| 34|      245.5|
| 28|      209.1|
| 26|     242.06|
| 27|     228.13|
| 44|     282.17|
| 22|     206.43|
| 47|     233.22|
| 52|     340.64|
| 40|     250.82|
| 20|      165.0|
| 57|     258.83|
| 54|     278.08|
| 48|      281.4|
| 19|     213.27|
| 64|     281.33|
| 41|     268.56|
| 43|     230.57|
+---+-----------+
only showing top 20 rows

