## Inizializzazione di Spark

In [1]:
from pyspark.sql import SparkSession

# Obtain the session named "basic", reusing an already running one if present.
spark = (
    SparkSession.builder
    .appName("basic")
    .getOrCreate()
)

In [2]:
# Sample rows, one tuple per person: (name, gender, age, height, weight).
# NOTE(review): height looks like centimetres (it is divided by 100 further
# down before the BMI is computed) and weight like kilograms - confirm.
data = [
    ("Giuseppe", "M", 23, 174, 70.5),
    ("Antonio", "M", 25, 179, 68.0),
    ("Lorenzo", "M", 33, 172, 88.5),
    ("Luisa", "F", 48, 155, 50.2),
    ("Margheria", "F", 35, 165, 54.3),
]

# Build the DataFrame from the row tuples; only column names are given here,
# so the column types are inferred from the Python values.
df = spark.createDataFrame(
    data,
    schema=["name", "gender", "age", "height", "weight"],
)

In [3]:
# Print the rows of df as a text table.
df.show()

+---------+------+---+------+------+
|     name|gender|age|height|weight|
+---------+------+---+------+------+
| Giuseppe|     M| 23|   174|  70.5|
|  Antonio|     M| 25|   179|  68.0|
|  Lorenzo|     M| 33|   172|  88.5|
|    Luisa|     F| 48|   155|  50.2|
|Margheria|     F| 35|   165|  54.3|
+---------+------+---+------+------+



In [4]:
# Number of rows in df.
df.count()

5

In [5]:
# Column names of df, returned as a plain Python list of strings.
df.columns

['name', 'gender', 'age', 'height', 'weight']

In [6]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: long (nullable = true)
 |-- height: long (nullable = true)
 |-- weight: double (nullable = true)



In [7]:
# describe() builds summary statistics for df; the type() call shows that the
# result is itself a DataFrame, so it has to be displayed with show().
df_desc = df.describe()
type(df_desc)

pyspark.sql.dataframe.DataFrame

In [8]:
# Display the summary statistics: count, mean, stddev, min and max per column.
df_desc.show()

+-------+---------+------+-----------------+-----------------+------------------+
|summary|     name|gender|              age|           height|            weight|
+-------+---------+------+-----------------+-----------------+------------------+
|  count|        5|     5|                5|                5|                 5|
|   mean|     null|  null|             32.8|            169.0|              66.3|
| stddev|     null|  null|9.909591313469996|9.300537618869138|15.137536127124523|
|    min|  Antonio|     F|               23|              155|              50.2|
|    max|Margheria|     M|               48|              179|              88.5|
+-------+---------+------+-----------------+-----------------+------------------+



In [9]:
from pyspark.sql.types import *

# Explicit schema: one StructField(name, type, nullable) per column, listed in
# the same order as the values inside each row tuple of `data`.
data_schema = [StructField('name', StringType(), True),
               StructField('gender', StringType(), True),
               StructField('age', IntegerType(), True),
               StructField('height', IntegerType(), True),
               # The fifth value of every row is the weight, so this field must
               # not reuse the name 'height' (that would create two columns
               # with the same name).
               StructField('weight', FloatType(), True)
              ]

# Wrap the field list into the StructType that createDataFrame accepts.
schema = StructType(fields=data_schema)

In [10]:
# Build a second DataFrame from the same rows, this time passing the explicit
# schema instead of letting Spark infer the column types.
sf = spark.createDataFrame(data, schema=schema)
sf.show()

+---------+------+---+------+------+
|     name|gender|age|height|height|
+---------+------+---+------+------+
| Giuseppe|     M| 23|   174|  70.5|
|  Antonio|     M| 25|   179|  68.0|
|  Lorenzo|     M| 33|   172|  88.5|
|    Luisa|     F| 48|   155|  50.2|
|Margheria|     F| 35|   165|  54.3|
+---------+------+---+------+------+



In [11]:
# NOTE(review): this prints the schema of df (inferred types), not of sf, which
# was just created with the explicit schema; sf.printSchema() may have been
# intended here - confirm.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: long (nullable = true)
 |-- height: long (nullable = true)
 |-- weight: double (nullable = true)



# Righe e colonne

In [13]:
# Return the first 5 rows as a Python list of Row objects.
df.head(5)

[Row(name='Giuseppe', gender='M', age=23, height=174, weight=70.5),
 Row(name='Antonio', gender='M', age=25, height=179, weight=68.0),
 Row(name='Lorenzo', gender='M', age=33, height=172, weight=88.5),
 Row(name='Luisa', gender='F', age=48, height=155, weight=50.2),
 Row(name='Margheria', gender='F', age=35, height=165, weight=54.3)]

In [14]:
# Indexing by position yields a Column object (here the first column, name),
# not the values stored in that column.
df[0]

Column<b'name'>

In [15]:
# Indexing by column name also yields a Column object.
df['name']

Column<b'name'>

In [16]:
# select() returns a new DataFrame holding only the requested column.
dfName = df.select("name")
dfName.show()

+---------+
|     name|
+---------+
| Giuseppe|
|  Antonio|
|  Lorenzo|
|    Luisa|
|Margheria|
+---------+



In [17]:
# Project two columns at once; the names are passed as separate arguments.
df.select("name", "age").show()

+---------+---+
|     name|age|
+---------+---+
| Giuseppe| 23|
|  Antonio| 25|
|  Lorenzo| 33|
|    Luisa| 48|
|Margheria| 35|
+---------+---+



In [19]:
# Overwrite the height column with height/100 (174 becomes 1.74, see output).
# NOTE: df is reassigned from itself, so running this cell a second time
# divides the already-divided values again.
df = df.withColumn("height", df["height"]/100)
df.show()

+---------+------+---+------+------+
|     name|gender|age|height|weight|
+---------+------+---+------+------+
| Giuseppe|     M| 23|  1.74|  70.5|
|  Antonio|     M| 25|  1.79|  68.0|
|  Lorenzo|     M| 33|  1.72|  88.5|
|    Luisa|     F| 48|  1.55|  50.2|
|Margheria|     F| 35|  1.65|  54.3|
+---------+------+---+------+------+



In [20]:
# After the division, height is reported as double instead of long.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: long (nullable = true)
 |-- height: double (nullable = true)
 |-- weight: double (nullable = true)



In [21]:
# Column expression for the body-mass index: weight divided by height squared.
bmi = df["weight"]/(df["height"]**2)
# Attach the expression to df as a new column named "bmi".
df = df.withColumn("bmi", bmi)
df.show()

+---------+------+---+------+------+------------------+
|     name|gender|age|height|weight|               bmi|
+---------+------+---+------+------+------------------+
| Giuseppe|     M| 23|  1.74|  70.5|23.285770907649624|
|  Antonio|     M| 25|  1.79|  68.0| 21.22280827689523|
|  Lorenzo|     M| 33|  1.72|  88.5| 29.91481882098432|
|    Luisa|     F| 48|  1.55|  50.2|20.894901144640997|
|Margheria|     F| 35|  1.65|  54.3| 19.94490358126722|
+---------+------+---+------+------+------------------+



In [25]:
# NOTE: this import rebinds the name `round` in the notebook namespace to
# Spark's column function, hiding Python's built-in round() from here on.
from pyspark.sql.functions import round

# Replace the bmi column with its values rounded to 2 decimal places.
df = df.withColumn("bmi", round(df["bmi"], 2))
df.show()

+---------+------+---+------+------+-----+
|     name|gender|age|height|weight|  bmi|
+---------+------+---+------+------+-----+
| Giuseppe|     M| 23|  1.74|  70.5|23.29|
|  Antonio|     M| 25|  1.79|  68.0|21.22|
|  Lorenzo|     M| 33|  1.72|  88.5|29.91|
|    Luisa|     F| 48|  1.55|  50.2|20.89|
|Margheria|     F| 35|  1.65|  54.3|19.94|
+---------+------+---+------+------+-----+



In [26]:
from pyspark.sql.functions import when

# Add a boolean column: True where bmi is above 25, False in every other case.
df = df.withColumn("is_fat", when(df["bmi"]>25, True).otherwise(False))
df.show()

+---------+------+---+------+------+-----+------+
|     name|gender|age|height|weight|  bmi|is_fat|
+---------+------+---+------+------+-----+------+
| Giuseppe|     M| 23|  1.74|  70.5|23.29| false|
|  Antonio|     M| 25|  1.79|  68.0|21.22| false|
|  Lorenzo|     M| 33|  1.72|  88.5|29.91|  true|
|    Luisa|     F| 48|  1.55|  50.2|20.89| false|
|Margheria|     F| 35|  1.65|  54.3|19.94| false|
+---------+------+---+------+------+-----+------+



In [27]:
# Rename the gender column to sex; the values are left untouched.
df = df.withColumnRenamed("gender", "sex")
df.show()

+---------+---+---+------+------+-----+------+
|     name|sex|age|height|weight|  bmi|is_fat|
+---------+---+---+------+------+-----+------+
| Giuseppe|  M| 23|  1.74|  70.5|23.29| false|
|  Antonio|  M| 25|  1.79|  68.0|21.22| false|
|  Lorenzo|  M| 33|  1.72|  88.5|29.91|  true|
|    Luisa|  F| 48|  1.55|  50.2|20.89| false|
|Margheria|  F| 35|  1.65|  54.3|19.94| false|
+---------+---+---+------+------+-----+------+



# Filtri 

In [29]:
# Keep only the rows whose sex is 'M', with the condition written as a
# SQL-style expression string.
df_male = df.filter("sex=='M'")
df_male.show()

+--------+---+---+------+------+-----+------+
|    name|sex|age|height|weight|  bmi|is_fat|
+--------+---+---+------+------+-----+------+
|Giuseppe|  M| 23|  1.74|  70.5|23.29| false|
| Antonio|  M| 25|  1.79|  68.0|21.22| false|
| Lorenzo|  M| 33|  1.72|  88.5|29.91|  true|
+--------+---+---+------+------+-----+------+



In [30]:
# The same filter written as a Column comparison; df_male is rebound to a
# result with the same rows as the string form.
df_male = df.filter(df["sex"]=='M')
df_male.show()

+--------+---+---+------+------+-----+------+
|    name|sex|age|height|weight|  bmi|is_fat|
+--------+---+---+------+------+-----+------+
|Giuseppe|  M| 23|  1.74|  70.5|23.29| false|
| Antonio|  M| 25|  1.79|  68.0|21.22| false|
| Lorenzo|  M| 33|  1.72|  88.5|29.91|  true|
+--------+---+---+------+------+-----+------+



# Aggregazione

In [34]:
# Group the rows by sex; aggregations are applied to this object afterwards.
df_group = df.groupBy("sex")

In [35]:
# groupBy() does not return a DataFrame but a GroupedData object.
type(df_group)

pyspark.sql.group.GroupedData

In [36]:
# Number of rows in each group.
df_group.count().show()

+---+-----+
|sex|count|
+---+-----+
|  F|    2|
|  M|    3|
+---+-----+



In [37]:
# Per-group average of the numeric columns (age, height, weight, bmi).
df_group.mean().show()

+---+--------+-----------+-----------------+-----------------+
|sex|avg(age)|avg(height)|      avg(weight)|         avg(bmi)|
+---+--------+-----------+-----------------+-----------------+
|  F|    41.5|        1.6|            52.25|           20.415|
|  M|    27.0|       1.75|75.66666666666667|24.80666666666667|
+---+--------+-----------+-----------------+-----------------+



In [39]:
# Per-group sum of the numeric columns.
df_group.sum().show()

+---+--------+-----------+-----------+--------+
|sex|sum(age)|sum(height)|sum(weight)|sum(bmi)|
+---+--------+-----------+-----------+--------+
|  F|      83|        3.2|      104.5|   40.83|
|  M|      81|       5.25|      227.0|   74.42|
+---+--------+-----------+-----------+--------+



In [40]:
# Per-group maximum of the numeric columns.
df_group.max().show()

+---+--------+-----------+-----------+--------+
|sex|max(age)|max(height)|max(weight)|max(bmi)|
+---+--------+-----------+-----------+--------+
|  F|      48|       1.65|       54.3|   20.89|
|  M|      33|       1.79|       88.5|   29.91|
+---+--------+-----------+-----------+--------+



In [41]:
# Per-group minimum of the numeric columns.
df_group.min().show()

+---+--------+-----------+-----------+--------+
|sex|min(age)|min(height)|min(weight)|min(bmi)|
+---+--------+-----------+-----------+--------+
|  F|      35|       1.55|       50.2|   19.94|
|  M|      23|       1.72|       68.0|   21.22|
+---+--------+-----------+-----------+--------+



In [42]:
# Aggregate over the whole DataFrame (no grouping): total of the weight column.
# The dict maps a column name to the name of the aggregate function.
df.agg({'weight':'sum'}).show()

+-----------+
|sum(weight)|
+-----------+
|      331.5|
+-----------+



In [45]:
# Several aggregations per group in one call, given as {column: function}.
# The generated column names ("sum(weight)" etc.) are then renamed to plain
# identifiers before displaying the result.
(
    df_group
    .agg({'weight': 'sum', 'height': 'max', 'sex': 'count'})
    .withColumnRenamed("count(sex)", "count_sex")
    .withColumnRenamed("sum(weight)", "sum_weight")
    .withColumnRenamed("max(height)", "max_height")
    .show()
)

+---+---------+----------+----------+
|sex|count_sex|sum_weight|max_height|
+---+---------+----------+----------+
|  F|        2|     104.5|      1.65|
|  M|        3|     227.0|      1.79|
+---+---------+----------+----------+



In [46]:
# NOTE: this import rebinds `sum` and `max` in the notebook namespace to Spark's
# aggregate functions, hiding the Python built-ins of the same name from here on.
from pyspark.sql.functions import sum, max, count

# Per-group aggregations expressed with column functions instead of a dict.
df_group.agg(sum("weight"), max("height"), count("sex")).show()

+---+-----------+-----------+----------+
|sex|sum(weight)|max(height)|count(sex)|
+---+-----------+-----------+----------+
|  F|      104.5|       1.65|         2|
|  M|      227.0|       1.79|         3|
+---+-----------+-----------+----------+



In [47]:
# Column-function aggregations with alias(), which names each result column
# directly so that no renaming step is needed afterwards.
df_group.agg(
    sum("weight").alias("sum_weight"),
    max("height").alias("max_height"),
    count("sex").alias("count_sex"),
).show()

+---+----------+----------+---------+
|sex|sum_weight|max_height|count_sex|
+---+----------+----------+---------+
|  F|     104.5|      1.65|        2|
|  M|     227.0|      1.79|        3|
+---+----------+----------+---------+



# Ordinamento

In [48]:
# Sort the rows by weight; the output is in ascending order.
df.orderBy("weight").show()

+---------+---+---+------+------+-----+------+
|     name|sex|age|height|weight|  bmi|is_fat|
+---------+---+---+------+------+-----+------+
|    Luisa|  F| 48|  1.55|  50.2|20.89| false|
|Margheria|  F| 35|  1.65|  54.3|19.94| false|
|  Antonio|  M| 25|  1.79|  68.0|21.22| false|
| Giuseppe|  M| 23|  1.74|  70.5|23.29| false|
|  Lorenzo|  M| 33|  1.72|  88.5|29.91|  true|
+---------+---+---+------+------+-----+------+

