## Inizializzazione di Spark

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("basic").getOrCreate()

In [3]:
data = [("Giuseppe", "M", 23, 174, 70.5),
        ("Antonio", "M", 25, 179, 68.),
        ("Lorenzo", "M", 33, 172, 88.5),
        ("Luisa", "F", 48, 155, 50.2),
        ("Margheria", "F", 35, 165, 54.3)]

df = spark.createDataFrame(data, ["name", "gender", "age", "height", "weight"])

In [4]:
df.show()

+---------+------+---+------+------+
|     name|gender|age|height|weight|
+---------+------+---+------+------+
| Giuseppe|     M| 23|   174|  70.5|
|  Antonio|     M| 25|   179|  68.0|
|  Lorenzo|     M| 33|   172|  88.5|
|    Luisa|     F| 48|   155|  50.2|
|Margheria|     F| 35|   165|  54.3|
+---------+------+---+------+------+



In [7]:
df.count()

5

In [8]:
df.columns

['name', 'gender', 'age', 'height', 'weight']

In [9]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: long (nullable = true)
 |-- height: long (nullable = true)
 |-- weight: double (nullable = true)



In [10]:
df_desc = df.describe()
type(df_desc)

pyspark.sql.dataframe.DataFrame

In [12]:
df_desc.show()

+-------+---------+------+-----------------+-----------------+------------------+
|summary|     name|gender|              age|           height|            weight|
+-------+---------+------+-----------------+-----------------+------------------+
|  count|        5|     5|                5|                5|                 5|
|   mean|     null|  null|             32.8|            169.0|              66.3|
| stddev|     null|  null|9.909591313469996|9.300537618869138|15.137536127124523|
|    min|  Antonio|     F|               23|              155|              50.2|
|    max|Margheria|     M|               48|              179|              88.5|
+-------+---------+------+-----------------+-----------------+------------------+



In [15]:
from pyspark.sql.types import *

data_schema = [StructField('name', StringType(), True),
               StructField('gender', StringType(), True),
               StructField('age', IntegerType(), True),
               StructField('height', IntegerType(), True),
               StructField('height', FloatType(), True)
              ]

schema = StructType(fields=data_schema)

In [16]:
sf = spark.createDataFrame(data, schema=schema)
sf.show()

+---------+------+---+------+------+
|     name|gender|age|height|height|
+---------+------+---+------+------+
| Giuseppe|     M| 23|   174|  70.5|
|  Antonio|     M| 25|   179|  68.0|
|  Lorenzo|     M| 33|   172|  88.5|
|    Luisa|     F| 48|   155|  50.2|
|Margheria|     F| 35|   165|  54.3|
+---------+------+---+------+------+



In [17]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: long (nullable = true)
 |-- height: long (nullable = true)
 |-- weight: double (nullable = true)

