# Analisi del prezzo delle azioni di Apple

# Inizializziamo Spark

In [5]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("AppleStock").getOrCreate()

# Importiamo il Dataset all'interno di un Dataframe

In [7]:
df = spark.read.csv("data/AAPL.csv", header=True, inferSchema=True)
df.show()

+-------------------+--------+--------+--------+--------+---------+---------+
|               Date|    Open|    High|     Low|   Close|Adj Close|   Volume|
+-------------------+--------+--------+--------+--------+---------+---------+
|1980-12-12 00:00:00|0.513393|0.515625|0.513393|0.513393| 0.410525|117258400|
|1980-12-15 00:00:00|0.488839|0.488839|0.486607|0.486607| 0.389106| 43971200|
|1980-12-16 00:00:00|0.453125|0.453125|0.450893|0.450893| 0.360548| 26432000|
|1980-12-17 00:00:00|0.462054|0.464286|0.462054|0.462054| 0.369472| 21610400|
|1980-12-18 00:00:00|0.475446|0.477679|0.475446|0.475446| 0.380182| 18362400|
|1980-12-19 00:00:00|0.504464|0.506696|0.504464|0.504464| 0.403385| 12157600|
|1980-12-22 00:00:00|0.529018|0.531250|0.529018|0.529018| 0.423019|  9340800|
|1980-12-23 00:00:00|0.551339|0.553571|0.551339|0.551339| 0.440868| 11737600|
|1980-12-24 00:00:00|0.580357|0.582589|0.580357|0.580357| 0.464072| 12000800|
|1980-12-26 00:00:00|0.633929|0.636161|0.633929|0.633929| 0.5069

In [8]:
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Adj Close: string (nullable = true)
 |-- Volume: string (nullable = true)



# Correggiamo lo schema

In [10]:
from pyspark.sql.types import *

data_schema = [ StructField('Date', TimestampType(), True),
                StructField('Open', FloatType(), True),
                StructField('High', FloatType(), True),
                StructField('Low', FloatType(), True),
                StructField('Close', FloatType(), True),
                StructField('Adj Close', FloatType(), True),
                StructField('Volume', IntegerType(), True),]
            
schema = StructType(fields=data_schema)

df = spark.read.schema(schema).option("header","true").option("inferSchema","false") \
                    .csv("data/AAPL.csv")

df.show() 

+-------------------+--------+--------+--------+--------+---------+---------+
|               Date|    Open|    High|     Low|   Close|Adj Close|   Volume|
+-------------------+--------+--------+--------+--------+---------+---------+
|1980-12-12 00:00:00|0.513393|0.515625|0.513393|0.513393| 0.410525|117258400|
|1980-12-15 00:00:00|0.488839|0.488839|0.486607|0.486607| 0.389106| 43971200|
|1980-12-16 00:00:00|0.453125|0.453125|0.450893|0.450893| 0.360548| 26432000|
|1980-12-17 00:00:00|0.462054|0.464286|0.462054|0.462054| 0.369472| 21610400|
|1980-12-18 00:00:00|0.475446|0.477679|0.475446|0.475446| 0.380182| 18362400|
|1980-12-19 00:00:00|0.504464|0.506696|0.504464|0.504464| 0.403385| 12157600|
|1980-12-22 00:00:00|0.529018| 0.53125|0.529018|0.529018| 0.423019|  9340800|
|1980-12-23 00:00:00|0.551339|0.553571|0.551339|0.551339| 0.440868| 11737600|
|1980-12-24 00:00:00|0.580357|0.582589|0.580357|0.580357| 0.464072| 12000800|
|1980-12-26 00:00:00|0.633929|0.636161|0.633929|0.633929| 0.5069

In [11]:
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- Close: float (nullable = true)
 |-- Adj Close: float (nullable = true)
 |-- Volume: integer (nullable = true)

