# Analisi del prezzo delle azioni di Apple

# Inizializziamo Spark

In [5]:
from pyspark.sql import SparkSession

# Configure a builder with this notebook's application name, then obtain the
# session from it (an already-running session is reused instead of creating a new one).
session_builder = SparkSession.builder.appName("AppleStock")
spark = session_builder.getOrCreate()

# Importiamo il Dataset all'interno di un Dataframe

In [7]:
# First pass over the CSV: treat the first line as column names and let Spark
# guess the column types, so we can inspect what it infers.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("data/AAPL.csv")
df.show()

+-------------------+--------+--------+--------+--------+---------+---------+
|               Date|    Open|    High|     Low|   Close|Adj Close|   Volume|
+-------------------+--------+--------+--------+--------+---------+---------+
|1980-12-12 00:00:00|0.513393|0.515625|0.513393|0.513393| 0.410525|117258400|
|1980-12-15 00:00:00|0.488839|0.488839|0.486607|0.486607| 0.389106| 43971200|
|1980-12-16 00:00:00|0.453125|0.453125|0.450893|0.450893| 0.360548| 26432000|
|1980-12-17 00:00:00|0.462054|0.464286|0.462054|0.462054| 0.369472| 21610400|
|1980-12-18 00:00:00|0.475446|0.477679|0.475446|0.475446| 0.380182| 18362400|
|1980-12-19 00:00:00|0.504464|0.506696|0.504464|0.504464| 0.403385| 12157600|
|1980-12-22 00:00:00|0.529018|0.531250|0.529018|0.529018| 0.423019|  9340800|
|1980-12-23 00:00:00|0.551339|0.553571|0.551339|0.551339| 0.440868| 11737600|
|1980-12-24 00:00:00|0.580357|0.582589|0.580357|0.580357| 0.464072| 12000800|
|1980-12-26 00:00:00|0.633929|0.636161|0.633929|0.633929| 0.5069

In [8]:
# Print the column names and the types Spark inferred for them.
# In the recorded run every column except Date came back as string.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Adj Close: string (nullable = true)
 |-- Volume: string (nullable = true)



# Correggiamo lo schema

In [10]:
# Import only the type classes this cell uses instead of a wildcard import,
# so it is clear where each name comes from and the namespace stays clean.
from pyspark.sql.types import (
    FloatType,
    IntegerType,
    StructField,
    StructType,
    TimestampType,
)

# Explicit schema for the CSV: one nullable field per column, in file order.
# NOTE(review): FloatType is 32-bit; consider DoubleType if full price precision matters.
# NOTE(review): IntegerType is 32-bit signed; confirm no Volume value exceeds that range.
data_schema = [ StructField('Date', TimestampType(), True),
                StructField('Open', FloatType(), True),
                StructField('High', FloatType(), True),
                StructField('Low', FloatType(), True),
                StructField('Close', FloatType(), True),
                StructField('Adj Close', FloatType(), True),
                StructField('Volume', IntegerType(), True),]

schema = StructType(fields=data_schema)

# Reload the same file with the explicit schema; the header line is still
# skipped as column names and type inference is explicitly turned off.
df = spark.read.schema(schema).option("header","true").option("inferSchema","false") \
                    .csv("data/AAPL.csv")

df.show()

+-------------------+--------+--------+--------+--------+---------+---------+
|               Date|    Open|    High|     Low|   Close|Adj Close|   Volume|
+-------------------+--------+--------+--------+--------+---------+---------+
|1980-12-12 00:00:00|0.513393|0.515625|0.513393|0.513393| 0.410525|117258400|
|1980-12-15 00:00:00|0.488839|0.488839|0.486607|0.486607| 0.389106| 43971200|
|1980-12-16 00:00:00|0.453125|0.453125|0.450893|0.450893| 0.360548| 26432000|
|1980-12-17 00:00:00|0.462054|0.464286|0.462054|0.462054| 0.369472| 21610400|
|1980-12-18 00:00:00|0.475446|0.477679|0.475446|0.475446| 0.380182| 18362400|
|1980-12-19 00:00:00|0.504464|0.506696|0.504464|0.504464| 0.403385| 12157600|
|1980-12-22 00:00:00|0.529018| 0.53125|0.529018|0.529018| 0.423019|  9340800|
|1980-12-23 00:00:00|0.551339|0.553571|0.551339|0.551339| 0.440868| 11737600|
|1980-12-24 00:00:00|0.580357|0.582589|0.580357|0.580357| 0.464072| 12000800|
|1980-12-26 00:00:00|0.633929|0.636161|0.633929|0.633929| 0.5069

In [11]:
# Print the schema again to check that the explicit column types were applied.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- Close: float (nullable = true)
 |-- Adj Close: float (nullable = true)
 |-- Volume: integer (nullable = true)



# Convertiamo il timestamp in una data

In [14]:
from pyspark.sql.functions import to_date

# Build the date expression from the existing timestamp column first, then
# overwrite the column under the same name and display the result.
date_only = to_date(df["Date"], "yyyy-MM-dd")
df = df.withColumn("Date", date_only)
df.show()

+----------+--------+--------+--------+--------+---------+---------+
|      Date|    Open|    High|     Low|   Close|Adj Close|   Volume|
+----------+--------+--------+--------+--------+---------+---------+
|1980-12-12|0.513393|0.515625|0.513393|0.513393| 0.410525|117258400|
|1980-12-15|0.488839|0.488839|0.486607|0.486607| 0.389106| 43971200|
|1980-12-16|0.453125|0.453125|0.450893|0.450893| 0.360548| 26432000|
|1980-12-17|0.462054|0.464286|0.462054|0.462054| 0.369472| 21610400|
|1980-12-18|0.475446|0.477679|0.475446|0.475446| 0.380182| 18362400|
|1980-12-19|0.504464|0.506696|0.504464|0.504464| 0.403385| 12157600|
|1980-12-22|0.529018| 0.53125|0.529018|0.529018| 0.423019|  9340800|
|1980-12-23|0.551339|0.553571|0.551339|0.551339| 0.440868| 11737600|
|1980-12-24|0.580357|0.582589|0.580357|0.580357| 0.464072| 12000800|
|1980-12-26|0.633929|0.636161|0.633929|0.633929| 0.506909| 13893600|
|1980-12-29|0.642857|0.645089|0.642857|0.642857| 0.514049| 23290400|
|1980-12-30|0.629464|0.629464|0.62

# Qual è stato il valore massimo raggiunto da AAPL? In che data lo ha raggiunto?

In [15]:
# Sort the rows by the High column, largest value first, and display the first ten.
by_high_desc = df.orderBy(df["High"].desc())
by_high_desc.show(10)

+----------+------+------+------+------+---------+--------+
|      Date|  Open|  High|   Low| Close|Adj Close|  Volume|
+----------+------+------+------+------+---------+--------+
|2018-10-03|230.05|233.47|229.78|232.07|229.39209|28654800|
|2018-10-04|230.78|232.35|226.73|227.99|225.35918|32042000|
|2018-10-02|227.25| 230.0|226.63|229.28| 226.6343|24788200|
|2018-09-05|228.99|229.67| 225.1|226.87|224.25209|33333000|
|2018-10-01|227.95|229.42|226.35|227.26| 224.6376|23600800|
|2018-09-04|228.41|229.18|226.63|228.36| 225.7249|27390100|
|2018-08-31|226.51|228.87| 226.0|227.63|225.00334|43340100|
|2018-10-05|227.96|228.41|220.58|224.29|221.70186|33580500|
|2018-09-13|223.52|228.35|222.57|226.41|223.79741|41706400|
|2018-08-30|223.25|228.26| 222.4|225.03|222.43332|48793800|
+----------+------+------+------+------+---------+--------+
only showing top 10 rows



# Qual è stato il valore minimo raggiunto da AAPL dal 2000 in poi? In che data lo ha raggiunto?

In [17]:
# Keep only rows dated 2000-01-01 or later, then sort them by the Low column
# in ascending order (smallest first) and display the first ten.
since_2000 = df.where("Date >= '2000-01-01'")
since_2000.orderBy("Low", ascending=True).show(10)

+----------+--------+--------+--------+--------+---------+---------+
|      Date|    Open|    High|     Low|   Close|Adj Close|   Volume|
+----------+--------+--------+--------+--------+---------+---------+
|2003-04-17|0.942857|0.946429|0.908571|0.937143| 0.820964|154064400|
|2003-04-16|0.927857|0.976429|0.922857|0.945714| 0.828473|254044000|
|2003-04-11|1.003571|1.031429|0.923571|0.942857|  0.82597|348177200|
|2003-04-21|0.937857|0.942143|0.927143|0.938571| 0.822216| 38080000|
|2003-04-24|0.965714|0.972143|0.928571|    0.96| 0.840988| 81277000|
|2003-04-22|0.941429|0.972857|   0.935|   0.965| 0.845368| 75142200|
|2003-04-25|0.961429|    0.97|   0.945|0.953571| 0.835356| 51329600|
|2003-04-15|0.970714|0.971429|    0.95|0.956429| 0.837859| 75992000|
|2002-10-08|0.992857|0.997143|0.954286|0.977143| 0.856006|113411200|
|2003-04-23|0.966429|0.973571|0.954286|    0.97| 0.849748| 52420200|
+----------+--------+--------+--------+--------+---------+---------+
only showing top 10 rows

