# Analisi del prezzo delle azioni di Apple

# Inizializziamo Spark

In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (an existing one is reused,
# otherwise a new one is created) under the application name "AppleStock".
spark = (
    SparkSession.builder
    .appName("AppleStock")
    .getOrCreate()
)

# Importiamo il Dataset all'interno di un Dataframe

In [2]:
# First read of the CSV: the first line is the header and Spark is asked to guess the column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data/AAPL.csv")
)
df.show()

+-------------------+--------+--------+--------+--------+---------+---------+
|               Date|    Open|    High|     Low|   Close|Adj Close|   Volume|
+-------------------+--------+--------+--------+--------+---------+---------+
|1980-12-12 00:00:00|0.513393|0.515625|0.513393|0.513393| 0.410525|117258400|
|1980-12-15 00:00:00|0.488839|0.488839|0.486607|0.486607| 0.389106| 43971200|
|1980-12-16 00:00:00|0.453125|0.453125|0.450893|0.450893| 0.360548| 26432000|
|1980-12-17 00:00:00|0.462054|0.464286|0.462054|0.462054| 0.369472| 21610400|
|1980-12-18 00:00:00|0.475446|0.477679|0.475446|0.475446| 0.380182| 18362400|
|1980-12-19 00:00:00|0.504464|0.506696|0.504464|0.504464| 0.403385| 12157600|
|1980-12-22 00:00:00|0.529018|0.531250|0.529018|0.529018| 0.423019|  9340800|
|1980-12-23 00:00:00|0.551339|0.553571|0.551339|0.551339| 0.440868| 11737600|
|1980-12-24 00:00:00|0.580357|0.582589|0.580357|0.580357| 0.464072| 12000800|
|1980-12-26 00:00:00|0.633929|0.636161|0.633929|0.633929| 0.5069

In [3]:
# Print the column names and the types Spark inferred while reading the CSV.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Adj Close: string (nullable = true)
 |-- Volume: string (nullable = true)



# Correggiamo lo schema

In [4]:
# Import only the types that are actually used instead of a wildcard import,
# so it stays clear where every name comes from.
from pyspark.sql.types import (
    FloatType,
    IntegerType,
    StructField,
    StructType,
    TimestampType,
)

# Explicit schema: schema inference returned every numeric column as string
# (see the printSchema output above), so the column types are declared by hand.
data_schema = [
    StructField('Date', TimestampType(), True),
    StructField('Open', FloatType(), True),
    StructField('High', FloatType(), True),
    StructField('Low', FloatType(), True),
    StructField('Close', FloatType(), True),
    StructField('Adj Close', FloatType(), True),
    StructField('Volume', IntegerType(), True),
]

schema = StructType(fields=data_schema)

# Re-read the file with the declared schema. When a schema is supplied no
# inference takes place, so only the header option is needed.
df = spark.read.csv("data/AAPL.csv", header=True, schema=schema)

df.show()

+-------------------+--------+--------+--------+--------+---------+---------+
|               Date|    Open|    High|     Low|   Close|Adj Close|   Volume|
+-------------------+--------+--------+--------+--------+---------+---------+
|1980-12-12 00:00:00|0.513393|0.515625|0.513393|0.513393| 0.410525|117258400|
|1980-12-15 00:00:00|0.488839|0.488839|0.486607|0.486607| 0.389106| 43971200|
|1980-12-16 00:00:00|0.453125|0.453125|0.450893|0.450893| 0.360548| 26432000|
|1980-12-17 00:00:00|0.462054|0.464286|0.462054|0.462054| 0.369472| 21610400|
|1980-12-18 00:00:00|0.475446|0.477679|0.475446|0.475446| 0.380182| 18362400|
|1980-12-19 00:00:00|0.504464|0.506696|0.504464|0.504464| 0.403385| 12157600|
|1980-12-22 00:00:00|0.529018| 0.53125|0.529018|0.529018| 0.423019|  9340800|
|1980-12-23 00:00:00|0.551339|0.553571|0.551339|0.551339| 0.440868| 11737600|
|1980-12-24 00:00:00|0.580357|0.582589|0.580357|0.580357| 0.464072| 12000800|
|1980-12-26 00:00:00|0.633929|0.636161|0.633929|0.633929| 0.5069

In [5]:
# Print the schema again to check the column types after re-reading with the explicit schema.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- Close: float (nullable = true)
 |-- Adj Close: float (nullable = true)
 |-- Volume: integer (nullable = true)



# Convertiamo il timestamp in una data

In [6]:
from pyspark.sql.functions import to_date

# Overwrite the timestamp column "Date" with a date-only column of the same name.
df = df.withColumn(
    colName='Date',
    col=to_date(df.Date, "yyyy-MM-dd"),
)
df.show()

+----------+--------+--------+--------+--------+---------+---------+
|      Date|    Open|    High|     Low|   Close|Adj Close|   Volume|
+----------+--------+--------+--------+--------+---------+---------+
|1980-12-12|0.513393|0.515625|0.513393|0.513393| 0.410525|117258400|
|1980-12-15|0.488839|0.488839|0.486607|0.486607| 0.389106| 43971200|
|1980-12-16|0.453125|0.453125|0.450893|0.450893| 0.360548| 26432000|
|1980-12-17|0.462054|0.464286|0.462054|0.462054| 0.369472| 21610400|
|1980-12-18|0.475446|0.477679|0.475446|0.475446| 0.380182| 18362400|
|1980-12-19|0.504464|0.506696|0.504464|0.504464| 0.403385| 12157600|
|1980-12-22|0.529018| 0.53125|0.529018|0.529018| 0.423019|  9340800|
|1980-12-23|0.551339|0.553571|0.551339|0.551339| 0.440868| 11737600|
|1980-12-24|0.580357|0.582589|0.580357|0.580357| 0.464072| 12000800|
|1980-12-26|0.633929|0.636161|0.633929|0.633929| 0.506909| 13893600|
|1980-12-29|0.642857|0.645089|0.642857|0.642857| 0.514049| 23290400|
|1980-12-30|0.629464|0.629464|0.62

# Qual è stato il valore massimo raggiunto da AAPL? In che data lo ha raggiunto?

In [7]:
# Ten rows with the highest "High" price, largest first.
df.sort(df.High.desc()).show(10)

+----------+------+------+------+------+---------+--------+
|      Date|  Open|  High|   Low| Close|Adj Close|  Volume|
+----------+------+------+------+------+---------+--------+
|2018-10-03|230.05|233.47|229.78|232.07|229.39209|28654800|
|2018-10-04|230.78|232.35|226.73|227.99|225.35918|32042000|
|2018-10-02|227.25| 230.0|226.63|229.28| 226.6343|24788200|
|2018-09-05|228.99|229.67| 225.1|226.87|224.25209|33333000|
|2018-10-01|227.95|229.42|226.35|227.26| 224.6376|23600800|
|2018-09-04|228.41|229.18|226.63|228.36| 225.7249|27390100|
|2018-08-31|226.51|228.87| 226.0|227.63|225.00334|43340100|
|2018-10-05|227.96|228.41|220.58|224.29|221.70186|33580500|
|2018-09-13|223.52|228.35|222.57|226.41|223.79741|41706400|
|2018-08-30|223.25|228.26| 222.4|225.03|222.43332|48793800|
+----------+------+------+------+------+---------+--------+
only showing top 10 rows



# Qual è stato il valore minimo raggiunto da AAPL dopo il 2000? In che data lo ha raggiunto?

In [8]:
# Rows dated 2000-01-01 or later, sorted by ascending "Low": the ten lowest lows of the period.
df.where("Date >= '2000-01-01'").sort("Low").show(10)

+----------+--------+--------+--------+--------+---------+---------+
|      Date|    Open|    High|     Low|   Close|Adj Close|   Volume|
+----------+--------+--------+--------+--------+---------+---------+
|2003-04-17|0.942857|0.946429|0.908571|0.937143| 0.820964|154064400|
|2003-04-16|0.927857|0.976429|0.922857|0.945714| 0.828473|254044000|
|2003-04-11|1.003571|1.031429|0.923571|0.942857|  0.82597|348177200|
|2003-04-21|0.937857|0.942143|0.927143|0.938571| 0.822216| 38080000|
|2003-04-24|0.965714|0.972143|0.928571|    0.96| 0.840988| 81277000|
|2003-04-22|0.941429|0.972857|   0.935|   0.965| 0.845368| 75142200|
|2003-04-25|0.961429|    0.97|   0.945|0.953571| 0.835356| 51329600|
|2003-04-15|0.970714|0.971429|    0.95|0.956429| 0.837859| 75992000|
|2002-10-08|0.992857|0.997143|0.954286|0.977143| 0.856006|113411200|
|2003-04-23|0.966429|0.973571|0.954286|    0.97| 0.849748| 52420200|
+----------+--------+--------+--------+--------+---------+---------+
only showing top 10 rows



# Qual è la percentuale di giorni in cui il prezzo di chiusura è stato inferiore ai 100 USD?

In [9]:
# Percentage (two decimals) of rows whose closing price is below 100 USD.
days_below_100 = df.where("Close < 100").count()
round(days_below_100 / df.count() * 100, 2)

89.67

# Qual è la percentuale di giorni in cui il prezzo di chiusura è stato inferiore ai 100 USD dal 2014 in poi?

In [10]:
# Keep only the rows dated 2014-01-01 or later.
df2014 = df.where("Date >= '2014-01-01'")

In [11]:
# Same percentage as before, computed on the 2014-onwards subset only.
days_below_100_2014 = df2014.where("Close < 100").count()
round(days_below_100_2014 / df2014.count() * 100, 2)

21.54

# Visualizza il valore massimo per anno

In [15]:
from pyspark.sql.functions import year

# Group the rows by the calendar year extracted from "Date"; the grouping key is named "year".
year_key = year(df.Date).alias("year")
dfGroupYear = df.groupby(year_key)

In [16]:
# Import the functions module under an alias instead of `from ... import max`,
# which would replace Python's built-in max() for the rest of the notebook.
from pyspark.sql import functions as F

# Highest "High" price per year, most recent year first.
# 39 rows: one per year from 1980 to 2018.
dfHigh = dfGroupYear.agg(F.max("High"))
dfHigh.orderBy("year", ascending=False).show(39)

+----+---------+
|year|max(High)|
+----+---------+
|2018|   233.47|
|2017|    177.2|
|2016|   118.69|
|2015|   134.54|
|2014|   119.75|
|2013| 82.16286|
|2012|100.72429|
|2011| 60.95714|
|2010|46.665714|
|2009|30.564285|
|2008|28.608572|
|2007|28.994286|
|2006|13.308572|
|2005|    10.78|
|2004| 4.969285|
|2003| 1.786429|
|2002| 1.869286|
|2001| 1.937143|
|2000| 5.370536|
|1999| 4.214286|
|1998|   1.5625|
|1997| 1.055804|
|1996| 1.267857|
|1995| 1.790179|
|1994|   1.5625|
|1993| 2.330357|
|1992|      2.5|
|1991| 2.616071|
|1990| 1.705357|
|1989| 1.799107|
|1988| 1.705357|
|1987| 2.133929|
|1986| 0.783482|
|1985| 0.555804|
|1984| 0.613839|
|1983| 1.129464|
|1982| 0.622768|
|1981| 0.620536|
|1980| 0.645089|
+----+---------+
only showing top 39 rows



# In quale anno sono state scambiate più azioni di Apple?

In [17]:
# Import the functions module under an alias instead of `from ... import sum`,
# which would replace Python's built-in sum() for the rest of the notebook.
from pyspark.sql import functions as F

# Total traded volume per year, largest total first.
# 39 rows: one per year from 1980 to 2018.
dfVolume = dfGroupYear.agg(F.sum("Volume").alias("Volume"))
dfVolume.orderBy("Volume", ascending=False).show(39)

+----+-----------+
|year|     Volume|
+----+-----------+
|2008|71495301500|
|2007|61748996400|
|2006|53924741500|
|2005|45600245600|
|2010|37756231800|
|2009|35813421700|
|1999|34275676400|
|2012|32991051100|
|2011|31014834900|
|2004|30450417200|
|2000|30075399200|
|1998|28798548800|
|2013|25605392400|
|2001|23664449200|
|2002|19253481800|
|1995|18566634800|
|1997|17990840000|
|2003|17807563200|
|2014|15914488100|
|1987|14942827200|
|1991|14336912800|
|1994|14288974000|
|1993|14113232000|
|1986|13330805600|
|1996|13298555200|
|2015|13066049900|
|1989|12726456400|
|1985|11373068000|
|1983|11128252800|
|1990|11100485200|
|1984|10494758400|
|1988|10323244400|
|1992|10284478400|
|2016| 9682477800|
|2018| 8539036200|
|2017| 6810776500|
|1982| 5341252000|
|1981| 2049236000|
|1980|  336212800|
+----+-----------+
only showing top 39 rows



# Il 29 giugno 2007 è stato rilasciato al pubblico il primo iPhone: come è variato il prezzo delle azioni di Apple nei 180 giorni successivi?

In [19]:
from datetime import datetime, timedelta

# Start of the observation window, as an ISO date string.
start_date = '2007-06-29'
# End of the window: 180 calendar days after start_date, formatted back to an ISO date string.
end_date = (datetime.strptime(start_date, '%Y-%m-%d') + timedelta(days=180)).strftime('%Y-%m-%d')

print(end_date)

2007-12-26


In [23]:
# Fetch the single row whose Date equals start_date (None if there is no such row).
first_day_filter = "Date == '{}'".format(start_date)
rowFirst = df.where(first_day_filter).first()
rowFirst

Row(Date=datetime.date(2007, 6, 29), Open=17.424285888671875, High=17.714284896850586, Low=17.29857063293457, Close=17.43428611755371, Adj Close=15.272941589355469, Volume=284460400)

In [24]:
# Turn the Row into a plain dict keyed by column name.
dictFirst = rowFirst.asDict()
dictFirst

{'Date': datetime.date(2007, 6, 29),
 'Open': 17.424285888671875,
 'High': 17.714284896850586,
 'Low': 17.29857063293457,
 'Close': 17.43428611755371,
 'Adj Close': 15.272941589355469,
 'Volume': 284460400}

In [26]:
# Take the most recent row dated on or before end_date instead of requiring an
# exact match. The dataset has no row for some calendar dates (e.g. 1980-12-13
# and 1980-12-14 are absent), and an exact match on such a date returns None,
# which makes .asDict() fail.
rowLast = df.filter("Date <= '"+end_date+"'").orderBy("Date", ascending=False).head()
if rowLast is None:
    raise ValueError("No rows dated on or before " + end_date)
dictLast = rowLast.asDict()
dictLast

{'Date': datetime.date(2007, 12, 26),
 'Open': 28.43000030517578,
 'High': 28.70857048034668,
 'Low': 28.117143630981445,
 'Close': 28.421428680419922,
 'Adj Close': 24.89799690246582,
 'Volume': 175933100}