In [None]:
import datetime
# Record the wall-clock time at which this cell runs.
# `datetime` is bound to the *module* by `import datetime`, so the class has
# to be reached as `datetime.datetime`; `datetime.now()` on the module raises
# AttributeError.
startTime = datetime.datetime.now()

In [44]:
# Both CSV files are read with the same reader settings: the first row is
# treated as a header, column types are inferred, and the parse mode is
# DROPMALFORMED.
csv_options = {'header': 'true', 'inferSchema': 'true', 'mode': 'DROPMALFORMED'}

# Price rows (the preview below shows ticker, open, close, ..., date).
stock_prices = (
    spark.read.format('csv')
    .options(**csv_options)
    .load("file:///media/alessandro/storage/big_data-primoProgetto/dataset/historical_stock_prices.csv")
)

# Ticker metadata table, read the same way.
stocks = (
    spark.read.format('csv')
    .options(**csv_options)
    .load("file:///media/alessandro/storage/big_data-primoProgetto/dataset/historical_stocks.csv")
)

In [45]:
# Preview the first two rows of the freshly loaded price table.
stock_prices.show(n=2, truncate=True)

+------+----------------+----------------+----------------+-----+----------------+-------+-------------------+
|ticker|            open|           close|       adj_close|  low|            high| volume|               date|
+------+----------------+----------------+----------------+-----+----------------+-------+-------------------+
|   AHH|            11.5|11.5799999237061|8.49315452575684|11.25|11.6800003051758|4633900|2013-05-08 00:00:00|
|   AHH|11.6599998474121|11.5500001907349|8.47115135192871| 11.5|11.6599998474121| 275800|2013-05-09 00:00:00|
+------+----------------+----------------+----------------+-----+----------------+-------+-------------------+
only showing top 2 rows



In [46]:
from pyspark.sql.functions import year

# Keep only ticker, close and volume, and reduce `date` to its calendar year.
# NOTE: `stock_prices` is rebound to the projected frame, which no longer has
# a `date` column, so this cell cannot be re-run without reloading the data.
price_columns = ['ticker', 'close', 'volume', year('date').alias('year')]
stock_prices = stock_prices.select(*price_columns)

In [47]:
# Preview the projected price table (ticker, close, volume, year).
stock_prices.show(n=2, truncate=True)

+------+----------------+-------+----+
|ticker|           close| volume|year|
+------+----------------+-------+----+
|   AHH|11.5799999237061|4633900|2013|
|   AHH|11.5500001907349| 275800|2013|
+------+----------------+-------+----+
only showing top 2 rows



In [48]:
# Preview the first two rows of the ticker metadata table.
stocks.show(n=2, truncate=True)

+------+--------+--------------------+-------+--------------------+
|ticker|exchange|                name| sector|            industry|
+------+--------+--------------------+-------+--------------------+
|   PIH|  NASDAQ|1347 PROPERTY INS...|FINANCE|PROPERTY-CASUALTY...|
| PIHPP|  NASDAQ|1347 PROPERTY INS...|FINANCE|PROPERTY-CASUALTY...|
+------+--------+--------------------+-------+--------------------+
only showing top 2 rows



In [49]:
# Only the join key and the sector label are used further down.
stock_columns = ['ticker', 'sector']
stocks = stocks.select(*stock_columns)

In [50]:
# Preview the reduced metadata table (ticker, sector).
stocks.show(n=2, truncate=True)

+------+-------+
|ticker| sector|
+------+-------+
|   PIH|FINANCE|
| PIHPP|FINANCE|
+------+-------+
only showing top 2 rows



In [51]:
# Attach each price row's sector by matching on ticker (inner join, which is
# the default join type when `how` is omitted).
joined = stock_prices.join(stocks, on='ticker', how='inner')

In [52]:
# Preview the joined table (price columns plus sector).
joined.show(n=2, truncate=True)

+------+----------------+-------+----+-------+
|ticker|           close| volume|year| sector|
+------+----------------+-------+----+-------+
|   AHH|11.5799999237061|4633900|2013|FINANCE|
|   AHH|11.5500001907349| 275800|2013|FINANCE|
+------+----------------+-------+----+-------+
only showing top 2 rows



In [53]:
# Keep rows whose year lies in 2004..2018 (both ends inclusive) and whose
# sector is not the literal 'N/A'.
# `year` is an integer column (it comes from year(...) on the date), so it is
# compared against integer bounds; the previous string bounds ('2004', '2018')
# only worked through an implicit string-to-integer cast.
filtered = joined.filter(
    joined.year.between(2004, 2018) & (joined.sector != 'N/A')
)

In [54]:
# Preview the filtered table.
filtered.show(n=2, truncate=True)

+------+----------------+-------+----+-------+
|ticker|           close| volume|year| sector|
+------+----------------+-------+----+-------+
|   AHH|11.5799999237061|4633900|2013|FINANCE|
|   AHH|11.5500001907349| 275800|2013|FINANCE|
+------+----------------+-------+----+-------+
only showing top 2 rows



In [55]:
from pyspark.sql import functions as F

In [56]:
# Per (sector, year): total traded volume and mean closing price.
# NOTE(review): the mean is computed over `close` but published under the
# name 'avg_volume'. The name is kept because a later cell selects it --
# confirm whether the alias or the aggregated column is the intended one.
intermediate1 = (
    filtered
    .groupBy('sector', 'year')
    .agg(
        F.sum('volume').alias('volCompl'),
        F.mean('close').alias('avg_volume'),
    )
)

In [57]:
# Order by sector, then year, both descending, and preview the first 15 rows.
intermediate1 = intermediate1.orderBy(F.col('sector').desc(), F.col('year').desc())

intermediate1.show(n=15, truncate=True)

+--------------+----+-----------+------------------+
|        sector|year|   volCompl|        avg_volume|
+--------------+----+-----------+------------------+
|TRANSPORTATION|2018|15782363800|38.687121291895146|
|TRANSPORTATION|2017|24750549271| 81.58107063923603|
|TRANSPORTATION|2016|26396056641|182.52985818188077|
|TRANSPORTATION|2015|26978967802| 38.55247294917033|
|TRANSPORTATION|2014|24014045100|51.800322936979065|
|TRANSPORTATION|2013|20484526900| 88.79075188995279|
|TRANSPORTATION|2012|20500094900| 76.24410309342095|
|TRANSPORTATION|2011|24018175100| 91.58842951034882|
|TRANSPORTATION|2010|25103251800|164.75943292272206|
|TRANSPORTATION|2009|32166875600| 419.3358372236049|
|TRANSPORTATION|2008|34045323700|1876.2985562946167|
|TRANSPORTATION|2007|22648103900|4028.5854687609935|
|TRANSPORTATION|2006|15731458900|  4601.59876628527|
|TRANSPORTATION|2005|11779550500| 6546.023289996812|
|TRANSPORTATION|2004|10146574000|  3875.09863624273|
+--------------+----+-----------+-------------

In [58]:
# Per (sector, year): sum of all closing prices, exposed as 'actualQuote'.
intermediate2 = (
    filtered
    .groupBy('sector', 'year')
    .agg(F.sum('close').alias('actualQuote'))
)

In [59]:
# Order by sector, then year, both descending, and preview the first 15 rows.
intermediate2 = intermediate2.orderBy(F.col('sector').desc(), F.col('year').desc())

intermediate2.show(n=15, truncate=True)

+--------------+----+-------------------+
|        sector|year|        actualQuote|
+--------------+----+-------------------+
|TRANSPORTATION|2018|   696213.434768945|
|TRANSPORTATION|2017| 2093370.2726027966|
|TRANSPORTATION|2016|  4419230.396441516|
|TRANSPORTATION|2015|  908257.7102095038|
|TRANSPORTATION|2014|  1148205.958221078|
|TRANSPORTATION|2013|  1809289.151261568|
|TRANSPORTATION|2012|  1497205.452445507|
|TRANSPORTATION|2011| 1807864.0101047754|
|TRANSPORTATION|2010| 3128287.3529037237|
|TRANSPORTATION|2009|  7604655.408050075|
|TRANSPORTATION|2008|3.358762045622993E7|
|TRANSPORTATION|2007|6.668920384986949E7|
|TRANSPORTATION|2006|6.898716870414877E7|
|TRANSPORTATION|2005|8.569399088934827E7|
|TRANSPORTATION|2004|4.493951888450694E7|
+--------------+----+-------------------+
only showing top 15 rows



In [62]:
from pyspark.sql.window import Window

# Within each sector the rows are ordered newest year first, so the row that
# follows the current one (lead) carries the quote of the preceding year
# present in the data. The oldest year of a sector has no following row and
# gets a null 'previousQuote'.
# The sector key in the ordering is constant inside a sector partition.
sector_newest_first = Window.partitionBy('sector').orderBy(F.desc('sector'), F.desc('year'))

intermediate3 = intermediate2.withColumn(
    'previousQuote',
    F.lead('actualQuote').over(sector_newest_first),
)

In [72]:
# Order by sector, then year, both descending.
intermediate3 = intermediate3.orderBy(F.col('sector').desc(), F.col('year').desc())

In [73]:
# Preview actual vs. previous-year quote for the first 15 rows.
intermediate3.show(n=15, truncate=True)

+--------------+----+-------------------+-------------------+
|        sector|year|        actualQuote|      previousQuote|
+--------------+----+-------------------+-------------------+
|TRANSPORTATION|2018|   696213.434768945| 2093370.2726027966|
|TRANSPORTATION|2017| 2093370.2726027966|  4419230.396441516|
|TRANSPORTATION|2016|  4419230.396441516|  908257.7102095038|
|TRANSPORTATION|2015|  908257.7102095038|  1148205.958221078|
|TRANSPORTATION|2014|  1148205.958221078|  1809289.151261568|
|TRANSPORTATION|2013|  1809289.151261568|  1497205.452445507|
|TRANSPORTATION|2012|  1497205.452445507| 1807864.0101047754|
|TRANSPORTATION|2011| 1807864.0101047754| 3128287.3529037237|
|TRANSPORTATION|2010| 3128287.3529037237|  7604655.408050075|
|TRANSPORTATION|2009|  7604655.408050075|3.358762045622993E7|
|TRANSPORTATION|2008|3.358762045622993E7|6.668920384986949E7|
|TRANSPORTATION|2007|6.668920384986949E7|6.898716870414877E7|
|TRANSPORTATION|2006|6.898716870414877E7|8.569399088934827E7|
|TRANSPO

In [86]:
# Build the final per-(sector, year) report in one chain:
#   1. percentage = 100 * (actualQuote - previousQuote) / previousQuote
#   2. attach the volume/mean aggregates from `intermediate1`
#   3. keep only the reported columns
#   4. order by sector, then year, both descending
quote_now = F.col('actualQuote')
quote_before = F.col('previousQuote')

result = (
    intermediate3
    .withColumn('percentage', 100 * (quote_now - quote_before) / quote_before)
    .join(intermediate1, on=['sector', 'year'])
    .select('sector', 'year', 'volCompl', 'percentage', 'avg_volume')
    .sort(F.desc('sector'), F.desc('year'))
)

In [87]:
# Preview the first 15 rows of the final report.
result.show(n=15, truncate=True)

+--------------+----+-----------+-------------------+------------------+
|        sector|year|   volCompl|         percentage|        avg_volume|
+--------------+----+-----------+-------------------+------------------+
|TRANSPORTATION|2018|15782363800| -66.74198330411436|38.687121291895146|
|TRANSPORTATION|2017|24750549271|-52.630433699758335| 81.58107063923603|
|TRANSPORTATION|2016|26396056641|  386.5612861598666|182.52985818188077|
|TRANSPORTATION|2015|26978967802|-20.897666162900542| 38.55247294917033|
|TRANSPORTATION|2014|24014045100| -36.53828314725343|51.800322936979065|
|TRANSPORTATION|2013|20484526900| 20.844413724670144| 88.79075188995279|
|TRANSPORTATION|2012|20500094900|-17.183734834196077| 76.24410309342095|
|TRANSPORTATION|2011|24018175100| -42.20914493591234| 91.58842951034882|
|TRANSPORTATION|2010|25103251800| -58.86352260495319|164.75943292272206|
|TRANSPORTATION|2009|32166875600|  -77.3587550866839| 419.3358372236049|
|TRANSPORTATION|2008|34045323700| -49.6355953928580