In [1]:
import pyspark
import os
from pyspark import SparkContext, SparkConf 
from pyspark.sql import SparkSession
import pyspark.sql as sql
from datetime import datetime
from typing import NamedTuple

In [2]:
# Ask spark-submit to pull in the spark-xml package; this has to be set before
# the session (and therefore the JVM) is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.12:0.13.0 pyspark-shell'

# NOTE: despite its name, `sc` is a SparkSession (not a SparkContext); the name
# is kept because the following cells refer to it.
sc = (
    SparkSession.builder
    .appName("Lab1")
    .master("yarn")
    .getOrCreate()
)

In [3]:
# Read the language list and keep only the lower-cased first column as a plain
# Python list, so it can be used inside RDD lambdas.
# NOTE(review): the preview below skips index 0, presumably because the CSV's
# header row is read as data (no header option is passed) — confirm; if so,
# that header value also stays in the list used for tag matching.
language_rows = sc.read.csv("programming-languages.csv").collect()
programming_languages = [str(row[0]).lower() for row in language_rows]
programming_languages[1:11]

['a# .net',
 'a# (axiom)',
 'a-0 system',
 'a+',
 'a++',
 'abap',
 'abc',
 'abc algol',
 'abset',
 'absys']

In [4]:
# Load the posts dump with the spark-xml reader; every <row> element becomes
# one DataFrame row.
xml_reader = sc.read.format("xml").options(rowTag="row")
post_sample = xml_reader.load("posts_sample.xml")

# Peek at a single record to see the inferred columns.
post_sample.first()

Row(_AcceptedAnswerId=7, _AnswerCount=13, _Body="<p>I want to use a track-bar to change a form's opacity.</p>\n\n<p>This is my code:</p>\n\n<pre><code>decimal trans = trackBar1.Value / 5000;\nthis.Opacity = trans;\n</code></pre>\n\n<p>When I build the application, it gives the following error:</p>\n\n<blockquote>\n  <p>Cannot implicitly convert type <code>'decimal'</code> to <code>'double'</code></p>\n</blockquote>\n\n<p>I tried using <code>trans</code> and <code>double</code> but then the control doesn't work. This code worked fine in a past VB.NET project.</p>\n", _ClosedDate=None, _CommentCount=2, _CommunityOwnedDate=datetime.datetime(2012, 10, 31, 20, 42, 47, 213000), _CreationDate=datetime.datetime(2008, 8, 1, 2, 42, 52, 667000), _FavoriteCount=48, _Id=4, _LastActivityDate=datetime.datetime(2019, 7, 19, 5, 39, 54, 173000), _LastEditDate=datetime.datetime(2019, 7, 19, 5, 39, 54, 173000), _LastEditorDisplayName='Rich B', _LastEditorUserId=3641067, _OwnerDisplayName=None, _OwnerUserI

In [10]:
def find_language(x, languages=None):
    """Find the first known programming language among a post's tags.

    Args:
        x: A post row exposing ``_Id`` and ``_Tags``; tags are matched as
            ``<name>`` substrings of the lower-cased ``_Tags`` string.
        languages: Optional iterable of lower-case language names to look
            for, checked in iteration order. Defaults to the notebook-level
            ``programming_languages`` list.

    Returns:
        ``(x._Id, language)`` for the first language in ``languages`` whose
        tag occurs in the post, or ``None`` when no language matches or the
        post has no tags.
    """
    if x._Tags is None:
        # Untagged posts cannot match anything; avoid AttributeError on None.
        return None
    if languages is None:
        languages = programming_languages
    # Lower-case once instead of once per candidate language.
    tags = x._Tags.lower()
    for language in languages:
        if "<" + language + ">" in tags:
            return (x._Id, language)
    return None

def isValid(x, y):
    """Return True when post ``x`` is tagged and was created in calendar year ``y``.

    Args:
        x: A post row exposing ``_Tags`` and ``_CreationDate`` (a datetime).
        y: The calendar year to test against, as an int.

    Returns:
        ``False`` when the post has no tags or no creation date; otherwise
        whether the creation date falls anywhere within year ``y``.
    """
    if x._Tags is None or x._CreationDate is None:
        return False
    # The previous upper bound, datetime(y, 12, 31), is midnight at the *start*
    # of 31 December, so posts created later that day were dropped. Matching on
    # the year covers the whole of the last day.
    return x._CreationDate.year == y
    

In [11]:
from pyspark.sql.functions import col

# For every year 2010-2019, count the posts per detected language and keep the
# ten most frequent ones as a DataFrame with columns Language / Year_<year>.
res = {}
for y in range(2010, 2020):
    language_counts = (
        post_sample.rdd
        # Bind the year as a default argument so the filter does not depend on
        # when the closure over the loop variable happens to be serialised.
        .filter(lambda row, year=y: isValid(row, year))
        .map(find_language)
        .filter(lambda match: match is not None)
        # Key each (post id, language) pair by its language ...
        .keyBy(lambda match: match[1])
        # ... and count the pairs per key: +1 per record within a partition,
        # then sum the partial counts across partitions.
        .aggregateByKey(
            0,
            lambda count, _match: count + 1,
            lambda left, right: left + right,
        )
        .sortBy(lambda pair: pair[1], ascending=False)
    )
    res[y] = (
        language_counts.toDF()
        .select(col("_1").alias("Language"), col("_2").alias("Year_{0}".format(y)))
        .limit(10)
    )
    res[y].show()

+-----------+---------+
|   Language|Year_2010|
+-----------+---------+
|       java|       52|
| javascript|       44|
|        php|       42|
|     python|       25|
|objective-c|       22|
|          c|       20|
|       ruby|       11|
|     delphi|        7|
|applescript|        3|
|          r|        3|
+-----------+---------+

+-----------+---------+
|   Language|Year_2011|
+-----------+---------+
|        php|       97|
|       java|       92|
| javascript|       82|
|     python|       35|
|objective-c|       33|
|          c|       24|
|       ruby|       17|
|       perl|        8|
|     delphi|        8|
|       bash|        7|
+-----------+---------+

+-----------+---------+
|   Language|Year_2012|
+-----------+---------+
|        php|      136|
| javascript|      129|
|       java|      124|
|     python|       65|
|objective-c|       45|
|          c|       27|
|       ruby|       25|
|       bash|        9|
|          r|        9|
|     matlab|        6|
+-----------+-

In [65]:
# Write each year's top-10 table to a Parquet directory named after the year.
# The per-year DataFrames are stored in `res`; `result` is not defined anywhere
# in the visible notebook, so iterating it fails on a fresh kernel.
# Note: with the default save mode, Spark raises if the target path already exists.
for year, top_languages in res.items():
    top_languages.write.format("parquet").save("{0}".format(year))