In [3]:
!pip3 install pyspark==3.0.0

Collecting pyspark==3.0.0
  Downloading pyspark-3.0.0.tar.gz (204.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.7/204.7 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9 (from pyspark==3.0.0)
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.0-py2.py3-none-any.whl size=205044159 sha256=b6eec6050a3124caebc2a1f14874f5a1bdee839c84054cf00883e0bca5184c0e
  Stored in directory: /root/.cache/pip/wheels/b1/bb/8b/ca24d3f756f2ed967225b0871898869db676eb5846df5adc56
Successfully built pyspark
Installing collected packages: py4j, pyspark
  Attempting uninstall: py4j
    Found existing installation: py4j 0

In [23]:
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pyspark.sql as sql
import xml.etree.ElementTree as ET
from pyspark.sql.functions import explode
from pyspark.sql import Window
from pyspark.sql.functions import row_number
import requests

In [5]:
# Request the spark-xml package (used below via format('xml')) through the
# submit arguments; this has to be in place before the SparkContext is started.
os.environ.update({
    'PYSPARK_SUBMIT_ARGS': '--packages com.databricks:spark-xml_2.12:0.14.0 pyspark-shell',
})

In [24]:
# Obtain a SparkContext: reuse/create one via getOrCreate(), and if that fails
# fall back to an explicit single-core local context named "lb2".
try:
    sc = SparkContext.getOrCreate()
except Exception:
    # Narrowed from a bare `except:` so that KeyboardInterrupt / SystemExit
    # are not swallowed while the context is starting up.
    conf = SparkConf().setAppName("lb2").setMaster('local[1]')
    sc = SparkContext(conf=conf)

# Only log errors, so cell output is not flooded with Spark messages.
sc.setLogLevel("ERROR")
spark = SparkSession(sc)

sc

In [None]:
# Download both input data sets into the working directory.
# The request is made and its HTTP status checked *before* the target file is
# opened, so a failed download raises instead of leaving behind a truncated
# file or one that contains an HTTP error page.
for file_name in ("posts_sample.xml", "programming-languages.csv"):
    request = requests.get(
        "https://git.ai.ssau.ru/tk/big_data/raw/branch/bachelor/data/" + file_name,
        timeout=60,  # do not hang forever if the server stops responding
    )
    request.raise_for_status()
    with open(file_name, "wb") as f:
        f.write(request.content)

In [13]:
# Load the posts XML (root element <posts>, one record per <row>) and keep
# only the three attributes used further down.
postsSample = (
    spark.read.format('xml')
    .options(rootTag='posts', rowTag='row')
    .load("posts_sample.xml")
    .select("_CreationDate", "_ViewCount", "_Tags")
)

In [15]:
# Load the language list; the first CSV line is a header and column types are inferred.
languages = spark.read.csv(
    "programming-languages.csv",
    header=True,
    inferSchema=True,
)

languages

DataFrame[name: string, wikipedia_url: string]

In [None]:
# Lower-cased language names, used to pick out the tags that are languages.
names = [row[0].lower() for row in languages.select("name").collect()]

# Total views per (year, language):
#   1. keep posts that have tags and turn each into (year, views, [tags]);
#   2. keep posts whose year is within 2010..2020;
#   3. explode the tag list to one row per tag and keep tags found in `names`;
#   4. sum the views for every (year, language) pair.
ps = (
    postsSample
    .filter(col("_Tags").isNotNull())
    .rdd
    .map(lambda post: (
        str(post["_CreationDate"]).split('-')[0],  # text before the first '-'
        post["_ViewCount"],
        post["_Tags"][1:-1].split('><'),           # strip first/last char, split on '><'
    ))
    .toDF(["year", "views", "languages"])
    .filter(col("year").between(2010, 2020))
    .select("year", "views", explode("languages").alias("language"))
    .filter(col("language").isin(names))
    .groupBy("year", "language")
    .agg({"views": "sum"})
)

ps.show()

In [None]:
# Rank the languages inside each year by their summed views, highest first.
window = Window.partitionBy(ps['year']).orderBy(ps['sum(views)'].desc())

# Keep the ten top-ranked languages per year and expose the aggregate as "views".
ps10 = (
    ps
    .select('*', row_number().over(window).alias('row_number'))
    .filter(col('row_number') <= 10)
    .withColumnRenamed("sum(views)", "views")
    # Sort by the renamed column: after the rename above, "sum(views)" is no
    # longer part of the output schema, so it must not be referenced here.
    .orderBy(col("year").asc(), col("views").desc())
)

ps10.show()

In [22]:
# Persist the per-year top-10 table as Parquet, replacing any previous output.
ps10.write.parquet("top_langs.parquet", mode='overwrite')