In [41]:
pip install --upgrade pyspark



In [46]:
import os
import sys
import pyspark.sql.functions as F
from pyspark.sql import Row
from pyspark.sql import SparkSession
import subprocess

In [47]:
# Point both the PySpark workers and the driver at the interpreter running this
# kernel, and request the spark-xml package through the submit arguments.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
    'PYSPARK_SUBMIT_ARGS': '--packages com.databricks:spark-xml_2.12:0.17.0 pyspark-shell',
})

In [48]:
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

Скачаем выборку данных в формате XML

In [50]:
# Download the XML sample only when it is not on disk yet. Re-running wget
# unconditionally would store another copy under a new name on every execution.
# The transfer goes to a temporary name and is renamed only after wget succeeds,
# so an interrupted download is never mistaken for the complete file.
if not os.path.exists("posts_sample.xml"):
    try:
        subprocess.run(
            [
                "wget",
                "-O", "posts_sample.xml.part",
                "https://git.ai.ssau.ru/tk/big_data/raw/branch/master/data/posts_sample.xml",
            ],
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Ошибка загрузки файла: {e}")
        raise
    os.replace("posts_sample.xml.part", "posts_sample.xml")

# Parse every <row> element of the file into one DataFrame row.
try:
    postsData = (
        spark.read.format('xml')
        .option('rowTag', 'row')
        .option("timestampFormat", "y/M/d H:m:s")
        .load('posts_sample.xml')
    )
except Exception as e:
    print(f"Ошибка чтения XML: {e}")
    raise

In [51]:
# Load the posts sample with the same reader options as the previous cell
# (this repeats that load and rebinds postsData).
xml_reader = spark.read.format('xml')
xml_reader = xml_reader.option('rowTag', 'row')
xml_reader = xml_reader.option("timestampFormat", 'y/M/d H:m:s')
postsData = xml_reader.load('posts_sample.xml')

In [53]:
# Count once and reuse the number: every count() call launches a separate Spark job.
posts_count = postsData.count()
if posts_count == 0:
    print("DataFrame пуст")
else:
    print(f"Количество элементов: {posts_count}")

print("Схема данных:")
postsData.printSchema()

# Vertical layout keeps the wide rows readable.
print("Первые 5 элементов:")
postsData.show(n=5, vertical=True)

Количество элементов: 46006
Схема данных:
root
 |-- _AcceptedAnswerId: long (nullable = true)
 |-- _AnswerCount: long (nullable = true)
 |-- _Body: string (nullable = true)
 |-- _ClosedDate: timestamp (nullable = true)
 |-- _CommentCount: long (nullable = true)
 |-- _CommunityOwnedDate: timestamp (nullable = true)
 |-- _CreationDate: timestamp (nullable = true)
 |-- _FavoriteCount: long (nullable = true)
 |-- _Id: long (nullable = true)
 |-- _LastActivityDate: timestamp (nullable = true)
 |-- _LastEditDate: timestamp (nullable = true)
 |-- _LastEditorDisplayName: string (nullable = true)
 |-- _LastEditorUserId: long (nullable = true)
 |-- _OwnerDisplayName: string (nullable = true)
 |-- _OwnerUserId: long (nullable = true)
 |-- _ParentId: long (nullable = true)
 |-- _PostTypeId: long (nullable = true)
 |-- _Score: long (nullable = true)
 |-- _Tags: string (nullable = true)
 |-- _Title: string (nullable = true)
 |-- _ViewCount: long (nullable = true)

Первые 5 элементов:
-RECORD 0------

Фильтруем записи в диапазоне дат с 2010 по 2020 год

In [54]:
# Inclusive first and last calendar day of the reporting window.
dates = ("2010-01-01",  "2020-12-31")
# Compare at date granularity. Applying between() to the raw timestamp would treat
# the upper bound as the start of 2020-12-31 and drop posts created later that day.
posts_by_date = postsData.filter(F.to_date(F.col("_CreationDate")).between(*dates))
posts_by_date.show(10)

+-----------------+------------+--------------------+-----------+-------------+--------------------+--------------------+--------------+-------+--------------------+--------------------+----------------------+-----------------+-----------------+------------+---------+-----------+------+-----+------+----------+
|_AcceptedAnswerId|_AnswerCount|               _Body|_ClosedDate|_CommentCount| _CommunityOwnedDate|       _CreationDate|_FavoriteCount|    _Id|   _LastActivityDate|       _LastEditDate|_LastEditorDisplayName|_LastEditorUserId|_OwnerDisplayName|_OwnerUserId|_ParentId|_PostTypeId|_Score|_Tags|_Title|_ViewCount|
+-----------------+------------+--------------------+-----------+-------------+--------------------+--------------------+--------------+-------+--------------------+--------------------+----------------------+-----------------+-----------------+------------+---------+-----------+------+-----+------+----------+
|             NULL|        NULL|<p>No. (And more ...|       NULL

Загрузка CSV с языками программирования

In [57]:
# Fetch the CSV only when it is missing. The transfer goes to a temporary name and
# is renamed after wget succeeds, so an interrupted download is never mistaken for
# the complete file and repeated runs do not pile up duplicate copies.
if not os.path.exists("programming-languages.csv"):
    try:
        subprocess.run(
            [
                "wget",
                "-O", "programming-languages.csv.part",
                "https://git.ai.ssau.ru/tk/big_data/raw/branch/master/data/programming-languages.csv",
            ],
            check=True,
        )
    except Exception as e:
        # Chain explicitly so the original failure stays attached as the cause.
        raise Exception(f"Ошибка загрузки файла: {e}") from e
    os.replace("programming-languages.csv.part", "programming-languages.csv")

# Read the CSV with a header row and inferred column types; drop fully empty rows.
try:
    languagesData = (
        spark.read.format("csv")
        .option("header", True)
        .option("inferSchema", True)
        .load("programming-languages.csv")
        .dropna(how="all")
    )
except Exception as e:
    raise Exception(f"Ошибка чтения CSV: {e}") from e


In [60]:
# Count once and reuse the number: every count() call launches a separate Spark job.
languages_count = languagesData.count()
if languages_count == 0:
    print("DataFrame languagesData пуст")
else:
    print(f"\nКоличество языков программирования: {languages_count}")
    print("\nСхема данных:")
    languagesData.printSchema()

    print("\nПервые 5 языков:")
    languagesData.show(5, vertical=True)


Количество языков программирования: 700

Схема данных:
root
 |-- name: string (nullable = true)
 |-- wikipedia_url: string (nullable = true)


Первые 5 языков:
-RECORD 0-----------------------------
 name          | A# .NET              
 wikipedia_url | https://en.wikipe... 
-RECORD 1-----------------------------
 name          | A# (Axiom)           
 wikipedia_url | https://en.wikipe... 
-RECORD 2-----------------------------
 name          | A-0 System           
 wikipedia_url | https://en.wikipe... 
-RECORD 3-----------------------------
 name          | A+                   
 wikipedia_url | https://en.wikipe... 
-RECORD 4-----------------------------
 name          | A++                  
 wikipedia_url | https://en.wikipe... 
only showing top 5 rows



In [61]:
# Bring the language table to the driver and keep the first column of every row
# as a plain Python string.
language_names = list(map(str, (record[0] for record in languagesData.collect())))
language_names

['A# .NET',
 'A# (Axiom)',
 'A-0 System',
 'A+',
 'A++',
 'ABAP',
 'ABC',
 'ABC ALGOL',
 'ABSET',
 'ABSYS',
 'ACC',
 'Accent',
 'Ace DASL',
 'ACL2',
 'ACT-III',
 'Action!',
 'ActionScript',
 'Ada',
 'Adenine',
 'Agda',
 'Agilent VEE',
 'Agora',
 'AIMMS',
 'Alef',
 'ALF',
 'ALGOL 58',
 'ALGOL 60',
 'ALGOL 68',
 'ALGOL W',
 'Alice',
 'Alma-0',
 'AmbientTalk',
 'Amiga E',
 'AMOS',
 'AMPL',
 'Apex (Salesforce.com)',
 'APL',
 "App Inventor for Android's visual block language",
 'AppleScript',
 'Arc',
 'ARexx',
 'Argus',
 'AspectJ',
 'Assembly language',
 'ATS',
 'Ateji PX',
 'AutoHotkey',
 'Autocoder',
 'AutoIt',
 'AutoLISP / Visual LISP',
 'Averest',
 'AWK',
 'Axum',
 'B',
 'Babbage',
 'Bash',
 'BASIC',
 'bc',
 'BCPL',
 'BeanShell',
 'Batch (Windows/Dos)',
 'Bertrand',
 'BETA',
 'Bigwig',
 'Bistro',
 'BitC',
 'BLISS',
 'Blockly',
 'BlooP',
 'Blue',
 'Boo',
 'Boomerang',
 'Bourne shell (including',
 'bash and',
 'ksh )',
 'BREW',
 'BPEL',
 'C',
 'C--',
 'C++ – ISO/IEC 14882',
 'C# – ISO/IEC

In [62]:
def includes_name(x, names=None):
    """Pair a post's creation date with the first known language found in its tags.

    Args:
        x: A post row exposing ``_Tags`` and ``_CreationDate`` attributes.
        names: Candidate language names, checked in order. When omitted, the
            notebook-level ``language_names`` list is used.

    Returns:
        ``(creation_date, name)`` for the first name whose lower-cased form,
        wrapped in angle brackets, occurs in the lower-cased tag string, or
        ``(creation_date, 'No')`` when no name matches.
    """
    if names is None:
        names = language_names

    # Lower-case the tag string once instead of once per candidate name.
    tags = str(x._Tags).lower()
    # Read the date by field name; a positional index only holds while the
    # column order of the posts DataFrame stays exactly as it is.
    created = x._CreationDate

    for name in names:
        if '<' + name.lower() + '>' in tags:
            return (created, name)
    return (created, 'No')

Преобразование DataFrame в RDD, группировка и агрегация, подготовка списка годов, создание итогового DataFrame и сохранение отчета в формате Apache Parquet.

In [65]:
# Tag every post with (creation date, language) and drop posts without a known language.
posts_by_date_rdd = posts_by_date.rdd.map(includes_name).filter(lambda x: x[1] != 'No')

# Count posts per (year, language) and collect the counts to the driver, largest first.
# The sort uses the count only; no secondary key breaks ties.
posts_by_date_rdd_group = (
    posts_by_date_rdd
    .keyBy(lambda row: (row[0].year, row[1]))
    .aggregateByKey(0, lambda x, y: x + 1, lambda x1, x2: x1 + x2)
    .sortBy(lambda x: x[1], ascending=False)
    .collect()
)

# Walk the years from 2020 down to 2010 and keep the ten largest counts of each.
years = list(range(2020, 2009, -1))
df_by_years = []
for year in years:
    df_by_years.extend([row for row in posts_by_date_rdd_group if row[0][0] == year][:10])

# Flatten ((year, language), count) into Year / Language / Count rows.
row_template = Row('Year', 'Language', 'Count')
result_df = spark.createDataFrame([row_template(*x, y) for x, y in df_by_years])
result_df.show()

# Overwrite a report left by an earlier run; without an explicit mode the write
# raises an error when the target path already exists, which breaks re-runs.
result_df.write.mode("overwrite").parquet("top_10_languages.parquet")


+----+----------+-----+
|Year|  Language|Count|
+----+----------+-----+
|2019|    Python|  162|
|2019|JavaScript|  131|
|2019|      Java|   95|
|2019|       PHP|   59|
|2019|         R|   36|
|2019|         C|   14|
|2019|      Dart|    9|
|2019|    MATLAB|    9|
|2019|        Go|    9|
|2019|      Bash|    8|
|2018|    Python|  214|
|2018|JavaScript|  196|
|2018|      Java|  145|
|2018|       PHP|   99|
|2018|         R|   63|
|2018|         C|   24|
|2018|     Scala|   22|
|2018|TypeScript|   21|
|2018|PowerShell|   13|
|2018|      Bash|   12|
+----+----------+-----+
only showing top 20 rows

