In [5]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=4ac7d52feb7bd5deb47012c266b4b8bffdb7e0d856abe66e9202ceabbd2900b3
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, length, col, sum, count, regexp_extract

Программа, которая находит самое длинное слово.

In [6]:
spark = SparkSession.builder.appName("WikiAnalysis").getOrCreate()

df = spark.read.option("delimiter", "\t").csv("wiki.txt").toDF("url", "title", "text")

words_df = df.select(explode(split(col("text"), "\\s+")).alias("word"))
longest_word = words_df.withColumn("length", length(col("word"))) \
                       .orderBy(col("length").desc()) \
                       .first()

print(f"Самое длинное слово: {longest_word['word']}")

spark.stop()

Самое длинное слово: [https://ru.wikipedia.org/wiki/%D0%9C%D0%B5%D1%85%D0%B0%D0%BD%D0%B8%D0%BA%D0%B0_%D0%BA%D0%BE%D0%BD%D1%82%D0%B0%D0%BA%D1%82%D0%BD%D0%BE%D0%B3%D0%BE_%D0%B2%D0%B7%D0%B0%D0%B8%D0%BC%D0%BE%D0%B4%D0%B5%D0%B9%D1%81%D1%82%D0%B2%D0%B8%D1%8F#.D0.AD.D0.BD.D0.B5.D1.80.D0.B3.D0.B8.D1.8F_.D0.BF.D1.80.D0.B8_.D1.83.D0.BF.D1.80.D1.83.D0.B3.D0.BE.D0.BC_.D0.BA.D0.BE.D0.BD.D1.82.D0.B0.D0.BA.D1.82.D0.B5]


Программа, которая находит среднюю длину слов.

In [None]:
spark = SparkSession.builder.appName("WikiAnalysis").getOrCreate()

df = spark.read.option("delimiter", "\t").csv("wiki.txt").toDF("url", "title", "text")

words_df = df.select(explode(split(col("text"), "\\s+")).alias("word"))
average_length = words_df.withColumn("length", length(col("word"))) \
                         .selectExpr("avg(length) as avg_length") \
                         .first()

print(f"Средняя длина слов: {average_length['avg_length']}")

spark.stop()

Средняя длина слов: 6.53168068423701


Программа, которая находит самое частоупотребляемое слово, состоящее из латинских букв.

In [7]:
spark = SparkSession.builder.appName("WikiAnalysis").getOrCreate()

df = spark.read.option("delimiter", "\t").csv("wiki.txt").toDF("url", "title", "text")

words_df = df.select(explode(split(col("text"), "\\s+")).alias("word"))
latin_words_df = words_df.withColumn("latin_word", regexp_extract(col("word"), "^[a-zA-Z]+$", 0)) \
                         .filter(col("latin_word") != "")
most_common_word = latin_words_df.groupBy("latin_word") \
                                 .count() \
                                 .orderBy(col("count").desc()) \
                                 .first()

print(f"Самое частоупотребляемое слово: {most_common_word['latin_word']}")

spark.stop()

Самое частоупотребляемое слово: XX


Все слова, которые более чем в половине случаев начинаются с большой буквы и встречаются больше 10 раз

In [9]:
spark = SparkSession.builder.appName("WikiAnalysis").getOrCreate()
df = spark.read.option("delimiter", "\t").csv("wiki.txt").toDF("url", "title", "text")

words_df = df.select(explode(split(col("text"), "\\s+")).alias("word"))
capitalized_words_df = words_df.withColumn("is_capitalized", col("word").substr(1, 1).rlike("[A-ZА-Я]"))
grouped_words_df = capitalized_words_df.groupBy("word") \
                                       .agg(sum(col("is_capitalized").cast("int")).alias("capitalized_count"),
                                            count("word").alias("total_count"))

result_df = grouped_words_df.filter((col("capitalized_count") > col("total_count") / 2) &
                                    (col("total_count") > 10))

result_df.show(truncate=False)

spark.stop()


+-----------------+-----------------+-----------+
|word             |capitalized_count|total_count|
+-----------------+-----------------+-----------+
|Население —      |169              |169        |
|Всемирного       |186              |186        |
|XVII             |955              |955        |
|Дона,            |24               |24         |
|Ниже             |231              |231        |
|Принцип          |106              |106        |
|Николаевский     |14               |14         |
|Демократическая  |65               |65         |
|Северо-Восточного|14               |14         |
|Каспийское       |42               |42         |
|Подобная         |61               |61         |
|Ярославом        |11               |11         |
|Педру            |36               |36         |
|Новгорода —      |11               |11         |
|Русском          |52               |52         |
|Медицинский      |19               |19         |
|Ассамблея        |48               |48         |


Программа, которая с помощью статистики определяет устойчивые сокращения вида пр., др., ..

In [14]:
spark = SparkSession.builder.appName("AbbreviationAnalysis").getOrCreate()

df = spark.read.option("delimiter", "\t").csv("wiki.txt").toDF("url", "title", "text")

words_df = df.select(explode(split(col("text"), "\\s+")).alias("word"))
abbreviations_df = words_df.withColumn("abbreviation", regexp_extract(col("word"), r"\b[a-zA-Zа-яА-Я]+\.\b", 0)) \
                           .filter(col("abbreviation") != "")

abbreviation_counts = abbreviations_df.groupBy("abbreviation") \
                                      .agg(count("abbreviation").alias("count")) \
                                      .orderBy(col("count").desc())
abbreviation_counts.show(truncate=False)

spark.stop()


+------------+-----+
|abbreviation|count|
+------------+-----+
|т.          |340  |
|А.          |203  |
|www.        |173  |
|В.          |140  |
|н.          |108  |
|М.          |100  |
|г.          |89   |
|тыс.        |84   |
|С.          |74   |
|Mail.       |70   |
|Н.          |63   |
|млн.        |60   |
|кв.         |59   |
|с.          |53   |
|OpenOffice. |53   |
|ст.         |50   |
|И.          |50   |
|л.          |47   |
|П.          |46   |
|Ф.          |45   |
+------------+-----+
only showing top 20 rows



Найти устойчивые сокращения вида т.п., н.э.

In [15]:
spark = SparkSession.builder.appName("AbbreviationAnalysis").getOrCreate()

df = spark.read.option("delimiter", "\t").csv("wiki.txt").toDF("url", "title", "text")

words_df = df.select(explode(split(col("text"), "\\s+")).alias("word"))
compound_abbreviations_df = words_df.withColumn("compound_abbreviation", regexp_extract(col("word"), r"\b[a-zA-Zа-яА-Я]+\.[a-zA-Zа-яА-Я]+\.\b", 0)) \
                                    .filter(col("compound_abbreviation") != "")

compound_abbreviation_counts = compound_abbreviations_df.groupBy("compound_abbreviation") \
                                                       .agg(count("compound_abbreviation").alias("count")) \
                                                       .orderBy(col("count").desc())

compound_abbreviation_counts.show(truncate=False)

spark.stop()

+---------------------+-----+
|compound_abbreviation|count|
+---------------------+-----+
|э.д.                 |11   |
|R.E.                 |10   |
|тыс.кв.              |9    |
|ru.wikipedia.        |8    |
|и.т.                 |8    |
|А.М.                 |6    |
|д.м.                 |5    |
|д.и.                 |5    |
|Q.E.                 |5    |
|www.youtube.         |4    |
|www.iecat.           |4    |
|а.е.                 |4    |
|U.S.                 |4    |
|S.T.                 |4    |
|M.D.                 |4    |
|magazines.russ.      |3    |
|web.archive.         |3    |
|л.ед.                |3    |
|en.wikipedia.        |3    |
|www.bookcrossing.    |3    |
+---------------------+-----+
only showing top 20 rows

