In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession,DataFrame
from pyspark.sql.functions import col, round, to_date, year, month, sum, avg

conf = (
     SparkConf()
    .setAppName('Simple_Spark')
    .setMaster('local[*]')
)


# Spark сессия
spark = (
    SparkSession
    .builder
    .config(conf=conf)
    # .enableHiveSupport()
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
!hdfs dfs -ls -h ./data_sets/

Found 5 items
-rw-r-----   3 akbogdanov hdfs      7.5 M 2024-04-15 16:09 data_sets/airport-codes.csv
-rw-r-----   3 akbogdanov hdfs        314 2024-09-06 11:11 data_sets/authors.csv
-rw-r-----   3 akbogdanov hdfs        824 2024-09-06 11:11 data_sets/books.csv
-rw-r--r--   3 akbogdanov hdfs      2.1 K 2023-09-04 22:22 data_sets/speech_ivr.csv
-rw-r-----   3 akbogdanov hdfs     75.1 K 2024-09-05 16:21 data_sets/weather_data.csv


In [4]:
author_df = spark.read.csv("./data_sets/authors.csv", header=True)
author_df.show()

                                                                                

+---------+---------+----------+---------+
|author_id|     name|birth_date|  country|
+---------+---------+----------+---------+
|        1| Author_1|1960-12-31|    India|
|        2| Author_2|1965-12-31|   Canada|
|        3| Author_3|1970-12-31|      USA|
|        4| Author_4|1975-12-31|       UK|
|        5| Author_5|1980-12-31|      USA|
|        6| Author_6|1985-12-31|      USA|
|        7| Author_7|1990-12-31|      USA|
|        8| Author_8|1995-12-31|Australia|
|        9| Author_9|2000-12-31|Australia|
|       10|Author_10|2005-12-31|    India|
+---------+---------+----------+---------+



In [5]:
books_df = spark.read.csv("./data_sets/books.csv", header=True)
books_df.show()

+-------+-------+---------+-----------+-----+------------+
|book_id|  title|author_id|      genre|price|publish_date|
+-------+-------+---------+-----------+-----+------------+
|      1| Book_1|        2|    Mystery|73.57|  1980-12-31|
|      2| Book_2|        1|Non-Fiction| 41.1|  1982-12-31|
|      3| Book_3|       10|    Fiction|10.63|  1984-12-31|
|      4| Book_4|        9|Non-Fiction|46.31|  1986-12-31|
|      5| Book_5|        7|    Science|31.13|  1988-12-31|
|      6| Book_6|        4|Non-Fiction| 83.7|  1990-12-31|
|      7| Book_7|        6|Non-Fiction|40.36|  1992-12-31|
|      8| Book_8|        2|Non-Fiction|84.48|  1994-12-31|
|      9| Book_9|        7|    Fantasy|10.05|  1996-12-31|
|     10|Book_10|        2|    Science| 37.7|  1998-12-31|
|     11|Book_11|       10|Non-Fiction| 31.7|  2000-12-31|
|     12|Book_12|        8|Non-Fiction|31.02|  2002-12-31|
|     13|Book_13|        8|Non-Fiction|16.14|  2004-12-31|
|     14|Book_14|        1|    Fiction|26.84|  2006-12-3

In [15]:
# Преобразуйте столбцы publish_date и birth_date в формат даты.

author_tr_df = (author_df
.withColumn("birth_date", to_date(col("birth_date")))
)
author_tr_df.printSchema()

root
 |-- author_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- birth_date: date (nullable = true)
 |-- country: string (nullable = true)



In [16]:
books_tr_df = (books_df
.withColumn("publish_date", to_date(col("publish_date")))
.withColumn("price", col("price").cast("Double"))
)
books_tr_df.printSchema()

root
 |-- book_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author_id: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- price: double (nullable = true)
 |-- publish_date: date (nullable = true)



In [17]:
# Объедините таблицы books и authors по author_id

library_df = (
    author_tr_df.alias("a")
    .join(books_tr_df.alias("b"), col("a.author_id") == col("b.author_id"))
)
library_df.show()

+---------+---------+----------+---------+-------+-------+---------+-----------+-----+------------+
|author_id|     name|birth_date|  country|book_id|  title|author_id|      genre|price|publish_date|
+---------+---------+----------+---------+-------+-------+---------+-----------+-----+------------+
|        2| Author_2|1965-12-31|   Canada|      1| Book_1|        2|    Mystery|73.57|  1980-12-31|
|        1| Author_1|1960-12-31|    India|      2| Book_2|        1|Non-Fiction| 41.1|  1982-12-31|
|       10|Author_10|2005-12-31|    India|      3| Book_3|       10|    Fiction|10.63|  1984-12-31|
|        9| Author_9|2000-12-31|Australia|      4| Book_4|        9|Non-Fiction|46.31|  1986-12-31|
|        7| Author_7|1990-12-31|      USA|      5| Book_5|        7|    Science|31.13|  1988-12-31|
|        4| Author_4|1975-12-31|       UK|      6| Book_6|        4|Non-Fiction| 83.7|  1990-12-31|
|        6| Author_6|1985-12-31|      USA|      7| Book_7|        6|Non-Fiction|40.36|  1992-12-31|


In [18]:
# Найдите топ-5 авторов, книги которых принесли наибольшую выручку.
(library_df
 .groupby("name")
 .agg(sum(col("price")).alias("revenue"))
 .select("name", "revenue")
 .orderBy(col("revenue").desc())
 .limit(5)
 .show()
)


+--------+-------+
|    name|revenue|
+--------+-------+
|Author_2| 231.97|
|Author_7| 132.66|
|Author_1| 111.86|
|Author_8| 107.16|
|Author_5|  88.83|
+--------+-------+



In [22]:
# Найдите количество книг в каждом жанре
(
    library_df
    .groupBy("genre")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+-----------+-----+
|      genre|count|
+-----------+-----+
|Non-Fiction|    9|
|    Science|    3|
|    Fiction|    3|
|    Fantasy|    3|
|    Mystery|    2|
+-----------+-----+



In [24]:
# Подсчитайте среднюю цену книг по каждому автору
(
    library_df
    .groupBy("name")
    .agg(avg(col("price")).alias("avg_price"))
    .orderBy(col("avg_price").desc())
    .show()
)


+---------+-----------------+
|     name|        avg_price|
+---------+-----------------+
| Author_5|            88.83|
| Author_4|             83.7|
| Author_2|          57.9925|
| Author_9|            46.31|
| Author_7|            44.22|
| Author_6|           43.965|
| Author_1|37.28666666666667|
| Author_8|            35.72|
|Author_10|           21.165|
+---------+-----------------+



                                                                                

In [27]:
# Найдите книги, опубликованные после 2000 года, и отсортируйте их по цене.

(
    library_df
    .filter(year(col("publish_date")) >= 2000)
    .orderBy(col("price").desc())
    .show()
)

+---------+---------+----------+---------+-------+-------+---------+-----------+-----+------------+
|author_id|     name|birth_date|  country|book_id|  title|author_id|      genre|price|publish_date|
+---------+---------+----------+---------+-------+-------+---------+-----------+-----+------------+
|        7| Author_7|1990-12-31|      USA|     20|Book_20|        7|    Mystery|91.48|  2018-12-31|
|        5| Author_5|1980-12-31|      USA|     19|Book_19|        5|    Science|88.83|  2016-12-31|
|        8| Author_8|1995-12-31|Australia|     15|Book_15|        8|    Fantasy| 60.0|  2008-12-31|
|        6| Author_6|1985-12-31|      USA|     17|Book_17|        6|    Fantasy|47.57|  2012-12-31|
|        1| Author_1|1960-12-31|    India|     18|Book_18|        1|Non-Fiction|43.92|  2014-12-31|
|        2| Author_2|1965-12-31|   Canada|     16|Book_16|        2|    Fiction|36.22|  2010-12-31|
|       10|Author_10|2005-12-31|    India|     11|Book_11|       10|Non-Fiction| 31.7|  2000-12-31|


In [28]:
spark.stop()