In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession,DataFrame
from pyspark.sql.functions import col, round

# Spark configuration: application name and master URL for this notebook.
conf = SparkConf()
conf.setAppName('Simple_Spark')
conf.setMaster('local[*]')

# Spark session built from the configuration above.
# Hive support is not enabled; chain .enableHiveSupport() before
# .getOrCreate() if it is needed.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

24/09/04 21:55:37 WARN Utils: Your hostname, antonio-f513 resolves to a loopback address: 127.0.1.1; using 192.168.100.11 instead (on interface wlo1)
24/09/04 21:55:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/04 21:55:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the people records from the JSON file and print them.
df = spark.read.format("json").load("people.json")
df.show()

+---+-----------+-----+
|age| department| name|
+---+-----------+-----+
| 30|         HR| John|
| 25|    Finance|  Doe|
| 35|         HR| Jane|
| 40|    Finance| Mark|
| 23|Engineering|Smith|
+---+-----------+-----+



In [6]:
# Keep only the rows where age is 30 or more, then print them.
filtered_df = df.where(col("age") >= 30)
filtered_df.show()

+---+----------+----+
|age|department|name|
+---+----------+----+
| 30|        HR|John|
| 35|        HR|Jane|
| 40|   Finance|Mark|
+---+----------+----+



In [24]:
# NOTE(review): from this cell onward the names `sum` and `max` refer to the
# pyspark.sql.functions versions, not the Python built-ins.
from pyspark.sql.functions import sum, avg, max, count

# Per-department summary: average age and the count of `name` values.
agg_df = df.groupBy("department").agg(
    avg("age").alias("avg_age"),
    count("name").alias("count_name"),
)
agg_df.show()

+-----------+-------+----------+
| department|avg_age|count_name|
+-----------+-------+----------+
|Engineering|   23.0|         1|
|         HR|   32.5|         2|
|    Finance|   32.5|         2|
+-----------+-------+----------+



In [22]:
# Print the people sorted by age (ascending by default).
df.sort("age").show()

+---+-----------+-----+
|age| department| name|
+---+-----------+-----+
| 23|Engineering|Smith|
| 25|    Finance|  Doe|
| 30|         HR| John|
| 35|         HR| Jane|
| 40|    Finance| Mark|
+---+-----------+-----+



In [27]:
# Read both JSON inputs; the order of the paths matches the order of the targets.
people_df, departments_df = (
    spark.read.json(json_path)
    for json_path in ("people.json", "departments.json")
)

In [28]:
# Print the departments DataFrame to inspect its columns and rows.
departments_df.show()

+---------------+---+
|department_name| id|
+---------------+---+
|             HR|  1|
|        Finance|  2|
|    Engineering|  3|
+---------------+---+



In [29]:
# Spark SQL: register both DataFrames as temporary views so that they can be
# referenced by name ("people", "departments") in SQL queries.
for view_name, frame in (("people", people_df), ("departments", departments_df)):
    frame.createOrReplaceTempView(view_name)

In [32]:
# Join people to departments by department name through the temporary views
# and keep the person's name, age and the department name.
join_query = """
SELECT p.name, p.age, d.department_name
FROM people p
JOIN departments d ON d.department_name = p.department
"""
join_df = spark.sql(join_query)
join_df.show()

+-----+---+---------------+
| name|age|department_name|
+-----+---+---------------+
| John| 30|             HR|
|  Doe| 25|        Finance|
| Jane| 35|             HR|
| Mark| 40|        Finance|
|Smith| 23|    Engineering|
+-----+---+---------------+



In [33]:
join_df.write.csv("output.csv", header=True)
!ls -lh output.csv

итого 4,0K
-rw-r--r-- 1 antonio antonio 99 сен  4 22:25 part-00000-e7b2a878-0751-443b-bb79-afabbe8c11a4-c000.csv
-rw-r--r-- 1 antonio antonio  0 сен  4 22:25 _SUCCESS


In [None]:
# Joins: inner, left, right, full, cross, and join with an extra condition

In [42]:
# Column names for the two example DataFrames.
people_columns = ["name", "age", "department_id"]
departments_columns = ["id", "department_name"]

# Example rows for the people DataFrame.
# "Mike" points at department_id 5, which has no row in departments_data.
people_data = [
    ("John", 30, 1),
    ("Doe", 25, 2),
    ("Jane", 35, 1),
    ("Mark", 40, 2),
    ("Smith", 23, 3),
    ("Mike", 33, 5),
]

# Example rows for the departments DataFrame.
# "Marketing" (id 4) is not referenced by any row in people_data.
departments_data = [
    (1, "HR"),
    (2, "Finance"),
    (3, "Engineering"),
    (4, "Marketing"),
]

# These rebind people_df / departments_df, replacing the JSON-based frames.
people_df = spark.createDataFrame(people_data, people_columns)
departments_df = spark.createDataFrame(departments_data, departments_columns)

In [43]:
# Inner join: only people whose department_id matches a department id.
inner_join_df = people_df.alias("p").join(
    departments_df.alias("d"),
    on=col("p.department_id") == col("d.id"),
    how="inner",
)
inner_join_df.show()

+-----+---+-------------+---+---------------+
| name|age|department_id| id|department_name|
+-----+---+-------------+---+---------------+
| John| 30|            1|  1|             HR|
| Jane| 35|            1|  1|             HR|
|  Doe| 25|            2|  2|        Finance|
| Mark| 40|            2|  2|        Finance|
|Smith| 23|            3|  3|    Engineering|
+-----+---+-------------+---+---------------+



In [44]:
# Left join: every person is kept; people without a matching department
# get NULL in the department columns.
left_join_df = people_df.alias("p").join(
    departments_df.alias("d"),
    on=col("p.department_id") == col("d.id"),
    how="left",
)
left_join_df.show()

+-----+---+-------------+----+---------------+
| name|age|department_id|  id|department_name|
+-----+---+-------------+----+---------------+
| John| 30|            1|   1|             HR|
| Jane| 35|            1|   1|             HR|
|  Doe| 25|            2|   2|        Finance|
| Mark| 40|            2|   2|        Finance|
| Mike| 33|            5|NULL|           NULL|
|Smith| 23|            3|   3|    Engineering|
+-----+---+-------------+----+---------------+



In [45]:
# Right join: every department is kept; departments without any people
# get NULL in the people columns.
right_join_df = people_df.alias("p").join(
    departments_df.alias("d"),
    on=col("p.department_id") == col("d.id"),
    how="right",
)
right_join_df.show()

+-----+----+-------------+---+---------------+
| name| age|department_id| id|department_name|
+-----+----+-------------+---+---------------+
| Jane|  35|            1|  1|             HR|
| John|  30|            1|  1|             HR|
| Mark|  40|            2|  2|        Finance|
|  Doe|  25|            2|  2|        Finance|
|Smith|  23|            3|  3|    Engineering|
| NULL|NULL|         NULL|  4|      Marketing|
+-----+----+-------------+---+---------------+



In [46]:
# Full join: rows from both sides are kept; the side without a match is
# filled with NULL.
full_join_df = people_df.alias("p").join(
    departments_df.alias("d"),
    on=col("p.department_id") == col("d.id"),
    how="full",
)
full_join_df.show()

+-----+----+-------------+----+---------------+
| name| age|department_id|  id|department_name|
+-----+----+-------------+----+---------------+
| John|  30|            1|   1|             HR|
| Jane|  35|            1|   1|             HR|
|  Doe|  25|            2|   2|        Finance|
| Mark|  40|            2|   2|        Finance|
|Smith|  23|            3|   3|    Engineering|
| NULL|NULL|         NULL|   4|      Marketing|
| Mike|  33|            5|NULL|           NULL|
+-----+----+-------------+----+---------------+



In [47]:
# Cross join: pair every row of people_df with every row of departments_df
# (no join condition is given).
cross_join = people_df.crossJoin(departments_df)
cross_join.show()



+-----+---+-------------+---+---------------+
| name|age|department_id| id|department_name|
+-----+---+-------------+---+---------------+
| John| 30|            1|  1|             HR|
| John| 30|            1|  2|        Finance|
| John| 30|            1|  3|    Engineering|
| John| 30|            1|  4|      Marketing|
|  Doe| 25|            2|  1|             HR|
| Jane| 35|            1|  1|             HR|
|  Doe| 25|            2|  2|        Finance|
| Jane| 35|            1|  2|        Finance|
|  Doe| 25|            2|  3|    Engineering|
| Jane| 35|            1|  3|    Engineering|
|  Doe| 25|            2|  4|      Marketing|
| Jane| 35|            1|  4|      Marketing|
| Mark| 40|            2|  1|             HR|
| Mark| 40|            2|  2|        Finance|
| Mark| 40|            2|  3|    Engineering|
| Mark| 40|            2|  4|      Marketing|
|Smith| 23|            3|  1|             HR|
| Mike| 33|            5|  1|             HR|
|Smith| 23|            3|  2|     

                                                                                

In [50]:
# Join with a compound condition: the department ids must match AND the
# person's age must be at least 30. No join type is passed, so the default
# join type is used.
cond_join_df = people_df.alias("p").join(
    departments_df.alias("d"),
    on=(col("p.department_id") == col("d.id")) & (col("age") >= 30),
)
cond_join_df.show()

+----+---+-------------+---+---------------+
|name|age|department_id| id|department_name|
+----+---+-------------+---+---------------+
|John| 30|            1|  1|             HR|
|Jane| 35|            1|  1|             HR|
|Mark| 40|            2|  2|        Finance|
+----+---+-------------+---+---------------+

