In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.functions import when
from pyspark.sql.types import *
from pyspark.sql.window import Window as w

spark = SparkSession.builder \
      .master("local") \
      .appName("Spark") \
      .config("spark.executor.memory", "10g")\
      .config("spark.executor.cores", 5) \
      .config("spark.dynamicAllocation.enabled", "true") \
      .config("spark.dynamicAllocation.maxExecutors", 5) \
      .config("spark.shuffle.service.enabled", "true") \
      .getOrCreate() 

23/08/30 09:11:56 WARN Utils: Your hostname, MacBook-Pro-Aleksey.local resolves to a loopback address: 127.0.0.1; using 10.122.249.45 instead (on interface en0)
23/08/30 09:11:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/30 09:11:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Задание 4.8.1

In [2]:
data = [(1, 1562007679), (1, 1562007710), (1, 1562007720), (1, 1562007750), (2, 1564682430), (2, 1564682450), (2, 1564682480)]
cols = ['id', 'timestamp']

df = spark.createDataFrame(data, cols)

window = w.partitionBy('id')

df = df.withColumn('min', f.min('timestamp').over(window))\
       .withColumn('max', f.max('timestamp').over(window))

df = df.withColumn('avg', f.col('max') - f.col('min'))
df.select('id', 'avg').distinct().show()

[Stage 0:>                                                          (0 + 1) / 1]

+---+---+
| id|avg|
+---+---+
|  1| 71|
|  2| 50|
+---+---+



                                                                                

Задание 4.8.2

In [17]:
import datetime as dt
import pandas as pd

start_date = dt.datetime(2023, 8, 1)
end_date = dt.datetime(2023, 8, 31)

dates = pd.date_range(min(start_date, end_date), max(start_date, end_date))\
          .strftime('%Y-%m-%d').tolist()
week_num = [dt.datetime.strptime(i, '%Y-%m-%d').date().isocalendar()[1] for i in dates]
week = [i - min(week_num) + 1 for i in week_num]
data = tuple(zip(dates, week))
df_day = spark.createDataFrame(data = data, schema = ['day', 'week'])

week_data = (('1', '01.08—06.08'),
             ('2', '07.08—13.08'),
             ('3', '14.08—20.08'),
             ('4', '21.08—27.08'),
             ('5', '28.08—31.08'))
df_week = spark.createDataFrame(data = week_data, schema = ['week', 'week_name'])

demand_data = (('1', '01', 100),
               ('1', '02', 110),
               ('2', '01', 120),
               ('2', '02', 90),
               ('3', '01', 70),
               ('3', '02', 80))
df_demand = spark.createDataFrame(data = demand_data, schema = ['product', 'location', 'demand'])

stock_data = (('1', '01', 1000),
              ('1', '02', 400),
              ('2', '01', 300),
              ('2', '02', 250))
df_stock = spark.createDataFrame(data = stock_data, schema = ['product', 'location', 'stock'])

df = df_demand.join(df_stock, ['product', 'location'], 'left')\
              .select(df_demand['*'], f.coalesce(df_stock.stock, f.lit(0)).alias('stock'))\
              .crossJoin(df_day)

wind = w.partitionBy('product', 'location').orderBy('day')

df = df.withColumn('sum_demand', f.sum('demand').over(wind))
df = df.withColumn('diff', f.col('sum_demand') - f.col('stock'))
df = df.withColumn('lag', f.lag('diff', 1, 1).over(wind))
df = df.withColumn('sales', when(f.col('lag') > f.col('demand'), f.col('demand'))\
                         .when((f.col('lag') > 0) & (f.col('lag') <= f.col('demand')), f.col('lag'))
                         .otherwise(f.lit(0)))
df = df.withColumn('min_lag', f.min('lag').over(wind))
df = df.join(df_week, ['week'])
df = df.groupBy('product', 'location', 'week_name').agg(f.sum('sales').alias('sales'), f.avg('min_lag').alias('stock'))

df.show()






+-------+--------+-----------+-----+-------------------+
|product|location|  week_name|sales|              stock|
+-------+--------+-----------+-----+-------------------+
|      3|      02|28.08—31.08|  320|                1.0|
|      1|      01|01.08—06.08|    1| -749.8333333333334|
|      1|      02|21.08—27.08|  770|             -290.0|
|      3|      01|14.08—20.08|  490|                1.0|
|      1|      02|14.08—20.08|  770|             -290.0|
|      2|      01|28.08—31.08|  480|             -180.0|
|      3|      01|07.08—13.08|  490|                1.0|
|      3|      02|01.08—06.08|  401|                1.0|
|      1|      02|07.08—13.08|  770|             -290.0|
|      2|      02|28.08—31.08|  360|             -160.0|
|      2|      02|01.08—06.08|  201|-133.16666666666666|
|      2|      02|07.08—13.08|  630|             -160.0|
|      1|      01|21.08—27.08|  700|             -900.0|
|      3|      02|07.08—13.08|  560|                1.0|
|      2|      01|21.08—27.08| 

                                                                                