In [1]:
!pip install findspark

Defaulting to user installation because normal site-packages is not writeable
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [1]:
import findspark
# Per findspark's documentation, init() makes the local Spark installation's
# pyspark package importable, so it is called before the pyspark imports in
# the following cell.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name
import os

In [None]:
# Initialise Spark with the settings needed for Yandex Object Storage (via s3a).
def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty. Without this check
            the missing value would be handed to Spark as None and only show
            up later as an opaque S3 authentication failure.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "it is required to authenticate against the object storage"
        )
    return value


spark = (
    SparkSession.builder
    .appName("Load_Parquet_Data")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Credentials are taken from the environment, never hard-coded.
    .config("spark.hadoop.fs.s3a.access.key", _require_env("access_key"))
    .config("spark.hadoop.fs.s3a.secret.key", _require_env("secret_key"))
    .config("spark.hadoop.fs.s3a.endpoint", "storage.yandexcloud.net")
    .config("spark.sql.parquet.enableVectorizedReader", "true")
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .getOrCreate()
)

In [4]:
# Location of the input data: a glob matching everything under the bucket's parquet/ prefix
input_parquet_path = "s3a://fraud-detection-data-otus-2025/parquet/*"

In [5]:
def load_parquet_data():
    """Load the transaction Parquet files and print a short summary.

    Reads everything matched by the module-level ``input_parquet_path`` using
    the module-level ``spark`` session and adds a ``source_file`` column with
    the name of the file each row was read from. It then prints the total row
    count, the schema and the number of rows per ``tx_fraud`` value.

    The total is derived from the per-``tx_fraud`` counts (every row belongs
    to exactly one group, NULL included), so the input is scanned once rather
    than once for ``count()`` and a second time for the group-by.

    Returns:
        The loaded DataFrame, including the extra ``source_file`` column.

    Raises:
        Exception: any error raised while reading or aggregating is printed
            and then re-raised unchanged.
    """
    try:
        # Read the data and record which source file every row came from
        df = spark.read.parquet(input_parquet_path) \
            .withColumn("source_file", input_file_name())

        # Aggregate once. The small grouped result is cached so that show()
        # below reuses it instead of triggering another scan of the input.
        fraud_counts = df.groupBy("tx_fraud").count().cache()
        try:
            total_rows = sum(row["count"] for row in fraud_counts.collect())

            # Sanity check of the loaded data
            print(f"Успешно загружено {total_rows} записей")
            print("Схема данных:")
            df.printSchema()

            # Example statistic: number of rows per tx_fraud value
            fraud_counts.show()
        finally:
            # Release the cached aggregate whether or not printing succeeded
            fraud_counts.unpersist()

        return df
    except Exception as e:
        print(f"Ошибка при загрузке данных: {e}")
        raise


In [6]:
if __name__ == "__main__":
    # Announce the start, then load the data set from object storage
    print("Начало загрузки данных из S3...")
    df = load_parquet_data()

    # Expose the DataFrame to Spark SQL under a temporary view name
    df.createOrReplaceTempView("fraud_transactions")

    # Example query: per-month transaction count and sum of tx_fraud
    monthly_fraud_sql = """
        SELECT date_format(tx_datetime, 'yyyy-MM') as month, 
               COUNT(*) as transactions,
               SUM(tx_fraud) as fraud_count
        FROM fraud_transactions
        GROUP BY month
        ORDER BY month
    """
    monthly_fraud = spark.sql(monthly_fraud_sql)
    monthly_fraud.show()

Начало загрузки данных из S3...
Успешно загружено 657923861 записей
Схема данных:
root
 |-- transaction_id: integer (nullable = true)
 |-- tx_datetime: timestamp (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- terminal_id: integer (nullable = true)
 |-- tx_amount: double (nullable = true)
 |-- tx_time_seconds: integer (nullable = true)
 |-- tx_time_days: integer (nullable = true)
 |-- tx_fraud: integer (nullable = true)
 |-- tx_fraud_scenario: integer (nullable = true)
 |-- source_file: string (nullable = false)

+--------+---------+
|tx_fraud|    count|
+--------+---------+
|    null|       14|
|       1| 35039600|
|       0|622884247|
+--------+---------+

+-------+------------+-----------+
|  month|transactions|fraud_count|
+-------+------------+-----------+
|   null|        1365|         71|
|2019-11|    17228248|     935127|
|2019-12|    48564263|    2457912|
|2020-01|    48559625|    2488159|
|2020-02|    45418444|    2317729|
|2020-03|    48559659|    2630313|

In [7]:
    
    # Stop the Spark session created earlier in the notebook.
    # NOTE(review): this cell's leading indentation looks accidental; IPython
    # appears to tolerate it, but a plain .py export would likely fail with an
    # IndentationError. Confirm and dedent.
    spark.stop()
