In [1]:
import findspark

# findspark.init() locates the Spark installation on its own and adds pyspark
# to sys.path, so it must run before the pyspark imports that follow.
# A separate findspark.find() call is unnecessary: its return value was unused.
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, DoubleType

# Reuse the active Spark session, or start a new one if none exists.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Explicit schema for the transaction files; every column is declared nullable.
schema = (
    StructType()
    .add("transaction_id", StringType(), True)
    .add("tx_datetime", TimestampType(), True)
    .add("customer_id", StringType(), True)
    .add("terminal_id", StringType(), True)
    .add("tx_amount", DoubleType(), True)
    .add("tx_time_seconds", IntegerType(), True)
    .add("tx_time_days", IntegerType(), True)
    .add("tx_fraud", IntegerType(), True)
    .add("tx_fraud_scenario", StringType(), True)
)

# Read the comma-separated transaction files using the explicit schema.
# The raw text (as seen in 2019-08-22.txt) begins with a "# ... | ..." banner
# line, which is not a real CSV header. Skip it as a comment line instead of
# consuming the first line of every file as a header, so a file without the
# banner does not silently lose its first data row.
df = (
    spark.read.format("csv")
    .option("header", "false")
    .option("comment", "#")
    .option("delimiter", ",")
    .schema(schema)
    .load("s3a://otus-mlops-bucket-bvo/fraud-data/")
)

# Print the DataFrame schema, then preview its first five rows.
df.printSchema()
df.show(n=5)

root
 |-- transaction_id: string (nullable = true)
 |-- tx_datetime: timestamp (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- terminal_id: string (nullable = true)
 |-- tx_amount: double (nullable = true)
 |-- tx_time_seconds: integer (nullable = true)
 |-- tx_time_days: integer (nullable = true)
 |-- tx_fraud: integer (nullable = true)
 |-- tx_fraud_scenario: string (nullable = true)

+--------------+-------------------+-----------+-----------+---------+---------------+------------+--------+-----------------+
|transaction_id|        tx_datetime|customer_id|terminal_id|tx_amount|tx_time_seconds|tx_time_days|tx_fraud|tx_fraud_scenario|
+--------------+-------------------+-----------+-----------+---------+---------------+------------+--------+-----------------+
|             0|2019-08-22 06:51:03|          0|        711|    70.91|          24663|           0|       0|                0|
|             1|2019-08-22 05:10:37|          0|          0|    90.55|          1863

In [11]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession

In [5]:
# Spark copies only properties prefixed with "spark.hadoop." into the Hadoop
# configuration used by the S3A connector; without the prefix the credentials
# provider setting is never applied.
conf = SparkConf().setAppName("Month Stat - Python")
conf.set(
    "spark.hadoop.fs.s3a.aws.credentials.provider",
    "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider",
)
# getOrCreate avoids "Cannot run multiple SparkContexts at once" when a context
# is already running in this kernel. In that case the existing context is
# returned and `conf` is not applied to it.
sc = SparkContext.getOrCreate(conf=conf)

In [6]:
# Wrap the SparkContext in a SQLContext to get access to the DataFrame reader.
sql = SQLContext(sparkContext=sc)

In [7]:
# Load the Parquet data set from object storage into a DataFrame.
df = sql.read.format("parquet").load("s3a://yc-mdb-examples/dataproc/example01/set01")

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

In [12]:
# Return the running session, or build one named "mytestapp".
spark = (
    SparkSession.builder
    .appName("mytestapp")
    .getOrCreate()
)

In [16]:
# Read a single raw file as plain text: one row per line in a "value" column.
data = spark.read.format("text").load("s3a://otus-mlops-bucket-bvo/fraud-data/2019-08-22.txt")

root
 |-- transaction_id: string (nullable = true)
 |-- tx_datetime: timestamp (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- terminal_id: string (nullable = true)
 |-- tx_amount: double (nullable = true)
 |-- tx_time_seconds: integer (nullable = true)
 |-- tx_time_days: integer (nullable = true)
 |-- tx_fraud: integer (nullable = true)
 |-- tx_fraud_scenario: string (nullable = true)

+--------------+-------------------+-----------+-----------+---------+---------------+------------+--------+-----------------+
|transaction_id|        tx_datetime|customer_id|terminal_id|tx_amount|tx_time_seconds|tx_time_days|tx_fraud|tx_fraud_scenario|
+--------------+-------------------+-----------+-----------+---------+---------------+------------+--------+-----------------+
|             0|2019-08-22 06:51:03|          0|        711|    70.91|          24663|           0|       0|                0|
|             1|2019-08-22 05:10:37|          0|          0|    90.55|          1863

In [43]:
# Total number of rows in `df`.
df.count()

46988418

In [44]:
# Ratio of the row count printed above to 1,879,794,138.
# NOTE(review): the denominator is presumably the size of the full data set - confirm.
46_988_418 / 1_879_794_138

0.02499657651342245

In [26]:
# Take the first five text rows and convert them to a pandas DataFrame.
df = (
    data
    .limit(5)
    .toPandas()
)

In [29]:
# Raw lines as a Python list; the first entry is the file's "#" banner line.
df['value'].tolist()

['# tranaction_id | tx_datetime | customer_id | terminal_id | tx_amount | tx_time_seconds | tx_time_days | tx_fraud | tx_fraud_scenario',
 '0,2019-08-22 06:51:03,0,711,70.91,24663,0,0,0',
 '1,2019-08-22 05:10:37,0,0,90.55,18637,0,0,0',
 '2,2019-08-22 19:05:33,0,753,35.38,68733,0,0,0',
 '3,2019-08-22 07:21:33,0,0,80.41,26493,0,0,0']