In [None]:
import datetime
import os
import time
from functools import reduce

import pyspark
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.types import DecimalType

# Maven coordinates of the connector jars handed to Spark through
# `spark.jars.packages` (a single comma-separated string).
packages = ','.join([
    'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1',
    'com.redislabs:spark-redis_2.12:3.1.0'
])

# Connection settings. Each value can be overridden through an environment
# variable so credentials do not have to live in the notebook; the defaults
# are the values this notebook originally hard-coded.
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://mongo:27017/PMD2023.Mensagens_test")
REDIS_HOST = os.environ.get("REDIS_HOST", "redis")
REDIS_PORT = os.environ.get("REDIS_PORT", "6379")
# NOTE(review): the fallback password is kept only for backward compatibility;
# prefer exporting REDIS_AUTH and removing the default.
REDIS_AUTH = os.environ.get("REDIS_AUTH", "123")

# One session for the whole notebook. The same Mongo URI is used for both
# reading and writing.
spark = (
    SparkSession.builder
    .appName("projeto-final-pmd-pedro-jean")
    .config("spark.mongodb.input.uri", MONGO_URI)
    .config("spark.mongodb.output.uri", MONGO_URI)
    .config("spark.redis.host", REDIS_HOST)
    .config("spark.redis.port", REDIS_PORT)
    .config("spark.redis.auth", REDIS_AUTH)
    .config('spark.jars.packages', packages)
    .getOrCreate()
)

In [None]:
# Date stamped on every row of this load, as an ISO "YYYY-MM-DD" string.
initDate = datetime.date(2022, 4, 12).isoformat()

# Input CSV. Point this at './work/Mongo/' instead to read the CSV files in
# that directory.
csvPath = './work/Redis/day24.csv'
df = spark.read.options(header='True', inferSchema='True', quote="\"", escape="\"").csv(csvPath)

# Pattern applied to `money`: group 1 is a run of non-digits (used as the
# currency), group 2 is the numeric amount with an optional '.' or ','.
moneyPattern = r'(\D*)(\d+(\.|,)?\d*)'

# Rows whose original `money` value is present and not the literal '0'.
hasMoney = df.money.isNotNull() & (df.money != '0')

treated = (
    df
    # `when` without `otherwise` leaves non-matching rows null.
    .withColumn('currency', f.when(hasMoney, f.regexp_replace('money', moneyPattern, '$1')))
    .withColumn('money', f.regexp_replace('money', moneyPattern, '$2'))
    .withColumn('donated', f.col('money') != '0')
    .withColumn('date', f.to_date(f.lit(initDate)))
    .withColumn('id', f.expr("uuid()"))
    # uuid() is non-deterministic: without caching, every action (show, each
    # write) would recompute the plan and produce different ids for the same
    # row, and re-read the CSV. Caching is best-effort; partitions evicted
    # from the cache would be recomputed with fresh ids.
    .cache()
)

treated.show()

In [None]:
# Store the treated rows in Redis under the "mensagens_test" table, keyed by
# the `id` column, replacing whatever was written there before.
(
    treated.write
    .format("org.apache.spark.sql.redis")
    .options(**{"table": "mensagens_test", "key.column": "id"})
    .mode("overwrite")
    .save()
)

# Store the same rows in MongoDB; the target collection comes from the
# session-level output URI. Existing data is replaced.
(
    treated.write
    .mode("overwrite")
    .format('com.mongodb.spark.sql.DefaultSource')
    .save()
)

In [None]:
# Read benchmark: time a full load + count() against each store.
# perf_counter() is used for the durations because it is a monotonic,
# high-resolution clock, unlike time.time(), which can jump if the system
# clock is adjusted.

# --- Redis ---
time01 = time.perf_counter()

redisDf = spark.read.format("org.apache.spark.sql.redis").option("table", "mensagens_test").option("key.column", "id").load()
redisCount = redisDf.count()

redisTime = time.perf_counter() - time01

# --- MongoDB ---
time02 = time.perf_counter()

mongoDf = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
mongoCount = mongoDf.count()

mongoTime = time.perf_counter() - time02

# Reuse the counts measured above rather than calling count() again, which
# would trigger another full scan of each store.
print(redisCount)
print(mongoCount)
print(f"Mongo execution time: {mongoTime}")
print(f"Redis execution time: {redisTime}")

