<h2>Importando bibliotecas</h2>

In [45]:
# findspark.init() is called before `import pyspark` on purpose: it is what makes
# the local Spark installation importable, so the order of these lines matters.
import findspark
findspark.init()
import pyspark


In [46]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [47]:
import pandas as pd
import pyspark.pandas as ps

In [48]:
import os
import sys

# Directory holding the project's `move_files.py` helper. It can be overridden
# with the MINIO_SCRIPTS_DIR environment variable so the notebook is not tied to
# a single machine; when the variable is unset the original path is used.
minio_scripts_dir = os.environ.get(
    'MINIO_SCRIPTS_DIR',
    '/home/acsantos/Documents/Facens_Architecture-for-Data-Processing/scripts/minio',
)

# Guarded so that re-running this cell does not add the same entry again.
if minio_scripts_dir not in sys.path:
    sys.path.append(minio_scripts_dir)

from move_files import fn_move_files

<h2> Criando sessão do spark + configurações de conexão com bucket </h2>

In [49]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Steam API - Tratamento')
    .getOrCreate()
)

In [50]:
def load_config(spark_context: SparkContext):
    """Apply the notebook's S3A settings to the context's Hadoop configuration.

    Args:
        spark_context: Spark context whose Hadoop configuration is modified
            in place.

    The settings select the environment-variable credentials provider, enable
    path-style access, register the S3A filesystem implementation, point the
    endpoint at ``http://localhost:9000`` and turn SSL off for the connection.
    """
    s3a_settings = {
        'fs.s3a.aws.credentials.provider': 'com.amazonaws.auth.EnvironmentVariableCredentialsProvider',
        'fs.s3a.path.style.access': 'true',
        'fs.s3a.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
        'fs.s3a.endpoint': 'http://localhost:9000',
        'fs.s3a.connection.ssl.enabled': 'false',
    }
    # Look the configuration object up once and reuse it for every key.
    hadoop_conf = spark_context._jsc.hadoopConfiguration()
    for setting_name, setting_value in s3a_settings.items():
        hadoop_conf.set(setting_name, setting_value)
    
# Configure S3A on the active session's context before any s3a:// path is read.
load_config(spark.sparkContext)


<h2> Lendo dados do bucket </h2>

In [60]:
# Load every object under the bronze `topics/steam/` prefix as JSON.
# multiLine=True allows one JSON record to span several lines in a file.
df = spark.read.json(
    's3a://bronze/topics/steam/*',
    multiLine=True,
)

                                                                                

In [61]:
# Inspect the schema Spark inferred for the JSON files (no schema was supplied).
df.printSchema()

root
 |-- appid: string (nullable = true)
 |-- comment_count: long (nullable = true)
 |-- language: string (nullable = true)
 |-- last_played: long (nullable = true)
 |-- num_games_owned: long (nullable = true)
 |-- num_reviews: long (nullable = true)
 |-- playtime_forever: long (nullable = true)
 |-- playtime_last_two_weeks: long (nullable = true)
 |-- received_for_free: boolean (nullable = true)
 |-- recommendationid: string (nullable = true)
 |-- review: string (nullable = true)
 |-- steam_purchase: boolean (nullable = true)
 |-- steamid: string (nullable = true)
 |-- timestamp_created: long (nullable = true)
 |-- timestamp_updated: long (nullable = true)
 |-- voted_up: boolean (nullable = true)
 |-- votes_funny: long (nullable = true)
 |-- votes_up: long (nullable = true)
 |-- weighted_vote_score: string (nullable = true)
 |-- written_during_early_access: boolean (nullable = true)



In [62]:
# Number of review records that were read from the bronze prefix.
df.count()

6

In [63]:
# Preview a single row; truncate=True shortens long string values in the output.
df.show(1, truncate=True)

+------+-------------+--------+-----------+---------------+-----------+----------------+-----------------------+-----------------+----------------+--------------------+--------------+-----------------+-----------------+-----------------+--------+-----------+--------+-------------------+---------------------------+
| appid|comment_count|language|last_played|num_games_owned|num_reviews|playtime_forever|playtime_last_two_weeks|received_for_free|recommendationid|              review|steam_purchase|          steamid|timestamp_created|timestamp_updated|voted_up|votes_funny|votes_up|weighted_vote_score|written_during_early_access|
+------+-------------+--------+-----------+---------------+-----------+----------------+-----------------------+-----------------+----------------+--------------------+--------------+-----------------+-----------------+-----------------+--------+-----------+--------+-------------------+---------------------------+
|284160|            0| english| 1650831100|         


Convertendo de timestamp UNIX para datetime (truncado para a data, com horário 00:00:00)


In [64]:


from pyspark.sql import functions as f
from pyspark.sql import types as t
from datetime import datetime



In [65]:


# Convert the Unix-epoch columns (long values such as 1650831100) into
# timestamps truncated to midnight of their calendar date:
#   long -> timestamp (cast) -> date (to_date) -> timestamp at 00:00:00.
# Each column is handled in one pass; the expressions always read from the raw
# `df`, so the three conversions are independent of each other.
df2 = df
for epoch_column in ('last_played', 'timestamp_created', 'timestamp_updated'):
    as_timestamp = df[epoch_column].cast(dataType=t.TimestampType())
    df2 = df2.withColumn(epoch_column, f.to_timestamp(f.to_date(as_timestamp)))


Removendo possíveis registros duplicados


In [66]:
# Called without a subset, so only rows that are equal on every column are dropped.
# NOTE(review): if the same review can arrive twice with different counters
# (e.g. votes_up), deduplicating on `recommendationid` may be intended — confirm.
df3 = df2.drop_duplicates()


Filtrando colunas de interesse


In [67]:
# Keep only the columns carried forward to the Silver layer, in this order.
df4 = df3.select(
    'appid',
    'recommendationid',
    'steamid',
    'language',
    'last_played',
    'num_games_owned',
    'playtime_forever',
    'review',
    'voted_up',
    'votes_up',
    'timestamp_created',
)

In [68]:


# Preview the selected columns after the timestamp conversion and deduplication.
df4.show()



+------+----------------+-----------------+--------+-------------------+---------------+----------------+--------------------+--------+--------+-------------------+
| appid|recommendationid|          steamid|language|        last_played|num_games_owned|playtime_forever|              review|voted_up|votes_up|  timestamp_created|
+------+----------------+-----------------+--------+-------------------+---------------+----------------+--------------------+--------+--------+-------------------+
|284160|       100004176|76561198359273181| english|2022-04-24 00:00:00|             85|           69138|A realistic vehic...|    true|       0|2021-09-26 00:00:00|
|285190|       100004807|76561197961181796| english|2021-10-06 00:00:00|            138|             470|for 3 sodding qui...|    true|       1|2021-09-26 00:00:00|
|284160|       100000473|76561198307100633| english|2022-04-18 00:00:00|             48|           29111|When the traffic ...|    true|       0|2021-09-26 00:00:00|
|284160|  

Renomeando coluna appid

In [70]:
# Rename `appid` to `app_id`; the Silver write below partitions on `app_id`.
df4 = df4.withColumnRenamed('appid', 'app_id')

Gravando na camada Silver

In [71]:
# Write the batch to the Silver layer as Parquet, partitioned by `app_id`.
# The mode is 'append' because the bronze files read above are handed to
# fn_move_files in the next step, so each run only holds the newest batch.
# With mode('overwrite') the whole dataset under this path would be replaced by
# that batch, discarding everything written by earlier runs.
# NOTE(review): append is not idempotent — re-running this cell for the same
# batch writes its rows again; confirm this matches the intended pipeline.
df4.write.partitionBy('app_id').mode('append').parquet('s3a://silver/steam_reviews/reviews.parquet')


Movendo arquivos que foram lidos para pasta de processados.

In [72]:
# Presumably relocates the objects under `topics/steam/` in the `bronze` bucket
# to `processed_files/steam/` so a later run does not read them again.
# fn_move_files is defined outside this notebook — confirm its behaviour.
fn_move_files(bucket='bronze', sourcePath='topics/steam/', destinationPath='processed_files/steam/')