Objetivo: Mapear jogos de interesse e transportar para a camada Gold.

<h2>Importando bibliotecas</h2>

In [17]:
import findspark
findspark.init()
import pyspark


In [18]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [19]:
import pandas as pd
import pyspark.pandas as ps

In [20]:
from pymongo import MongoClient, collection
import urllib.parse


In [21]:
import sys
sys.path.append('/home/acsantos/Documents/Facens_Architecture-for-Data-Processing/scripts/extract_reviews_from_steam_api')
from credentials import credentials
from update_last_review import update_last_review

In [22]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, FloatType

from pyspark.sql.functions import lit

from pyspark.sql import functions as f
from pyspark.sql import types as t
from datetime import datetime


## Função para listar todos os jogos (appid) mapeados na camada Silver

In [23]:
# Load the MongoDB credentials that getGames() reads from module scope.
# NOTE(review): this rebinds the imported `credentials` function to its own return
# value, so re-running this cell in the same kernel would call that return value
# instead of the function — presumably raising; consider a distinct variable name.
credentials = credentials()

In [None]:

def getGames():
    """Return the games stored in the `games` collection of the `steam` database.

    Connects to MongoDB at localhost:27017 using the module-level `credentials`
    mapping (keys 'username' and 'password').

    Returns:
        list[dict]: one dict per document, holding the keys 'appid',
        'silverPath' and 'name', in the order the cursor yields them.

    Raises:
        KeyError: if a document lacks one of those fields (plain subscript
        access is used), or if `credentials` lacks 'username'/'password'.
    """
    # Percent-encode the credentials so special characters are safe inside the URI.
    username = urllib.parse.quote_plus(credentials['username'])
    password = urllib.parse.quote_plus(credentials['password'])
    mongo_uri = f'mongodb://{username}:{password}@localhost:27017/steam'

    wanted_fields = ('appid', 'silverPath', 'name')

    # The cursor is fully consumed inside the `with`, before the client is closed.
    with MongoClient(mongo_uri) as mongo:
        return [
            {field: document[field] for field in wanted_fields}
            for document in mongo.steam.games.find()
        ]

            

In [28]:
# Fetch the game records (appid, silverPath, name) from MongoDB.
appidList = getGames()

## Filtrando jogos: mantém apenas aqueles cujo nome não seja puramente numérico nem 'Não identificado'.

In [64]:
import re

# Keep only the games whose name contains at least one ASCII letter and is not
# the literal 'Não identificado'; names made only of digits/symbols are dropped.
games = [
    entry
    for entry in appidList
    if re.search("[a-zA-Z]", entry['name']) and entry['name'] != 'Não identificado'
]
           

<h2> Criando sessão do spark + configurações de conexão com bucket </h2>

In [71]:
# Create (or reuse) the Spark session for this ETL job.
spark = SparkSession.builder.appName('ETL - Send to Gold').getOrCreate()

In [72]:
def load_config(spark_context: SparkContext):
    """Set the Hadoop S3A options on the given SparkContext.

    Points the `s3a://` filesystem at the endpoint http://localhost:9000 with
    path-style access and SSL disabled, using the
    EnvironmentVariableCredentialsProvider as credentials provider.

    Args:
        spark_context: context whose Hadoop configuration is modified in place.

    Returns:
        None.
    """
    s3a_settings = {
        'fs.s3a.aws.credentials.provider': 'com.amazonaws.auth.EnvironmentVariableCredentialsProvider',
        'fs.s3a.path.style.access': 'true',
        'fs.s3a.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
        'fs.s3a.endpoint': 'http://localhost:9000',
        'fs.s3a.connection.ssl.enabled': 'false',
    }
    # dicts preserve insertion order, so the options are applied in the order listed.
    for option, value in s3a_settings.items():
        spark_context._jsc.hadoopConfiguration().set(option, value)
    
# Apply the S3A settings to the active session's SparkContext.
load_config(spark.sparkContext)


<h2> Lendo dados do bucket </h2>

In [73]:

# Column layout declared for the review files: (column name, Spark type class).
# Every field is declared non-nullable.
# NOTE(review): `language` and `weighted_vote_score` are declared as integers and
# `author.steamid` as a 32-bit integer — presumably the real data holds text,
# fractional scores and 64-bit ids respectively; verify against the Silver files.
# NOTE(review): DataFrameReader.parquet does not appear to honour a `schema=`
# keyword; to enforce this layout use `spark.read.schema(reviews_schema)` — confirm.
_REVIEW_FIELDS = [
    ("_c0", IntegerType),
    ("app_id", IntegerType),
    ("app_name", StringType),
    ("review_id", IntegerType),
    ("language", IntegerType),
    ("timestamp_created", IntegerType),
    ("timestamp_updated", IntegerType),
    ("recommended", StringType),
    ("votes_helpful", IntegerType),
    ("votes_funny", IntegerType),
    ("weighted_vote_score", IntegerType),
    ("comment_count", IntegerType),
    ("steam_purchase", StringType),
    ("received_for_free", StringType),
    ("written_during_early_access", StringType),
    ("author.steamid", IntegerType),
    ("author.num_games_owned", IntegerType),
    ("author.num_reviews", IntegerType),
    ("author.playtime_forever", IntegerType),
    ("author.playtime_last_two_weeks", IntegerType),
    ("author.playtime_at_review", IntegerType),
    ("author.last_played", IntegerType),
]

reviews_schema = StructType(
    [StructField(field_name, field_type(), False) for field_name, field_type in _REVIEW_FIELDS]
)

## Enviando dados para camada Gold

In [89]:
# Number of games that passed the name filter and are candidates for the Gold layer.
print(f'Jogos enviados para camada Gold: {len(games)}')

Jogos enviados para camada Gold: 320


In [None]:
# Columns published to the Gold layer, in output order. Names containing a dot
# must be back-quoted so Spark does not treat the dot as struct-field access.
GOLD_COLUMNS = [
    'appid', 'app_name', 'review_id', 'language', '`author.steamid`',
    '`author.last_played`', '`author.num_games_owned`', '`author.playtime_forever`',
    'recommended', 'votes_helpful', 'votes_funny', 'timestamp_created',
]
GOLD_REVIEWS_PATH = 's3a://gold/steam_reviews/reviews.parquet'


def _epoch_to_day(column_name):
    """Build a column expression that casts `column_name` to a timestamp and truncates it to midnight of its day.

    Assumes the column holds epoch seconds — TODO confirm against the Silver data.
    """
    as_timestamp = f.col(column_name).cast(t.TimestampType())
    return f.to_timestamp(f.to_date(as_timestamp))


for game in games:
    # Parquet files carry their own schema; CSV-style reader options such as
    # `header` / `multiLine` do not apply to them, so none are passed here.
    silver_reviews = spark.read.parquet('s3a://silver/' + game['silverPath'])

    gold_reviews = (
        silver_reviews
        .withColumn('appid', lit(game['appid']))
        # DataFrames are immutable: drop() returns a new frame, so it has to be part
        # of the chain. Removing `_c0` before de-duplicating matters, because otherwise
        # that column takes part in the row comparison.
        .drop('_c0')
        .withColumn('timestamp_created', _epoch_to_day('timestamp_created'))
        .withColumn('timestamp_updated', _epoch_to_day('timestamp_updated'))
        .drop_duplicates()
        .select(*GOLD_COLUMNS)
    )

    # NOTE(review): 'append' adds the rows again on every run of this cell, so
    # re-running the notebook duplicates the Gold data — confirm this is intended.
    gold_reviews.write.partitionBy('appid').mode('append').parquet(GOLD_REVIEWS_PATH)