Objetivo: Ler as reviews de todos os Games mapeados na camada Silver (arquivos Parquet) e registrar, para cada Game, o nome do Game + Max(ReviewID)

<h2>Importando bibliotecas</h2>

In [1]:
import findspark
findspark.init()
import pyspark


In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [3]:
import pandas as pd
import pyspark.pandas as ps



In [4]:
from pymongo import MongoClient, collection
import urllib.parse


In [5]:
import sys
sys.path.append('/home/acsantos/Documents/Facens_Architecture-for-Data-Processing/scripts/extract_reviews_from_steam_api')
from credentials import credentials
from update_last_review import update_last_review

In [6]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, FloatType

from pyspark.sql.functions import lit

from pyspark.sql import functions as f
from pyspark.sql import types as t
from datetime import datetime


## Função para listar todos os jogos (appid) mapeados na camada Silver

In [7]:
# Replace the imported `credentials` factory with the mapping it returns; getGames() reads
# credentials['username'] and credentials['password'] from this module-level name.
# The callable() guard keeps the cell safe to re-run: once the name already holds the
# returned mapping, calling it again would raise "object is not callable".
if callable(credentials):
    credentials = credentials()

In [8]:

def getGames():
    """List every game registered in the ``steam.games`` MongoDB collection.

    Connects to the MongoDB instance on ``localhost:27017`` using the username and
    password held in the module-level ``credentials`` mapping (both URL-escaped
    before being placed in the connection string).

    Returns:
        list[dict]: one ``{'appid': ..., 'silverPath': ...}`` dict per document,
        in the order the cursor yields them.

    Raises:
        KeyError: if a document lacks ``appid`` or ``silverPath``.
    """
    username = urllib.parse.quote_plus(credentials['username'])
    password = urllib.parse.quote_plus(credentials['password'])

    CONNECTION_STRING = f'mongodb://{username}:{password}@localhost:27017/steam'

    # Only these two fields are used below, so ask the server for nothing else
    # instead of transferring every full game document.
    projection = {'_id': False, 'appid': True, 'silverPath': True}

    # The client is closed when the `with` block exits; the list is fully built
    # before that happens because the comprehension consumes the cursor eagerly.
    with MongoClient(CONNECTION_STRING) as client:
        return [
            {'appid': game['appid'], 'silverPath': game['silverPath']}
            for game in client.steam.games.find({}, projection)
        ]

            

In [9]:
# Fetch the appid / silverPath pairs once; the processing loop further down iterates over this list.
appidList = getGames()

In [10]:
# Show the first two entries to check the shape of each record.
appidList[0:2]

[{'appid': '0',
  'silverPath': 'steam_reviews/reviews.parquet/app_id=0/part-00000-f2af1117-0425-4810-b828-16de4ccbf14b.c000.snappy.parquet'},
 {'appid': '1',
  'silverPath': 'steam_reviews/reviews.parquet/app_id=1/part-00000-f2af1117-0425-4810-b828-16de4ccbf14b.c000.snappy.parquet'}]

In [11]:
%time
print(len(appidList))

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.58 µs
65580


<h2> Criando sessão do spark + configurações de conexão com bucket </h2>

In [12]:
# Reuse the running Spark session if there is one, otherwise start a new one under this app name.
spark = (
    SparkSession.builder
    .appName('ETL - Send to Gold')
    .getOrCreate()
)

22/04/04 22:19:32 WARN Utils: Your hostname, moon resolves to a loopback address: 127.0.1.1; using 192.168.0.185 instead (on interface wlo1)
22/04/04 22:19:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/04 22:19:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [13]:
def load_config(spark_context: SparkContext):
    """Configure the Hadoop S3A connector on ``spark_context``.

    Sets five ``fs.s3a.*`` properties on the context's Hadoop configuration:
    the environment-variable credentials provider, path-style access, the S3A
    filesystem implementation, the ``http://localhost:9000`` endpoint, and SSL
    disabled.

    Args:
        spark_context: context whose ``_jsc.hadoopConfiguration()`` is updated.
    """
    s3a_settings = (
        ('fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.EnvironmentVariableCredentialsProvider'),
        ('fs.s3a.path.style.access', 'true'),
        ('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem'),
        ('fs.s3a.endpoint', 'http://localhost:9000'),
        ('fs.s3a.connection.ssl.enabled', 'false'),
    )
    for property_name, property_value in s3a_settings:
        spark_context._jsc.hadoopConfiguration().set(property_name, property_value)
    
# Apply the S3A settings to the SparkContext behind the session created above.
load_config(spark.sparkContext)


<h2> Lendo dados do bucket </h2>

In [14]:

# Review columns as (name, Spark type) pairs, in declaration order; every field is
# declared non-nullable.
# NOTE(review): `language` and `weighted_vote_score` are declared as integers and
# `author.steamid` as a 32-bit integer — confirm these match the stored data.
reviews_schema = StructType([
    StructField(column_name, spark_type(), False)
    for column_name, spark_type in (
        ("_c0", IntegerType),
        ("app_id", IntegerType),
        ("app_name", StringType),
        ("review_id", IntegerType),
        ("language", IntegerType),
        ("timestamp_created", IntegerType),
        ("timestamp_updated", IntegerType),
        ("recommended", StringType),
        ("votes_helpful", IntegerType),
        ("votes_funny", IntegerType),
        ("weighted_vote_score", IntegerType),
        ("comment_count", IntegerType),
        ("steam_purchase", StringType),
        ("received_for_free", StringType),
        ("written_during_early_access", StringType),
        ("author.steamid", IntegerType),
        ("author.num_games_owned", IntegerType),
        ("author.num_reviews", IntegerType),
        ("author.playtime_forever", IntegerType),
        ("author.playtime_last_two_weeks", IntegerType),
        ("author.playtime_at_review", IntegerType),
        ("author.last_played", IntegerType),
    )
])

In [15]:
import time

In [18]:
%time
    
for app in appidList:
    
    df = spark.read.parquet('s3a://silver/' + app['silverPath'], multiLine=True, header=True, schema=reviews_schema) 
    
    df = df.withColumn("appid", lit(app['appid']))
    
    df3 = df2.drop_duplicates()
    
    df4 = df3.select('appid', 'app_name', 'review_id')
    
    try:
        maxReviewID = df4.groupby().max('review_id').first().asDict()['max(review_id)']
    except:
        maxReviewID = 0
    
    try:
        game_name = df4.select('app_name').distinct().toPandas() 
    except:
        game_name = 'Não identificado'
    
    
    if len(game_name['app_name'][0]) <= 50:
        game_name = game_name['app_name'][0]
        
    else:
        game_name = 'Não identificado'
    
    update_last_review(game_id=app['appid'], game_name=game_name, last_review_retrieved=maxReviewID)  
    

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs
