# Abstracted Ingestion Exploration

This is a testing/exploration notebook, unused in production. Feel free to modify at will.

In [0]:
# Notebook parameters, exposed as Databricks widgets.
dbutils.widgets.text("content_id", "com.nianticlabs.pokemongo", "Content ID")
dbutils.widgets.text("game_name", "Pokemon Go", "Game Name")

# Dropdown arguments, in order: name, default value, choices, label.
dbutils.widgets.dropdown(
  "content_type",
  "Google Play Review",
  ["Steam Review", "Google Play Review", "Reddit Comment"],
  "Content Type",
)
dbutils.widgets.dropdown("update_type", "NEW_GAME", ["NEW_GAME", "REFRESH"], "Update Type")

In [0]:
# Read the current widget values into module-level constants (same order as before).
CONTENT_ID, CONTENT_TYPE, GAME_NAME, UPDATE_TYPE = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("content_id", "content_type", "game_name", "update_type")
)

In [0]:
%pip install google-play-scraper praw pyyaml

In [0]:
%restart_python

In [0]:
from ingestion_utils import DataIngestor, SteamIngestor, GooglePlayIngestor, RedditIngestor

## Steam Ingestor

In [0]:
# Fetch the same Steam app through the private review helper and through the
# public ingest() path, then print the three counts for comparison.
steam_app_id = "1245620"  # other app ids tried: 340190, 1206340
num_reviews = 1253

steam_ingestor = SteamIngestor(spark)
reviews = steam_ingestor._get_n_reviews(steam_app_id, num_reviews=num_reviews)
reviews_df = steam_ingestor.ingest(
    app_id=steam_app_id,
    game_name="Elden Ring",
    num_reviews=num_reviews,
)

print(len(reviews))
unique_steam_review_ids = steam_ingestor._get_unique_review_ids(reviews)
print(len(unique_steam_review_ids))
print(reviews_df.count())

In [0]:
# Spot-check the author id on the first fetched review.
first_review = reviews[0]
print(first_review["author"]["steamid"])

In [0]:
# Render the ingested reviews DataFrame in the notebook output.
reviews_df.display()

In [0]:
# Count the distinct content_id values in the ingested reviews.
distinct_content_ids = reviews_df.select("content_id").distinct()
print(distinct_content_ids.count())

In [0]:
from delta.tables import DeltaTable

# Test appending to a table with de-duplication using an insert-only MERGE:
# rows whose content_id already exists in the target are skipped, and every
# other row is inserted.
#
# An insert-only MERGE only compares source rows against the target; it does
# not collapse duplicates *within* the incoming DataFrame. The source is
# therefore de-duplicated on the merge key before merging.
if reviews_df is not None:  # explicit None check instead of relying on object truthiness
    uc_table_path = "users.thomas_xu.test_dedup"
    delta_table = DeltaTable.forName(spark, uc_table_path)

    new_reviews_df = reviews_df.dropDuplicates(["content_id"])

    (
        delta_table.alias("existing")
        .merge(
            source=new_reviews_df.alias("new"),
            condition="existing.content_id = new.content_id",
        )
        .whenNotMatchedInsertAll()
        .execute()
    )

    # Read the table back to inspect the result of the merge.
    t = spark.read.table(uc_table_path)
    t.display()

In [0]:
# Row count of the table read back after the MERGE.
merged_row_count = t.count()
print(merged_row_count)

In [0]:
import requests
# Fetch the public (unauthenticated) Steam app list and print it.
# A timeout is set so a stalled connection cannot hang the cell indefinitely;
# requests applies no timeout by default.
response = requests.get(
    url="https://api.steampowered.com/ISteamApps/GetAppList/v2/",
    timeout=30,
)
if response.status_code == 200:
    print(response.json()["applist"]["apps"])
else:
    print(f"Request failed with status code {response.status_code}: {response.text}")



In [0]:
import requests
# Query the key-authenticated IStoreService app list, starting after `last_app_id`.
#
# SECURITY: a Steam Web API key used to be hardcoded in this cell. Any key that
# was committed to source control should be treated as leaked and revoked.
# The key is now read from the STEAM_API_KEY environment variable.
import os

tmp_key = os.environ.get("STEAM_API_KEY")
if not tmp_key:
    raise RuntimeError("STEAM_API_KEY is not set; provide the Steam Web API key via the environment.")

max_results = 500_000
last_app_id = 1534610

# Pass query parameters via `params` so they are URL-encoded by requests and
# the key is not spliced into a hand-built URL string.
response = requests.get(
    url="https://api.steampowered.com/IStoreService/GetAppList/v1/",
    params={"key": tmp_key, "max_results": max_results, "last_appid": last_app_id},
    timeout=30,
)
if response.status_code == 200:
    print(response.json()["response"]["apps"]) # note different first key "response" instead of "applist"
else:
    print(f"Request failed with status code {response.status_code}: {response.text}")

In [0]:
# Case-insensitive substring search over the app names returned by the last request.
app_list = response.json()["response"]["apps"]
print(len(app_list))

search_term = "nba 2k"
matching_names = (
    entry['name'] for entry in app_list if search_term in entry['name'].lower()
)
for matching_name in matching_names:
    print(matching_name)

## Google Play Ingestor
Repo: https://github.com/JoMingyu/google-play-scraper


In [0]:
# Google Play app id used by the cells below. Uncomment one of the
# alternatives to explore a different app.
# app_id = "com.minimolgames.chessarama"
# app_id = "com.hyperbeard.tsukiteahouse"
# app_id = "com.nianticlabs.pokemongo"
app_id = "com.supercell.brawlstars"

In [0]:
import google_play_scraper as GPS

def print_app_info(appId: str) -> None:
    """Look up one app with ``GPS.app`` and print a short summary of its listing.

    Prints the app id, title, developer, genre, release date, review count,
    score, first screenshot, and the header/icon/video image URLs, followed by
    a separator line. Optional fields that are absent are printed as ``None``;
    this includes the first screenshot when the listing has no screenshots
    (previously that case raised ``IndexError``).

    Args:
        appId: App id passed to ``GPS.app`` (e.g. "com.supercell.brawlstars").
    """
    result = GPS.app(
        appId,
        lang='en', # defaults to 'en'
        country='us' # defaults to 'us'
    )

    print(f"App ID: {result['appId']}")
    print(f"Title: {result['title']}")
    print(f"Developer: {result['developer']}")
    print(f"Genre: {result['genre']}")
    print(f"Release date: {result['released']}")
    print(f"Number of Reviews: {result['reviews']}")
    print(f"Score: {result['score']}")

    # Guard against a missing or empty screenshot list instead of indexing blindly.
    screenshots = result.get('screenshots') or []
    first_screenshot = screenshots[0] if screenshots else None
    print(f"First screenshot: {first_screenshot}")

    # Optional image fields: dict.get yields None when the key is absent,
    # which prints exactly as the old explicit "None" branches did.
    optional_image_fields = (
        ("Header image URL", "headerImage"),
        ("Icon URL", "icon"),
        ("Video image URL", "videoImage"),
    )
    for label, field in optional_image_fields:
        print(f"{label}: {result.get(field)}")

    print("-" * 50)

def print_results(results: list) -> None:
    """Print the app id and listing summary for every entry in ``results``.

    Entries with a falsy ``appId`` are reported as skipped (by title) and not
    looked up.

    Args:
        results: list of dicts with ``appId`` and ``title`` keys, e.g. the
            output of a GPS search.
    """
    for hit in results:
        hit_app_id = hit['appId']
        if hit_app_id:
            print(hit_app_id)
            print_app_info(hit_app_id)
        else:
            print(f"Skipping app: {hit['title']}")

# Print the listing summary for the currently selected app_id.
print_app_info(app_id)

In [0]:
from google_play_scraper import search, app
import google_play_scraper as GPS

# Search Google Play. lang/country default to 'en'/'us'; n_hits defaults to 30
# (= Google's maximum).
results = search("brawl", lang="en", country="us", n_hits=30)

# Other ways to inspect the results:
#   print(results[0])
#   print_results(results)
#   print(len(results))

# Dump every field of the first hit as "key: value".
top_hit = results[0]
for field_name in top_hit:
    print(f"{field_name}: {top_hit[field_name]}")
    # print(key)


In [0]:
# Fetch a sample of 5-star reviews and print each user name with its review text.
review_options = dict(
    lang='es',                # defaults to 'en'
    country='us',             # defaults to 'us'
    sort=GPS.Sort.NEWEST,     # defaults to Sort.NEWEST
    count=183,                # defaults to 100
    filter_score_with=5,      # defaults to None (means all scores)
)
reviews, continuation_token = GPS.reviews(app_id, **review_options)

for review in reviews:
  print("User: " + review["userName"])
  print("\tReview: " + review["content"])

first_review = reviews[0]
print(first_review)
print(len(reviews))

In [0]:
# Compare how many reviews the private helper returns with how many unique
# review ids they contain.
google_play_ingestor = GooglePlayIngestor(spark)

num_reviews = 643
reviews = google_play_ingestor._get_reviews(app_id, num_reviews=num_reviews)
unique_review_ids = google_play_ingestor._get_unique_review_ids(reviews)

for collection in (reviews, unique_review_ids):
    print(len(collection))


In [0]:
# `app_info` was referenced here without ever being defined in this notebook,
# so the cell failed with NameError on a fresh kernel. Fetch the listing
# explicitly so the title is available.
app_info = GPS.app(app_id, lang='en', country='us')

# Third positional argument is presumably the number of reviews to ingest — confirm against GooglePlayIngestor.ingest.
reviews_df = google_play_ingestor.ingest(app_id, app_info["title"], 135)
reviews_df.display()

## Reddit Ingestor

In [0]:
import yaml

# Read the secret scope name and the Reddit secret key names from the config file.
# yaml.safe_load is used rather than yaml.load for security.
with open('config/config.yaml', 'r') as config_file:
    config_data = yaml.safe_load(config_file)

secrets_config = config_data['secrets']
scope_name = secrets_config['scope_name']
reddit_secret_keys = secrets_config['keys']['reddit']

reddit_client_id_secret_key = reddit_secret_keys['client_id']
reddit_client_secret_secret_key = reddit_secret_keys['client_secret']
reddit_user_agent_secret_key = reddit_secret_keys['user_agent']

# These are the scope name and secret *key names*, not the secret values.
for config_name in (
    scope_name,
    reddit_client_id_secret_key,
    reddit_client_secret_secret_key,
    reddit_user_agent_secret_key,
):
    print(config_name)

In [0]:
# Handle secrets
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()

# One-time setup, kept for reference (fill in string_value before running):
# w.secrets.create_scope(scope=scope_name)
# w.secrets.put_secret(scope=scope_name, key=reddit_client_id_secret_key, string_value="")
# w.secrets.put_secret(scope=scope_name, key=reddit_client_secret_secret_key, string_value="")
# w.secrets.put_secret(scope=scope_name, key=reddit_user_agent_secret_key, string_value="")

secrets = w.secrets.list_secrets(scope=scope_name)
print(secrets)

# Alternative lookup through the SDK, kept for reference:
# client_id = w.secrets.get_secret(scope=scope_name, key=reddit_client_id_secret_key)
# client_secret = w.secrets.get_secret(scope=scope_name, key=reddit_client_secret_secret_key)
# user_agent = w.secrets.get_secret(scope=scope_name, key=reddit_user_agent_secret_key)
# print(base64.b64decode(client_id.value).decode('utf-8'))

# Resolve the three Reddit credentials through dbutils, in the original order.
client_id, client_secret, user_agent = (
    dbutils.secrets.get(scope=scope_name, key=secret_key)
    for secret_key in (
        reddit_client_id_secret_key,
        reddit_client_secret_secret_key,
        reddit_user_agent_secret_key,
    )
)

# Debug prints, kept for reference:
# print(client_id[:-1])
# print(client_secret[:-1])
# print(user_agent[:-1])

In [0]:
# Run the same subreddit request through the private content helper and
# through the public ingest() path.
reddit_ingestor = RedditIngestor(
    spark,
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
)

subreddit_request = {"subreddit_name": "GTA", "max_posts": 10}
post_rows, comment_rows = reddit_ingestor._get_content(**subreddit_request)
output_df = reddit_ingestor.ingest(**subreddit_request)

In [0]:
# Number of unique comment ids among the fetched comment rows.
unique_comment_ids = reddit_ingestor._get_unique_comment_ids(comment_rows)
print(f"unique_comments: {len(unique_comment_ids)}")

In [0]:
# Render the Reddit ingestion output DataFrame in the notebook output.
output_df.display()

In [0]:
# Count the distinct content_id values in the Reddit ingestion output.
distinct_reddit_content_ids = output_df.select("content_id").distinct()
print(distinct_reddit_content_ids.count())

### Subreddit Search

In [0]:
import praw
# Build a praw client from the credentials loaded from the secret scope.
reddit_credentials = {
    "client_id": client_id,
    "client_secret": client_secret,
    "user_agent": user_agent,
}
reddit_client = praw.Reddit(**reddit_credentials, check_for_async=False)

In [0]:
from datetime import datetime, timezone

def _print_subreddit_details(subreddit) -> None:
    """Print one search hit: name, id, stats, URL and its image-related fields."""
    print("-" * 100)
    print(subreddit.display_name)
    print("\tID:", subreddit.fullname)
    print("\tSubscribers:", subreddit.subscribers)
    print(f"\tCreated: {subreddit.created}, {datetime.fromtimestamp(subreddit.created_utc, timezone.utc).date()}")
    print("\tNSFW:", subreddit.over18)
    print("\tURL: ", subreddit.url)

    print(f"\tIcon: {subreddit.icon_img} (size: {subreddit.icon_size})")
    print("\tHeader Title:", subreddit.header_title)
    print(f"\tHeader Img: {subreddit.header_img} (size: {subreddit.header_size})")
    print(f"\tBanner Img: {subreddit.banner_img} (size: {subreddit.banner_size}, bg image: {subreddit.banner_background_image})")
    print(f"\tMobile Banner Img: {subreddit.mobile_banner_image}")
    print("\tPublic Traffic:", subreddit.public_traffic)
    # Also available: subreddit.description, subreddit.public_description, dir(subreddit)

    # Subreddit icon URLs
    print("\tImg:")
    print(f"\t\tCommunity Icon: {subreddit.community_icon}")
    print(f"\t\tIcon Img: {subreddit.icon_img}")


query = "games like street fighter"

# Returns up to 100 results. Subreddits are searched by both their title and
# description; example valid queries: "project m smash bros", "videogames",
# "pokemon stuff".
subreddits = reddit_client.subreddits.search(query, limit=100)

# Note: the search_by_name function does not perform as expected based on documentation.
# There does not appear to be a limit parameter, and searches like "dark" return only a
# few results and miss many other valid subreddits.
# subreddits = reddit_client.subreddits.search_by_name(query) # find subreddits that explicitly start with the search term (e.g. searching "dark" will find "darksouls")

# To skip hits without icons, filter on subreddit.community_icon / subreddit.icon_img here.
num_subreddits_found = 0  # stays 0 when the search yields nothing
for num_subreddits_found, subreddit in enumerate(subreddits, start=1):
    _print_subreddit_details(subreddit)

print("Num subreddits found: " + str(num_subreddits_found))

In [0]:
from datetime import datetime, timezone
# Convert a sample epoch timestamp (seconds) to a UTC calendar date — the same
# conversion applied to subreddit.created_utc in the search loop.
d = datetime.fromtimestamp(1438097278.0, timezone.utc).date()

## Other

In [0]:
%sql
-- List every distinct (game_name, content_type) pair in the gold feedback table.
SELECT DISTINCT game_name, content_type FROM bbyam_demo.player_feedback_v3.feedback_content_gold

In [0]:
import pandas

# Run the distinct (game_name, content_type) query through Spark and pull the
# result into pandas.
query = """
    SELECT DISTINCT game_name, content_type FROM bbyam_demo.player_feedback_v3.feedback_content_gold
    """

df = spark.sql(query)
pandas_df = df.toPandas()

# A pandas DataFrame has no .display() method (that is a Spark DataFrame
# convenience), so render it with the notebook's display() helper instead.
display(pandas_df)

In [0]:
# Convert the pandas frame to a list with one dict per row, then print it.
dict_list = pandas_df.to_dict(orient='records')
print(dict_list)

In [0]:
# col_dict = pandas_df.to_dict(
#     orient='list'
# )
# print(col_dict)