Web-scrape the top-selling games from the Steam store search page, then use each game's app ID from that page to look up its details through the Steam API. So far this only collects data on the top 20 games for one genre (adventure); the plan is to add a list of genres and to use another Steam API for the game tags.

In [None]:
from bs4 import BeautifulSoup
import json
import requests 
import pandas as pd
from sqlalchemy import create_engine
import re

## Web Scrape

In [None]:
# Steam search filter to apply to the results (the top-sellers list)
search_filter = "topsellers"
# Search term sent to the store search; currently a single genre
category = "adventure"

In [None]:
# Base URL of the Steam store search page; query parameters are passed separately
url = "https://store.steampowered.com/search/"

In [None]:
# Query-string parameters for the search request:
#   term          - the search term (the category defined above)
#   supportedlang - restrict results to English
#   filter        - which result list to use (the search filter defined above)
#   ndl           - "1"; per the original note, this is what the URL carries
#                   when the page is opened through Google
params = {
    "term": category,
    "supportedlang": "english",
    "filter": search_filter,
    "ndl": "1",
}

In [None]:
# Fetch the search results page. A timeout is set because requests waits
# indefinitely by default, which would hang the notebook if Steam never answers.
response = requests.get(url, params=params, timeout=30)

In [None]:
# Echo the response object so the notebook displays it (quick check of the request)
response

In [None]:
# Parse the returned HTML with Python's built-in "html.parser" backend
soup = BeautifulSoup(response.text, "html.parser")

In [None]:
# Print the parsed document in indented form to inspect the page structure
print(soup.prettify())

In [None]:
# Find every search-result entry: the <a> tags marked with data-gpnav="item".
# find_all is the current Beautiful Soup name; findAll is a deprecated alias.
games = soup.find_all("a", attrs={"data-gpnav": "item"})

In [None]:
# Echo the matched result tags so the notebook displays them
games

In [None]:
# Scraped fields for each search result. The three lists stay index-aligned
# (one entry per game) so the dict can be turned directly into a DataFrame.
games_details = {
    "title": [],
    "price": [],
    "app_id": [],
}

# Keep at most the first 20 results. Slicing (instead of indexing 0..19)
# avoids an IndexError when the page returns fewer than 20 entries.
for game in games[:20]:
    app_id = game.attrs.get("data-ds-appid")
    title_tag = game.find("span", attrs={"class": "title"})
    price_tag = game.find("div", attrs={"class": "col search_price responsive_secondrow"})

    # Missing elements become None so every list keeps the same length
    title = title_tag.text if title_tag is not None else None
    price = price_tag.text.strip() if price_tag is not None else None

    games_details["title"].append(title)
    games_details["price"].append(price)
    games_details["app_id"].append(app_id)

    # Print what was found; the title is printed from the extracted value so a
    # result without a title element no longer raises AttributeError here
    print("Title:", title)
    if price is not None:
        print("Price:", price)
    if app_id is not None:
        print("App ID:", app_id)
    # Delimit each game
    print("-" * 40)



## API Request

In [None]:
# Review summary for each scraped game. Every list receives exactly one entry
# per app_id (None when the data could not be retrieved) so the lists stay the
# same length and the dict can be converted to a DataFrame.
games_reviews = {
    "app_id": [],
    "review_score": [],
    "review_score_desc": [],
    "total_positive": [],
    "total_negative": [],
}

# Fields copied from the "query_summary" object of the appreviews response
summary_fields = ("review_score", "review_score_desc", "total_positive", "total_negative")

# Loop through the app IDs collected in the games_details dictionary
for app_id in games_details["app_id"]:
    # Stays empty unless the lookup succeeds; empty means all fields are None
    summary = {}

    if app_id is None:
        # Search results without an app id cannot be looked up
        print("Skipping review lookup for a search result without an app_id")
    else:
        url = f"https://store.steampowered.com/appreviews/{app_id}?json=1"
        try:
            # Timeout so one unresponsive request cannot hang the whole loop
            response = requests.get(url, timeout=30)
        except requests.RequestException as error:
            print(f"Failed to retrieve data for app_id {app_id}: {error}")
        else:
            if response.status_code == 200:
                # Parse the response as a JSON dictionary
                data = response.json()
                # Check if the query was successful
                if data.get("success") == 1:
                    summary = data.get("query_summary", {})
                else:
                    print(f"Review query for app_id {app_id} was not successful")
            else:
                # Print an error message if the request was not successful
                print(f"Failed to retrieve data for app_id {app_id} with status code {response.status_code}")

    # Always append one value per column, including on failure
    games_reviews["app_id"].append(app_id)
    for field in summary_fields:
        games_reviews[field].append(summary.get(field))

In [None]:
# Print the collected review data for a quick visual check
print(games_reviews)

## Connecting to AWS Database

In [None]:
# SQLAlchemy engine for the "sql_project" MySQL database (mysqldb driver).
# NOTE(review): USER:PASSWORD look like placeholders - confirm, and prefer
# loading real credentials from the environment rather than hard-coding them.
engine = create_engine("mysql+mysqldb://USER:PASSWORD@isba-dev-01.cmv8g4d5f073.us-east-1.rds.amazonaws.com/sql_project?charset=utf8")

In [None]:
# Convert games_details to a DataFrame (one column per key, one row per game)
df1 = pd.DataFrame(games_details)

In [None]:
# Write the details to the "adventure_games_details" table, replacing the table
# if it already exists; the DataFrame index is not written as a column
df1.to_sql("adventure_games_details", engine, if_exists='replace', index=False)

In [None]:
# Convert games_reviews to a DataFrame (one column per key, one row per app_id)
df2 = pd.DataFrame(games_reviews)

In [None]:
# Write the reviews to the "adventure_games_reviews" table, replacing the table
# if it already exists; the DataFrame index is not written as a column
df2.to_sql("adventure_games_reviews", engine, if_exists='replace', index=False)