<a href="https://colab.research.google.com/github/bhavya6701/COMP333-Data-Analytics/blob/main/COMP333_Project_Data_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# COMP 333 - Project - Video Games Analysis
Team members:
- Bhavya Manjibhai Ruparelia (40164863)
- Devansh Nileshkumar Vaidya (40165987)
- Yao Hua Shan (40130771)
- Yash Patel (40175454)

In [None]:
# standard library imports
import time
import urllib
import urllib.parse
from ssl import SSLError

# third-party imports
import pandas as pd
import requests

In [None]:
def get_request(url, parameters=None, max_retries=10):
    """
    Send a GET request to the specified URL and return the decoded JSON body.

    SSL failures and non-OK responses are retried in a loop (not by
    recursion), so a long outage cannot exhaust the interpreter's recursion
    limit. After ``max_retries`` retries the underlying error is raised
    instead of waiting forever.

    :param url: URL to send the request to
    :param parameters: Parameters to send with the request
    :param max_retries: Number of retries allowed after the first attempt
        (negative values are treated as 0)
    :return: The response body parsed as JSON
    :raises requests.exceptions.SSLError: if the final attempt fails with an SSL error
    :raises requests.exceptions.HTTPError: if the final attempt returns an error status
    """
    total_attempts = max(int(max_retries), 0) + 1

    for attempt in range(total_attempts):
        is_last_attempt = attempt == total_attempts - 1

        try:
            response = requests.get(url=url, params=parameters)
        except (SSLError, requests.exceptions.SSLError) as s:
            # requests wraps TLS failures in requests.exceptions.SSLError, which
            # is not a subclass of ssl.SSLError, so both must be listed here.
            print('SSL Error:', s)
            if is_last_attempt:
                raise

            for i in range(5, 0, -1):
                print('\rWaiting... ({})'.format(i), end='')
                time.sleep(1)
            print('\rRetrying.' + ' ' * 10)
            continue

        # Response truthiness is the same as `.ok` (status code below 400)
        if response.ok:
            return response.json()

        if is_last_attempt:
            # Surface the real HTTP error rather than retrying indefinitely
            response.raise_for_status()

        # A non-OK response usually means too many requests. Wait and try again
        print('No response, waiting 10 seconds...')
        time.sleep(10)
        print('Retrying.')

In [None]:
# Load the Steam games catalogue (Kaggle export) from the working directory
games_df = pd.read_csv('./games.csv')
print(f'Total games: {len(games_df)}')

# Keep only the rows whose release-date string contains "2023"
games_df = games_df.loc[games_df['date_release'].str.contains('2023')]
print(f'Games released in 2023: {len(games_df)}')

In [None]:
# Request reviews for each game
#
# For every row of games_df, page through Steam's appreviews endpoint using the
# cursor returned by each response. A game is skipped outright when its first
# response reports fewer than MIN_REVIEWS total reviews, and it is listed in
# `excluded` (and contributes nothing to `reviews`) when fewer than MIN_REVIEWS
# rows were actually fetched.
#
# NOTE(review): the frames are built only from data['reviews']; no app_id or
# title column is added here, so confirm whether downstream analysis needs one
# to join reviews back to games.
num_per_page = 100
MIN_REVIEWS = 200  # a game must have at least this many reviews to be kept
MAX_PAGES = 100    # hard cap on pages requested per game
review_frames = []  # per-page frames of kept games; concatenated once at the end
excluded = []
index = 1
for _, game in games_df.iterrows():
    app_id = game['app_id']  # not named `id`, which would shadow the builtin
    title = game['title']
    pages = 0  # number of pages actually received for this game
    num_reviews = 0
    cursor = '*'
    game_frames = []

    # Traverse through all the pages of reviews
    while cursor is not None and pages < MAX_PAGES:
        url = f'https://store.steampowered.com/appreviews/{app_id}?json=1&filter=recent&cursor={cursor}&num_per_page={num_per_page}'
        data = get_request(url)

        # The games should have minimum of 200 reviews (checked on the first page only)
        if pages == 0 and data['query_summary']['total_reviews'] < MIN_REVIEWS:
            break

        # If cursor is not changing, it means we have reached the end of the reviews
        prev_cursor = cursor
        cursor = None if data['cursor'] is None else urllib.parse.quote_plus(data['cursor'])
        if cursor == prev_cursor:
            break

        # Buffer this page; it is only kept if the game passes the threshold
        new_reviews = pd.DataFrame(data['reviews'])
        if not new_reviews.empty:
            game_frames.append(new_reviews)
        num_reviews += new_reviews.shape[0]
        pages += 1

    if num_reviews < MIN_REVIEWS:
        excluded.append(title)
    else:
        review_frames.extend(game_frames)
        print(f'{index}. Pages Received: {pages} | Total reviews extracted for {title}: {num_reviews}')
        index += 1

# Single concat instead of growing the frame inside the loop (which is quadratic)
reviews = pd.concat(review_frames, ignore_index=True) if review_frames else pd.DataFrame()

print(f'Excluded {excluded} games (Not enough reviews < 200)')

In [None]:
# Report how many of the 2023 releases ended up in the `excluded` list
num_excluded = len(excluded)
print(f'Number of games excluded: {num_excluded}')

In [None]:
# Write the collected reviews to a CSV in the working directory, without the row index
output_path = './games_2023_reviews_.csv'
reviews.to_csv(output_path, index=False)