In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import re
import tqdm
from tqdm import tqdm  


In [4]:
def extract_ids_names(file):
    """Read an app-list JSON dump and return its app ids and names.

    Two layouts are accepted under the top-level ``applist`` / ``apps`` keys:

    * a list of ``{'appid': ..., 'name': ...}`` records, or
    * a dict whose ``'app'`` key holds such a list.

    The layout is detected from the data itself rather than from the file
    name, so a dump may be stored under any name or path.

    Parameters
    ----------
    file : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    tuple[list, list]
        ``(ids, names)`` -- two parallel lists in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    KeyError
        If an expected key (``applist``, ``apps``, ``app``, ``appid`` or
        ``name``) is missing.
    """
    # Binary mode lets the json module detect UTF-8/16/32 on its own instead
    # of relying on the platform's default text encoding; the context manager
    # guarantees the handle is closed.
    with open(file, 'rb') as f:
        parsed_json = json.load(f)

    apps = parsed_json['applist']['apps']
    if isinstance(apps, dict):
        # Nested layout: the records sit one level deeper under 'app'.
        apps = apps['app']

    ids = [app['appid'] for app in apps]
    names = [app['name'] for app in apps]
    return ids, names


In [5]:

# Combine the app lists from both JSON dumps into a single table and write it
# to disk. The frame carries a 1-based index, which is left out of the CSV.
ids, names = extract_ids_names('v1.json')
ids2, names2 = extract_ids_names('v2.json')
ids += ids2
names += names2

id_df = pd.DataFrame(
    {'ID': ids, 'Title': names},
    columns=['ID', 'Title'],
    index=np.arange(1, len(ids) + 1),
)
id_df.head()  # not the cell's last expression, so nothing is displayed here
id_df.to_csv('all_steam_games.csv', index=False)



In [6]:
# Number of rows in the combined app table.
id_df.shape[0]

434807

In [210]:


# Keys of the record returned by extract_data, in output (column) order.
_RECORD_FIELDS = (
    'app_id', 'title', 'description', 'content', 'developer', 'publisher',
    'genre', 'release_date', 'price', 'software', 'player', 'month_reviews',
    'positive_ratio', 'total_reviews', 'tags_list', 'header_url',
    'image_url_list', 'image_small_url_list', 'video_urls_hd_list',
    'video_urls_480p_list',
)


def _empty_record(app_id):
    """Return a record with every field set to None except ``app_id``."""
    record = dict.fromkeys(_RECORD_FIELDS)
    record['app_id'] = app_id
    return record


def _find_in(node, *args, **kwargs):
    """Run ``node.find(...)``, passing a missing (None) parent straight through."""
    return node.find(*args, **kwargs) if node is not None else None


def _stripped_text(node):
    """Return the whitespace-stripped text of ``node``, or None if it is missing."""
    return node.text.strip() if node is not None else None


def _attr_values(nodes, attr):
    """Collect ``attr`` from every node that carries it; None if none do."""
    values = [
        node.get(attr)
        for node in nodes
        if node is not None and node.get(attr) is not None
    ]
    return values or None


def _parse_app_page(soup, app_id):
    """Pull the record fields out of a parsed store page.

    Every field is looked up independently, so a missing element only blanks
    its own field (None) and never its neighbours.
    """
    title_div = soup.find('div', class_='apphub_AppName')
    title = title_div.text if title_div is not None else None

    description = _stripped_text(soup.find('div', class_='game_description_snippet'))
    content = _stripped_text(_find_in(soup.find('div', class_='shared_game_rating'), 'p'))

    developer = _stripped_text(
        _find_in(soup.find('div', class_="summary column", id="developers_list"), 'a')
    )
    # The publisher link is read from the second 'dev_row'; a page with a
    # single row simply has no publisher and must not discard the developer.
    dev_rows = soup.find_all('div', class_='dev_row')
    publisher = _stripped_text(_find_in(dev_rows[1], 'a')) if len(dev_rows) > 1 else None

    details_block = soup.find('div', class_='details_block')
    genre = (
        [link.text for link in details_block.find_all('a')]
        if details_block is not None else None
    )

    release_date = _stripped_text(soup.find('div', class_='date'))
    price = _stripped_text(soup.find('div', class_='game_purchase_price'))

    sysreq_tabs = soup.find('div', class_='sysreq_tabs')
    software = None
    if sysreq_tabs is not None:
        software = [
            line.strip()
            for line in sysreq_tabs.text.strip("'").strip().split('\n')
        ]

    player = _stripped_text(
        _find_in(soup.find('a', class_='game_area_details_specs_ctn'), 'div', class_='label')
    )

    # month_reviews is the first review-summary span; the ratio and the total
    # are parsed from the last one. A regex miss blanks only its own field.
    month_reviews = positive_ratio = total_reviews = None
    review_spans = soup.find_all('span', class_='nonresponsive_hidden responsive_reviewdesc')
    if review_spans:
        month_reviews = review_spans[0].text.strip()
        all_reviews = review_spans[-1].text.strip()
        if 'positive' in all_reviews:
            ratio_match = re.search(r'(\d+)%', all_reviews)
            positive_ratio = ratio_match.group(1) if ratio_match else None
        total_match = re.search(r'(\d{1,10}(?:,\d{3})*) user reviews', all_reviews)
        total_reviews = total_match.group(1).replace(',', '') if total_match else None

    tags_div = soup.find('div', class_='glance_tags popular_tags')
    tag_links = tags_div.find_all('a') if tags_div is not None else []
    tags_list = [link.text.strip() for link in tag_links] or None

    header_img = soup.find('img', class_='game_header_image_full')
    header_url = header_img.get('src') if header_img is not None else None

    image_url_list = _attr_values(
        soup.find_all('a', class_='highlight_screenshot_link'), 'href'
    )
    thumbnail_imgs = [
        strip_item.find('img')
        for strip_item in soup.find_all(
            'div', class_='highlight_strip_item highlight_strip_screenshot'
        )
    ]
    image_small_url_list = _attr_values(thumbnail_imgs, 'src')

    # A movie element without one of the data attributes is skipped instead of
    # raising KeyError and aborting the whole record.
    video_link = soup.find_all('div', class_='highlight_player_item highlight_movie')
    video_urls_hd_list = _attr_values(video_link, 'data-mp4-hd-source')
    video_urls_480p_list = _attr_values(video_link, 'data-mp4-source')

    return {
        'app_id': app_id,
        'title': title,
        'description': description,
        'content': content,
        'developer': developer,
        'publisher': publisher,
        'genre': genre,
        'release_date': release_date,
        'price': price,
        'software': software,
        'player': player,
        'month_reviews': month_reviews,
        'positive_ratio': positive_ratio,
        'total_reviews': total_reviews,
        'tags_list': tags_list,
        'header_url': header_url,
        'image_url_list': image_url_list,
        'image_small_url_list': image_small_url_list,
        'video_urls_hd_list': video_urls_hd_list,
        'video_urls_480p_list': video_urls_480p_list
    }


def extract_data(i, timeout=30):
    """Download the store page of app ``i`` and extract its fields.

    Parameters
    ----------
    i : int or str
        App id, substituted into ``https://store.steampowered.com/app/{i}``.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    dict
        One record with the keys ``app_id``, ``title``, ``description``,
        ``content``, ``developer``, ``publisher``, ``genre``,
        ``release_date``, ``price``, ``software``, ``player``,
        ``month_reviews``, ``positive_ratio``, ``total_reviews``,
        ``tags_list``, ``header_url``, ``image_url_list``,
        ``image_small_url_list``, ``video_urls_hd_list`` and
        ``video_urls_480p_list``. A field that cannot be found on the page is
        None. If the page cannot be downloaded or parsed, the error is printed
        and every field except ``app_id`` is None.
    """
    url = f"https://store.steampowered.com/app/{i}"
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'lxml')
    except Exception as e:
        # Deliberately broad: one unreachable or unparsable page must not stop
        # a long scraping run. Report it and hand back a blank record.
        print(f"Failed to fetch app {i}: {e}")
        return _empty_record(i)

    return _parse_app_page(soup, i)
    


In [217]:
list_of_data = []
count = 5000  # number of app ids (from the top of id_df) to scrape in this run

ids = id_df['ID'].head(count).to_list()

try:
    for i in tqdm(ids, desc="Extracting Data"):
        list_of_data.append(extract_data(i))
finally:
    # Runs on normal completion and also when the loop is interrupted or
    # fails, so records scraped so far are written out rather than lost.
    # The original exception (e.g. KeyboardInterrupt) still propagates.
    df = pd.DataFrame(list_of_data)

    df.to_csv('steam_games_data.csv', index=False)

    print("Data saved successfully to file")


Extracting Data:   0%|          | 15/5000 [00:32<2:59:21,  2.16s/it]


KeyboardInterrupt: 