In [1]:
import os
import requests
import selenium
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
import webdriver_manager
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import re
import time
import tqdm
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed


In [2]:
def get_AppList(version):
    url = f"https://api.steampowered.com/ISteamApps/GetAppList/{version}/"
    try:
        r = requests.get(url)
        r.raise_for_status()
        data = r.json()
        apps = data['applist']['apps']
        if version == 'v2':
            return apps            
        else:
            return apps['app']
    except requests.exceptions.RequestException as e:
        return None

In [23]:
v1 = 'v1'
v2 = 'v2'

apps = get_AppList(v1)
df1 = pd.DataFrame(apps)

apps = get_AppList(v2)
df2 = pd.DataFrame(apps)

app_ids_df = pd.concat([df1, df2], ignore_index=True).drop_duplicates(subset='appid')  
app_ids_df.to_csv('all_steam_games.csv', index=False)

In [24]:
app_ids_df.tail()

Unnamed: 0,appid,name
219680,2884090,CAMPS Demo
219681,1440670,Zeepkist
219682,2827930,Critical Fishing
219683,3276080,Iron Cauldron - Guess the Colorblock
219684,3286030,Boardguard


In [5]:
df1.head()

Unnamed: 0,appid,name
0,1941401,
1,1897482,
2,2112761,
3,2016512,
4,1820332,


In [6]:
df2.head()

Unnamed: 0,appid,name
0,1941401,
1,1897482,
2,2112761,
3,2016512,
4,1820332,


In [7]:
def get_AppData(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        return soup
    except Exception as e:
        print(e)
        return None

In [8]:
def find_general_details(soup):
    """ 
        Input: soup (data of html)
        Output: title, description, content and genre of the game
    """
    
    title = soup.find('div', class_='apphub_AppName').text if soup.find('div', class_='apphub_AppName') else None
    description = soup.find('div', class_='game_description_snippet').text.strip() if soup.find('div', class_='game_description_snippet') else None
    try:
        content_div = soup.find('div', class_='shared_game_rating') 
        content = content_div.find('p').text.strip()
    except Exception as e:
        content = None
    try:
        genres = (soup.find('div', class_='details_block').find_all('a')) if soup.find('div', class_='details_block') else None
        genre = [g.text for g in genres]  
    except Exception as e: 
        genre = None   
    try:
        release_date = soup.find('div', class_='date').text.strip().replace(',', '')
    except Exception as e:
        release_date = None
        
    try:
        tags_div = soup.find('div', class_='glance_tags popular_tags').find_all('a')
        tags_all = [tag.text for tag in tags_div] if tags_div else None
        tags_list = [item.strip() for item in tags_all]
    except Exception as e:
        tags_list = None
        
    try:
        player = soup.find('a', class_='game_area_details_specs_ctn').find('div', class_='label').text.strip()
    except Exception as e:
        player = None
        
    return title, description, content, genre, player, tags_list, release_date

In [9]:
def extract_details(div, index):
    """Helper function to extract developer or publisher details."""
    try:
        detail_div = div[index].find('div', class_='summary column')
        link = detail_div.find('a').get('href')
        name = detail_div.text.strip()
        return link, name
    except (AttributeError, IndexError) as e:
        return None, None

def fetch_followers(link):
    """Helper function to fetch followers from a link."""
    if not link:
        return None
    try:
        soup = get_AppData(link)
        return soup.find('div', class_="num_followers").text.strip().replace(',', '')
    except Exception as e:
        return None

def find_developer_publisher_details(soup):
    """Find developer and publisher details and their followers."""
    try:
        divs = soup.find('div', class_="glance_ctn_responsive_left").find_all('div', 'dev_row')
        
        dev_link, developer = extract_details(divs, 0)
        pub_link, publisher = extract_details(divs, 1)

        dev_followers = fetch_followers(dev_link)
        pub_followers = fetch_followers(pub_link)

    except Exception as e:
        developer, publisher, dev_followers, pub_followers = None, None, None, None

    return developer, publisher, dev_followers, pub_followers


In [10]:
def find_price(soup):
    # Extract the regular price
    try:
        price = soup.find('div', class_='game_purchase_price price').text.replace('$', '').replace(',', '').replace('USD', '').strip()
    except AttributeError:
        price = None

    # Extract discount prices
    discount_prices = []
    try:
        discount_divs = soup.find_all('div', class_='discount_final_price')
        discount_prices = [dp.text.replace('$', '').replace(',', '').replace('USD', '').strip() for dp in discount_divs]
    except Exception as e:
        print(f"Error extracting discount prices: {e}")

    return price, discount_prices if discount_prices else None


In [11]:
def find_review_count(soup):
    try:
        review_div = soup.find_all('span', class_ = "nonresponsive_hidden responsive_reviewdesc")
        month = review_div[0]
        all_time = review_div[1]
        
        numbers = re.findall(r'\d{1,3}(?:,\d{3})*', month.text.strip())        
        parsed_numbers = [int(num.replace(',', '')) for num in numbers]
        positive_review_ratio_month = parsed_numbers[0] 
        month_reviews = parsed_numbers[1]          
        
        numbers = re.findall(r'\d{1,3}(?:,\d{3})*', all_time.text.strip())        
        parsed_numbers = [int(num.replace(',', '')) for num in numbers]
        positive_review_ratio_all_time = parsed_numbers[0] 
        total_reviews = parsed_numbers[1]
        
    except Exception as e:
        positive_review_ratio_month = None
        month_reviews = None
        positive_review_ratio_all_time = None
        total_reviews = None
        
    return month_reviews, positive_review_ratio_month, total_reviews, positive_review_ratio_all_time

In [12]:
def find_media_links(soup):
    try:
        header_url = soup.find('img', class_='game_header_image_full')['src']
    except:
        header_url = None
    try:
        image_link = soup.find_all('a', class_='highlight_screenshot_link') 
        image_url_list = [img['href'] for img in image_link] if image_link else None
    except:
        image_url_list = None
        
    try:
        image_link_bar = soup.find_all('div', class_='highlight_strip_item highlight_strip_screenshot')
        image_small_url_list = [img.find('img')['src'] for img in image_link_bar] if image_link_bar else None
    except:
        image_small_url_list= None
        

    video_link = soup.find_all('div', class_='highlight_player_item highlight_movie')
    video_urls_hd_list = [v_link['data-mp4-hd-source'] for v_link in video_link] if video_link else None
    video_urls_480p_list = [v_link['data-mp4-source'] for v_link in video_link] if video_link else None
    
    return header_url,image_url_list,image_small_url_list, video_urls_hd_list, video_urls_480p_list

In [13]:
def find_requirements(soup):
    try:
        lines_platform = soup.find('div', class_='sysreq_tabs')
        lines = lines_platform.text.strip("'").strip().split('\n')
        software = [item.strip() for item in lines]
    except Exception as e:
        software = None
        
    
    return software

In [14]:
def find_languages(soup):
    return [td.get_text(strip=True) for td in soup.select("td.ellipsis")]

In [15]:
def extract_data(appid):
    
    time.sleep(1) 
    url = f"https://store.steampowered.com/app/{appid}/"
    soup = get_AppData(url) 
    
    
    try:
        app_id = appid
        title, description, content, genre, player, tags_list, release_date = find_general_details(soup)
        developer, publisher, dev_followers, pub_followers = find_developer_publisher_details(soup) 
        price, discount_prices = find_price(soup)
        month_reviews, pos_ratio_month, total_reviews, pos_ratio_all = find_review_count(soup)
        header_url,image_url_list,image_small_url_list, video_urls_hd_list, video_urls_480p_list = find_media_links(soup)   
        software = find_requirements(soup)
        languages = find_languages(soup)
        data = {
        'app_id': app_id,
        'title': title,
        'description': description,
        'content': content,
        'developer': developer,
        'publisher': publisher,
        'dev_followers':dev_followers,
        'pub_followers' : pub_followers,
        'genre': genre,
        'release_date': release_date,
        'price_usd': price,
        'discount_price':discount_prices,
        'software': software,
        'player': player,
        'languages' : languages,
        'month_reviews': month_reviews,
        'positive_ratio_month': pos_ratio_month,  
        'total_reviews': total_reviews,
        'positive_ratio_all': pos_ratio_all,
        'tags_list': tags_list,
        'header_url': header_url,
        'image_url_list': image_url_list,
        'image_small_url_list': image_small_url_list,
        'video_urls_hd_list': video_urls_hd_list,
        'video_urls_480p_list': video_urls_480p_list
    }
    except Exception as e:
        print(e)
        data = None

    
    return data
    
    


In [16]:
test_id = ['2878980', '10', '1293830', '306130']

In [17]:
for i in test_id:
    display(extract_data(i))

{'app_id': '2878980',
 'title': 'NBA 2K25',
 'description': 'Command every court with authenticity and realism Powered by ProPLAY™, giving you ultimate control over how you play in NBA 2K25. Define your legacy in MyCAREER, MyTEAM, MyNBA, and The W.',
 'content': 'In-Game Purchases\r\nIn-Game Purchases (Includes Random Items)',
 'developer': 'Visual Concepts',
 'publisher': '2K',
 'dev_followers': None,
 'pub_followers': '355698',
 'genre': ['Simulation', 'Sports', 'Visual Concepts', '2K', 'NBA 2K'],
 'release_date': '3 Sep 2024',
 'price_usd': '69.96',
 'discount_price': ['36.74', '52.49'],
 'software': None,
 'player': 'Single-player',
 'languages': ['English',
  'French',
  'Italian',
  'German',
  'Spanish - Spain',
  'Japanese',
  'Korean',
  'Simplified Chinese',
  'Traditional Chinese'],
 'month_reviews': 1738,
 'positive_ratio_month': 57,
 'total_reviews': 5974,
 'positive_ratio_all': 57,
 'tags_list': ['Sports',
  'Basketball',
  'Simulation',
  'eSports',
  '3D',
  'Immersive 

{'app_id': '10',
 'title': 'Counter-Strike',
 'description': "Play the world's number 1 online action game. Engage in an incredibly realistic brand of terrorist warfare in this wildly popular team-based game. Ally with teammates to complete strategic missions. Take out enemy sites. Rescue hostages. Your role affects your team's success. Your team's success affects your role.",
 'content': 'Online interactivity, In-game chat',
 'developer': 'Valve',
 'publisher': 'Valve',
 'dev_followers': '761219',
 'pub_followers': '761219',
 'genre': ['Action', 'Valve', 'Valve'],
 'release_date': '1 Nov 2000',
 'price_usd': '5.49',
 'discount_price': ['7.14', '73.63', '8.66'],
 'software': ['Windows', '', 'macOS', '', 'SteamOS + Linux'],
 'player': 'Online PvP',
 'languages': ['English',
  'French',
  'German',
  'Italian',
  'Spanish - Spain',
  'Simplified Chinese',
  'Traditional Chinese',
  'Korean'],
 'month_reviews': 877,
 'positive_ratio_month': 96,
 'total_reviews': 153355,
 'positive_ratio_a

{'app_id': '1293830',
 'title': 'Forza Horizon 4',
 'description': 'Dynamic seasons change everything at the world’s greatest automotive festival. Go it alone or team up with others to explore beautiful and historic Britain in a shared open world.',
 'content': 'Online',
 'developer': 'Playground Games',
 'publisher': 'Xbox Game Studios',
 'dev_followers': '721319',
 'pub_followers': '721319',
 'genre': ['Racing', 'Playground Games', 'Xbox Game Studios', 'Forza'],
 'release_date': '9 Mar 2021',
 'price_usd': None,
 'discount_price': ['3.99', '5.19', '6.39'],
 'software': None,
 'player': 'Single-player',
 'languages': ['English',
  'French',
  'Italian',
  'German',
  'Spanish - Spain',
  'Czech',
  'Hungarian',
  'Japanese',
  'Korean',
  'Polish',
  'Portuguese - Brazil',
  'Russian',
  'Simplified Chinese',
  'Spanish - Latin America',
  'Traditional Chinese',
  'Turkish'],
 'month_reviews': 7008,
 'positive_ratio_month': 92,
 'total_reviews': 238996,
 'positive_ratio_all': 90,
 'ta

{'app_id': '306130',
 'title': 'The Elder Scrolls® Online',
 'description': 'Join over 22 million players in the award-winning online multiplayer RPG and experience limitless adventure in a persistent Elder Scrolls world. Battle, craft, steal, or explore, and combine different types of equipment and abilities to create your own style of play. No game subscription required.',
 'content': 'Violence\r\nIn-Game Purchases (Includes Random Items)',
 'developer': 'ZeniMax Online Studios',
 'publisher': 'Bethesda Softworks',
 'dev_followers': '1083388',
 'pub_followers': '1083388',
 'genre': ['Action',
  'Adventure',
  'Massively Multiplayer',
  'RPG',
  'ZeniMax Online Studios',
  'Bethesda Softworks',
  'The Elder Scrolls'],
 'release_date': '4 Apr 2014',
 'price_usd': '19.99',
 'discount_price': None,
 'software': ['Windows', '', 'macOS'],
 'player': 'MMO',
 'languages': ['English',
  'French',
  'German',
  'Russian',
  'Spanish - Spain',
  'Simplified Chinese'],
 'month_reviews': 659,
 'p

In [21]:



def save_batch_to_csv(batch_number, df):
    os.makedirs('scraped_data', exist_ok=True)
    file_path = f'scraped_data/steam_games_data_batch_{batch_number}.csv'
    df.to_csv(file_path, index=False)
    print(f"Batch {batch_number} saved successfully to {file_path}")

def process_batch(batch_number, app_ids):
    list_of_data = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(extract_data, app_id) for app_id in app_ids]
        for future in tqdm(as_completed(futures), total=len(futures), desc=f"Processing Batch {batch_number}"):
            list_of_data.append(future.result())

    df_scraped = pd.DataFrame(list_of_data)
    save_batch_to_csv(batch_number, df_scraped)

# Assuming app_ids_df is already defined and contains your data
a = 8 * 1000  # Starting from the 9th batch if 8 batches are done
b = len(app_ids_df)
batch_size = 1000

for batch_start in range(a, b, batch_size):
    batch_number = batch_start // batch_size + 1
    ids = app_ids_df['appid'][batch_start:batch_start + batch_size].to_list()
    process_batch(batch_number, ids)


Extracting Data Batch 8: 100%|███████████████████████████████████████████████████████| 100/100 [07:49<00:00,  4.70s/it]


Batch 8 saved successfully to scraped_data/steam_games_data_batch_8.csv


Extracting Data Batch 108: 100%|█████████████████████████████████████████████████████| 100/100 [08:45<00:00,  5.25s/it]


Batch 108 saved successfully to scraped_data/steam_games_data_batch_108.csv


Extracting Data Batch 208: 100%|█████████████████████████████████████████████████████| 100/100 [08:40<00:00,  5.21s/it]


Batch 208 saved successfully to scraped_data/steam_games_data_batch_208.csv


Extracting Data Batch 308:   0%|                                                               | 0/100 [00:01<?, ?it/s]


KeyboardInterrupt: 

In [None]:
url = "https://store.steampowered.com/charts/topselling/global" 
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get(url) 


In [None]:
tbody_element = driver.find_element(By.TAG_NAME, "tbody")
tbody_html = tbody_element.get_attribute("outerHTML")
soup = BeautifulSoup(tbody_html, 'lxml')

In [None]:
app_ids = []
ranks = []
urls = []

# Loop through each row in the <tbody> tag
for row in soup.find_all('tr', class_="_2-RN6nWOY56sNmcDHu069P"):
    # Extract the rank
    rank = row.find('td', class_="_34h48M_x9S-9Q2FFPX_CcU").get_text(strip=True)
    ranks.append(rank)
    
    # Extract the URL and app id
    link = row.find('a', class_="_2C5PJOUH6RqyuBNEwaCE9X")
    if link:
        url = link['href']
        urls.append(url)
        
        # Extract app id from the URL using regex
        app_id = re.search(r'/app/(\d+)/', url)
        app_ids.append(app_id.group(1) if app_id else None)

# Create a DataFrame
df_top_100 = pd.DataFrame({
    'appid': app_ids,
    'rank': ranks,
    'url': urls
})

# Save the DataFrame to a CSV file
df_top_100.to_csv("top_100_games.csv", index=False)

print("Data saved to top_100_games.csv")

In [None]:
df_top_100.tail()

In [None]:
df_100 = pd.read_csv('top_100_games.csv')

In [None]:
df1_unique = app_ids_df.drop_duplicates(subset='appid')
df2_unique = df_100.drop_duplicates(subset='appid')
common_apps = pd.merge(df1_unique, df2_unique, on='appid', how='inner').sort_values(by='rank', ascending=True) 

In [None]:
common_apps

In [None]:
import os
import requests
import selenium
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
import webdriver_manager
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import re
import tqdm
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed


def get_AppList(version):
    url = f"https://api.steampowered.com/ISteamApps/GetAppList/{version}/"
    try:
        r = requests.get(url)
        r.raise_for_status()
        data = r.json()
        apps = data['applist']['apps']
        if version == 'v2':
            return apps            
        else:
            return apps['app']
    except requests.exceptions.RequestException as e:
        return None

v1 = 'v1'
v2 = 'v2'

apps = get_AppList(v1)
df1 = pd.DataFrame(apps)

apps = get_AppList(v2)
df2 = pd.DataFrame(apps)

app_ids_df = pd.concat([df1, df2], ignore_index=True).drop_duplicates(subset='appid')  
app_ids_df.to_csv('all_steam_games.csv', index=False)

app_ids_df.tail()

df1.head()

df2.head()

def get_AppData(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        return soup
    except Exception as e:
        print(e)
        return None

def find_general_details(soup):
    """ 
        Input: soup (data of html)
        Output: title, description, content and genre of the game
    """
    
    title = soup.find('div', class_='apphub_AppName').text if soup.find('div', class_='apphub_AppName') else None
    description = soup.find('div', class_='game_description_snippet').text.strip() if soup.find('div', class_='game_description_snippet') else None
    try:
        content_div = soup.find('div', class_='shared_game_rating') 
        content = content_div.find('p').text.strip()
    except Exception as e:
        content = None
    try:
        genres = (soup.find('div', class_='details_block').find_all('a')) if soup.find('div', class_='details_block') else None
        genre = [g.text for g in genres]  
    except Exception as e: 
        genre = None   
    try:
        release_date = soup.find('div', class_='date').text.strip().replace(',', '')
    except Exception as e:
        release_date = None
        
    try:
        tags_div = soup.find('div', class_='glance_tags popular_tags').find_all('a')
        tags_all = [tag.text for tag in tags_div] if tags_div else None
        tags_list = [item.strip() for item in tags_all]
    except Exception as e:
        tags_list = None
        
    try:
        player = soup.find('a', class_='game_area_details_specs_ctn').find('div', class_='label').text.strip()
    except Exception as e:
        player = None
        
    return title, description, content, genre, player, tags_list, release_date

def extract_details(div, index):
    """Helper function to extract developer or publisher details."""
    try:
        detail_div = div[index].find('div', class_='summary column')
        link = detail_div.find('a').get('href')
        name = detail_div.text.strip()
        return link, name
    except (AttributeError, IndexError) as e:
        return None, None

def fetch_followers(link):
    """Helper function to fetch followers from a link."""
    if not link:
        return None
    try:
        soup = get_AppData(link)
        return soup.find('div', class_="num_followers").text.strip().replace(',', '')
    except Exception as e:
        return None

def find_developer_publisher_details(soup):
    """Find developer and publisher details and their followers."""
    try:
        divs = soup.find('div', class_="glance_ctn_responsive_left").find_all('div', 'dev_row')
        
        dev_link, developer = extract_details(divs, 0)
        pub_link, publisher = extract_details(divs, 1)

        dev_followers = fetch_followers(dev_link)
        pub_followers = fetch_followers(pub_link)

    except Exception as e:
        developer, publisher, dev_followers, pub_followers = None, None, None, None

    return developer, publisher, dev_followers, pub_followers


def find_price(soup):
    # Extract the regular price
    try:
        price = soup.find('div', class_='game_purchase_price price').text.replace('$', '').replace(',', '').replace('USD', '').strip()
    except AttributeError:
        price = None

    # Extract discount prices
    discount_prices = []
    try:
        discount_divs = soup.find_all('div', class_='discount_final_price')
        discount_prices = [dp.text.replace('$', '').replace(',', '').replace('USD', '').strip() for dp in discount_divs]
    except Exception as e:
        print(f"Error extracting discount prices: {e}")

    return price, discount_prices if discount_prices else None


def find_review_count(soup):
    try:
        review_div = soup.find_all('span', class_ = "nonresponsive_hidden responsive_reviewdesc")
        month = review_div[0]
        all_time = review_div[1]
        
        numbers = re.findall(r'\d{1,3}(?:,\d{3})*', month.text.strip())        
        parsed_numbers = [int(num.replace(',', '')) for num in numbers]
        positive_review_ratio_month = parsed_numbers[0] 
        month_reviews = parsed_numbers[1]          
        
        numbers = re.findall(r'\d{1,3}(?:,\d{3})*', all_time.text.strip())        
        parsed_numbers = [int(num.replace(',', '')) for num in numbers]
        positive_review_ratio_all_time = parsed_numbers[0] 
        total_reviews = parsed_numbers[1]
        
    except Exception as e:
        positive_review_ratio_month = None
        month_reviews = None
        positive_review_ratio_all_time = None
        total_reviews = None
        
    return month_reviews, positive_review_ratio_month, total_reviews, positive_review_ratio_all_time

def find_media_links(soup):
    try:
        header_url = soup.find('img', class_='game_header_image_full')['src']
    except:
        header_url = None
    try:
        image_link = soup.find_all('a', class_='highlight_screenshot_link') 
        image_url_list = [img['href'] for img in image_link] if image_link else None
    except:
        image_url_list = None
        
    try:
        image_link_bar = soup.find_all('div', class_='highlight_strip_item highlight_strip_screenshot')
        image_small_url_list = [img.find('img')['src'] for img in image_link_bar] if image_link_bar else None
    except:
        image_small_url_list= None
        

    video_link = soup.find_all('div', class_='highlight_player_item highlight_movie')
    video_urls_hd_list = [v_link['data-mp4-hd-source'] for v_link in video_link] if video_link else None
    video_urls_480p_list = [v_link['data-mp4-source'] for v_link in video_link] if video_link else None
    
    return header_url,image_url_list,image_small_url_list, video_urls_hd_list, video_urls_480p_list

def find_requirements(soup):
    try:
        lines_platform = soup.find('div', class_='sysreq_tabs')
        lines = lines_platform.text.strip("'").strip().split('\n')
        software = [item.strip() for item in lines]
    except Exception as e:
        software = None
        
    
    return software

def find_languages(soup):
    return [td.get_text(strip=True) for td in soup.select("td.ellipsis")]

def extract_data(appid):
    url = f"https://store.steampowered.com/app/{appid}/"
    soup = get_AppData(url) 
    
    try:
        app_id = appid
        title, description, content, genre, player, tags_list, release_date = find_general_details(soup)
        developer, publisher, dev_followers, pub_followers = find_developer_publisher_details(soup) 
        price, discount_prices = find_price(soup)
        month_reviews, pos_ratio_month, total_reviews, pos_ratio_all = find_review_count(soup)
        header_url,image_url_list,image_small_url_list, video_urls_hd_list, video_urls_480p_list = find_media_links(soup)   
        software = find_requirements(soup)
        languages = find_languages(soup)
        data = {
        'app_id': app_id,
        'title': title,
        'description': description,
        'content': content,
        'developer': developer,
        'publisher': publisher,
        'dev_followers':dev_followers,
        'pub_followers' : pub_followers,
        'genre': genre,
        'release_date': release_date,
        'price_usd': price,
        'discount_price':discount_prices,
        'software': software,
        'player': player,
        'languages' : languages,
        'month_reviews': month_reviews,
        'positive_ratio_month': pos_ratio_month,  
        'total_reviews': total_reviews,
        'positive_ratio_all': pos_ratio_all,
        'tags_list': tags_list,
        'header_url': header_url,
        'image_url_list': image_url_list,
        'image_small_url_list': image_small_url_list,
        'video_urls_hd_list': video_urls_hd_list,
        'video_urls_480p_list': video_urls_480p_list
    }
    except Exception as e:
        print(e)
        data = None

    
    return data
    
    


test_id = ['2878980', '10', '1293830', '306130']

for i in test_id:
    display(extract_data(i))


# Placeholder for the actual data extraction function
def extract_data(app_id):
    import time
    time.sleep(1)  # Simulate network delay
    return {'app_id': app_id, 'data': 'Sample data'}

def save_batch_to_csv(batch_number, df):
    os.makedirs('scraped_data', exist_ok=True)
    file_path = f'scraped_data/steam_games_data_batch_{batch_number}.csv'
    df.to_csv(file_path, index=False)
    print(f"Batch {batch_number} saved successfully to {file_path}")

def process_batch(batch_number, app_ids):
    list_of_data = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(extract_data, app_id) for app_id in app_ids]
        for future in tqdm(as_completed(futures), total=len(futures), desc=f"Processing Batch {batch_number}"):
            list_of_data.append(future.result())

    df_scraped = pd.DataFrame(list_of_data)
    save_batch_to_csv(batch_number, df_scraped)

# Assuming app_ids_df is already defined and contains your data
a = 8 * 1000  # Starting from the 9th batch if 8 batches are done
b = len(app_ids_df)
batch_size = 1000

for batch_start in range(a, b, batch_size):
    batch_number = batch_start // batch_size + 1
    ids = app_ids_df['appid'][batch_start:batch_start + batch_size].to_list()
    process_batch(batch_number, ids)


url = "https://store.steampowered.com/charts/topselling/global" 
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get(url) 


tbody_element = driver.find_element(By.TAG_NAME, "tbody")
tbody_html = tbody_element.get_attribute("outerHTML")
soup = BeautifulSoup(tbody_html, 'lxml')

app_ids = []
ranks = []
urls = []

# Loop through each row in the <tbody> tag
for row in soup.find_all('tr', class_="_2-RN6nWOY56sNmcDHu069P"):
    # Extract the rank
    rank = row.find('td', class_="_34h48M_x9S-9Q2FFPX_CcU").get_text(strip=True)
    ranks.append(rank)
    
    # Extract the URL and app id
    link = row.find('a', class_="_2C5PJOUH6RqyuBNEwaCE9X")
    if link:
        url = link['href']
        urls.append(url)
        
        # Extract app id from the URL using regex
        app_id = re.search(r'/app/(\d+)/', url)
        app_ids.append(app_id.group(1) if app_id else None)

# Create a DataFrame
df_top_100 = pd.DataFrame({
    'appid': app_ids,
    'rank': ranks,
    'url': urls
})

# Save the DataFrame to a CSV file
df_top_100.to_csv("top_100_games.csv", index=False)

print("Data saved to top_100_games.csv")

df_top_100.tail()

df_100 = pd.read_csv('top_100_games.csv')



df1_unique = app_ids_df.drop_duplicates(subset='appid')
df2_unique = df_100.drop_duplicates(subset='appid')
common_apps = pd.merge(df1_unique, df2_unique, on='appid', how='inner').sort_values(by='rank', ascending=True) 

common_apps

