In [10]:
pip install requests beautifulsoup4 pandas fake-useragent

Defaulting to user installation because normal site-packages is not writeable
Collecting fake-useragent
  Downloading fake_useragent-2.2.0-py3-none-any.whl.metadata (17 kB)
Downloading fake_useragent-2.2.0-py3-none-any.whl (161 kB)
Installing collected packages: fake-useragent
Successfully installed fake-useragent-2.2.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
# NOTE(review): leftover fragment. `self` is not defined at cell level, so this
# cell cannot run on its own (it has no execution count). It looks like it was
# copied out of a class body; the scraper cells below use fake_useragent
# instead — confirm and remove.
self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
        ]

# Safe code: scrapes only the first page of reviews

In [31]:
import os
import glob
import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time

# Folder configuration: input CSVs are read from INPUT_DIR, results are written to OUTPUT_DIR
INPUT_DIR = 'OutputData'
OUTPUT_DIR = 'SteamReviews'
os.makedirs(OUTPUT_DIR, exist_ok=True)  # create the output folder up front; no error if it already exists

# Scraping configuration
BASE_URL = "https://steamcommunity.com/profiles/"
HEADERS = {'User-Agent': UserAgent().random}  # one random User-Agent, picked once when this cell runs
DELAY = 5  # Anti-ban delay (seconds)

def _rating_label(rating_elem):
    """Translate a review's title element into a rating label.

    Returns "Recommended", "Not Recommended", or "N/A" when the element is
    missing or its text contains neither label.
    """
    if rating_elem is None:
        return "N/A"
    text = rating_elem.get_text(strip=True)
    # "Not Recommended" contains the substring "Recommended", so the negative
    # label has to be tested first.
    if "Not Recommended" in text:
        return "Not Recommended"
    if "Recommended" in text:
        return "Recommended"
    return "N/A"


def scrape_user_reviews(steam64_id):
    """Scrape the first page of a Steam user's reviews.

    Args:
        steam64_id: The user's SteamID64, used to build the profile URL.

    Returns:
        A list of dicts. Normally one dict per review with the keys
        'UserID', 'Game', 'Rating', 'Review' and 'Playtime' (fields whose
        element is missing are set to "N/A"). When no review could be
        collected the list holds a single dict with a "Status" key (no
        reviews on the page) or an "Error" key (non-200 response or any
        exception raised while fetching/parsing the page).
    """
    url = f"{BASE_URL}{steam64_id}/recommended/"
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        if response.status_code != 200:
            return [{"Error": f"HTTP {response.status_code}", "UserID": steam64_id}]

        soup = BeautifulSoup(response.text, 'html.parser')

        # Look the review boxes up once; an empty result means the page has no reviews.
        review_boxes = soup.find_all('div', class_='review_box')
        if not review_boxes:
            return [{"Status": "No reviews found", "UserID": steam64_id}]

        reviews = []
        for review in review_boxes:
            try:
                game_elem = review.find('a', class_='hover_item_name')
                rating_elem = review.find('div', class_='title')
                content_elem = review.find('div', class_='content')
                hours_elem = review.find('div', class_='hours')

                reviews.append({
                    'UserID': steam64_id,
                    'Game': game_elem.get_text(strip=True) if game_elem else "N/A",
                    'Rating': _rating_label(rating_elem),
                    'Review': content_elem.get_text(strip=True, separator=' ') if content_elem else "N/A",
                    'Playtime': hours_elem.get_text(strip=True) if hours_elem else "N/A"
                })
            except Exception as e:
                # One malformed review must not discard the rest of the page.
                print(f"Error parsing review for {steam64_id}: {str(e)}")
                continue

        return reviews if reviews else [{"Status": "No valid reviews", "UserID": steam64_id}]

    except Exception as e:
        # Best effort: report the failure as a row instead of aborting the whole run.
        return [{"Error": str(e), "UserID": steam64_id}]

def process_csv(input_path):
    """Scrape reviews for every valid Steam ID listed in one CSV file.

    Args:
        input_path: Path to a CSV file that should contain a 'Steam ID' column.

    Returns:
        A DataFrame with one row per record returned by scrape_user_reviews,
        or None when the column is missing or no record was collected.
    """
    # Read every column as text. If 'Steam ID' has any empty cell, pandas would
    # otherwise parse it as float64, which cannot represent 17-digit IDs
    # exactly, and every ID would then fail the validation below.
    df = pd.read_csv(input_path, dtype=str)
    if 'Steam ID' not in df.columns:
        print(f"File {input_path} tidak memiliki kolom 'Steam ID'")
        return None

    all_reviews = []
    for steam_id in df['Steam ID'].dropna().unique():
        steam_id = str(steam_id).strip()
        # A SteamID64 is expected to be exactly 17 digits.
        if len(steam_id) != 17 or not steam_id.isdigit():
            print(f"ID tidak valid: {steam_id}")
            continue

        print(f"Memproses SteamID: {steam_id}")
        reviews = scrape_user_reviews(steam_id)
        if reviews:
            all_reviews.extend(reviews)
        # Pause between users to avoid hammering the site.
        time.sleep(DELAY)

    return pd.DataFrame(all_reviews) if all_reviews else None

def main():
    """Run the scraper over every CSV in INPUT_DIR.

    For each input file the reviews returned by process_csv are written to
    OUTPUT_DIR as ``reviews_<input file name>``; files that yield no data are
    reported and skipped.
    """
    pattern = os.path.join(INPUT_DIR, '*.csv')
    sources = glob.glob(pattern)

    if len(sources) == 0:
        print(f"Tidak ditemukan file CSV di folder {INPUT_DIR}")
        return

    for source_path in sources:
        base_name = os.path.basename(source_path)

        print(f"\nMemproses file: {base_name}")
        frame = process_csv(source_path)

        if frame is None:
            print(f"❌ Tidak ada data yang berhasil di-scrape dari {base_name}")
            continue

        target_path = os.path.join(OUTPUT_DIR, f"reviews_{base_name}")
        frame.to_csv(target_path, index=False)
        print(f"✅ Data disimpan di: {target_path}")

# Entry point: start the scrape when this code runs as the main module
# (which is the case when the notebook cell is executed).
if __name__ == "__main__":
    main()



Memproses file: processed_data_part_10.csv
Memproses SteamID: 76561197992939661


KeyboardInterrupt: 

# Code for All Pages

In [1]:
import os
import glob
import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time
import random
import re

# Folder configuration: input CSVs are read from INPUT_DIR, results are written to OUTPUT_DIR
INPUT_DIR = 'Dataset'
OUTPUT_DIR = 'SteamReviews_2'
os.makedirs(OUTPUT_DIR, exist_ok=True)  # create the output folder up front; no error if it already exists

# Scraping configuration
BASE_URL = "https://steamcommunity.com/profiles/"
# NOTE(review): HEADERS is not referenced by the functions in this cell, which
# build a fresh User-Agent header for each request — confirm it can be dropped.
HEADERS = {'User-Agent': UserAgent().random}
MIN_DELAY = 3  # Minimum delay (seconds)
MAX_DELAY = 7  # Maximum delay (seconds)
MAX_PAGES = 50  # Maximum number of pages
MAX_ATTEMPTS = 3  # Maximum attempts per page

def get_random_delay():
    """Return a random wait time in seconds, drawn uniformly between MIN_DELAY and MAX_DELAY."""
    shortest, longest = MIN_DELAY, MAX_DELAY
    wait_seconds = random.uniform(shortest, longest)
    return wait_seconds

def _rating_label(rating_elem):
    """Translate a review's title element into a rating label.

    Returns "Recommended", "Not Recommended", or "N/A" when the element is
    missing or its text contains neither label.
    """
    if rating_elem is None:
        return "N/A"
    text = rating_elem.get_text(strip=True)
    # "Not Recommended" contains the substring "Recommended", so the negative
    # label has to be tested first.
    if "Not Recommended" in text:
        return "Not Recommended"
    if "Recommended" in text:
        return "Recommended"
    return "N/A"


def scrape_user_reviews(steam64_id):
    """Scrape every page of a Steam user's reviews, up to MAX_PAGES.

    Every request is preceded by a random delay and sent with a random
    User-Agent. A request that raises a requests exception, or that comes
    back with a 4xx/5xx status, is retried up to MAX_ATTEMPTS times with a
    longer pause in between.

    Args:
        steam64_id: The user's SteamID64, used to build the profile URL.

    Returns:
        A list of dicts. Normally one dict per review with the keys
        'UserID', 'Page', 'Game', 'Rating', 'Review', 'Playtime' and
        'PostedDate' (fields whose element is missing are set to "N/A").
        When nothing was collected the list holds a single dict with a
        "Status" key (first page has no reviews) or an "Error" key (Steam
        error page, or all attempts failed). Reviews gathered before a
        failure on a later page are returned as they are.
    """
    all_reviews = []
    # Build the User-Agent source once per user; `.random` is still read for
    # every request so each one can carry a different agent string.
    user_agent = UserAgent()

    for page in range(1, MAX_PAGES + 1):
        url = f"{BASE_URL}{steam64_id}/recommended/"
        if page > 1:
            url += f"?p={page}"

        soup = None
        for attempt in range(1, MAX_ATTEMPTS + 1):
            delay = get_random_delay()
            time.sleep(delay)
            try:
                response = requests.get(url, headers={'User-Agent': user_agent.random}, timeout=15)

                # Steam's own error page is final: retrying would not help.
                if "Steam Community :: Error" in response.text:
                    print(f"[!] Steam error page detected for {steam64_id}")
                    return all_reviews if all_reviews else [{"Error": "Steam Error Page", "UserID": steam64_id}]

                # Turn 4xx/5xx responses (e.g. rate limiting) into an exception so
                # they are retried instead of being mistaken for "no reviews".
                response.raise_for_status()

                soup = BeautifulSoup(response.text, 'html.parser')
                break
            except requests.exceptions.RequestException as e:
                print(f"Attempt {attempt} failed for page {page}: {str(e)}")
                if attempt >= MAX_ATTEMPTS:
                    print(f"Max attempts reached for {steam64_id}")
                    return all_reviews if all_reviews else [{"Error": "Max attempts reached", "UserID": steam64_id}]
                time.sleep(delay * 2)  # wait longer after a failure

        if soup is None:
            # Only reachable when MAX_ATTEMPTS is below 1; stop instead of looping forever.
            return all_reviews if all_reviews else [{"Error": "Max attempts reached", "UserID": steam64_id}]

        review_boxes = soup.find_all('div', class_='review_box')
        if not review_boxes:
            if page == 1:
                return [{"Status": "No reviews found", "UserID": steam64_id}]
            return all_reviews

        for review in review_boxes:
            try:
                game_elem = review.find('a', class_='hover_item_name')
                rating_elem = review.find('div', class_='title')
                content_elem = review.find('div', class_='content')
                hours_elem = review.find('div', class_='hours')
                date_elem = review.find('div', class_='postedDate')

                all_reviews.append({
                    'UserID': steam64_id,
                    'Page': page,
                    'Game': game_elem.get_text(strip=True) if game_elem else "N/A",
                    'Rating': _rating_label(rating_elem),
                    'Review': content_elem.get_text(strip=True, separator=' ') if content_elem else "N/A",
                    'Playtime': hours_elem.get_text(strip=True) if hours_elem else "N/A",
                    'PostedDate': date_elem.get_text(strip=True) if date_elem else "N/A"
                })
            except Exception as e:
                # One malformed review must not discard the rest of the page.
                print(f"Error parsing review: {str(e)}")
                continue

        # No "next page" button means this was the last page.
        if not soup.find('a', class_='pagebtn', string='>'):
            return all_reviews

    print(f"Warning: Reached max page limit ({MAX_PAGES}) for {steam64_id}")
    return all_reviews

def process_csv(input_path):
    """Scrape reviews for every valid Steam ID listed in one CSV file.

    Args:
        input_path: Path to a CSV file that should contain a 'Steam ID' column.

    Returns:
        A DataFrame with one row per record returned by scrape_user_reviews,
        or None when the column is missing or no record was collected.
    """
    # Read every column as text. If 'Steam ID' has any empty cell, pandas would
    # otherwise parse it as float64, which cannot represent 17-digit IDs
    # exactly, and every ID would then fail the validation below.
    df = pd.read_csv(input_path, dtype=str)
    if 'Steam ID' not in df.columns:
        print(f"File {input_path} tidak memiliki kolom 'Steam ID'")
        return None

    all_reviews = []
    for steam_id in df['Steam ID'].dropna().unique():
        steam_id = str(steam_id).strip()
        # A SteamID64 is expected to be exactly 17 digits.
        if len(steam_id) != 17 or not steam_id.isdigit():
            print(f"ID tidak valid: {steam_id}")
            continue

        print(f"\nMemproses SteamID: {steam_id}")
        reviews = scrape_user_reviews(steam_id)
        if reviews:
            all_reviews.extend(reviews)

        # Delay between users
        time.sleep(get_random_delay())

    return pd.DataFrame(all_reviews) if all_reviews else None

def main():
    """Run the scraper over every CSV in INPUT_DIR.

    For each input file the reviews returned by process_csv are written to
    OUTPUT_DIR as ``reviews_<input file name>``; files that yield no data are
    reported and skipped.
    """
    pattern = os.path.join(INPUT_DIR, '*.csv')
    sources = glob.glob(pattern)

    if len(sources) == 0:
        print(f"Tidak ditemukan file CSV di folder {INPUT_DIR}")
        return

    for source_path in sources:
        base_name = os.path.basename(source_path)

        print(f"\nMemproses file: {base_name}")
        frame = process_csv(source_path)

        if frame is None:
            print(f"❌ Tidak ada data yang berhasil di-scrape dari {base_name}")
            continue

        # Save the collected reviews next to the other outputs.
        target_path = os.path.join(OUTPUT_DIR, f"reviews_{base_name}")
        frame.to_csv(target_path, index=False)
        print(f"✅ Data disimpan di: {target_path}")

# Entry point: start the scrape when this code runs as the main module
# (which is the case when the notebook cell is executed).
if __name__ == "__main__":
    main()



Memproses file: processed_data_part_10.csv

Memproses SteamID: 76561198881119093

Memproses SteamID: 76561198151246013

Memproses SteamID: 76561199023634809

Memproses SteamID: 76561198068123705

Memproses SteamID: 76561198146303292

Memproses SteamID: 76561198355865927

Memproses SteamID: 76561198095282532

Memproses SteamID: 76561198084472832

Memproses SteamID: 76561198070531704

Memproses SteamID: 76561198856374188

Memproses SteamID: 76561198011248114
✅ Data disimpan di: SteamReviews_2/reviews_processed_data_part_10.csv

Memproses file: processed_data_part_11.csv

Memproses SteamID: 76561199027285610

Memproses SteamID: 76561198961717161

Memproses SteamID: 76561198262826340

Memproses SteamID: 76561198116934331

Memproses SteamID: 76561198796747013

Memproses SteamID: 76561198245411354

Memproses SteamID: 76561199025463178

Memproses SteamID: 76561198356057139

Memproses SteamID: 76561198025679656
✅ Data disimpan di: SteamReviews_2/reviews_processed_data_part_11.csv

Memproses f