In [9]:
import csv
import sqlite3
import requests
from bs4 import BeautifulSoup
import os
from dotenv import load_dotenv
import tempfile
from datetime import datetime
import pandas as pd
import re
import logging
import time
import random
from requests.exceptions import RequestException
import pytz
from tqdm import tqdm



# Load environment variables (python-dotenv)
load_dotenv()

# Logging configuration: records at INFO level and above are written to
# 'scrap_populate.log', formatted as "<timestamp> - <level> - <message>"
logging.basicConfig(filename='scrap_populate.log',level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Configuration constants
# NOTE(review): none of the constants below are referenced by the cells visible
# in this notebook; presumably they are shared with sibling scripts — confirm
# before removing.
GITHUB_REPO = 'steampage-creation-date'
CSV_FILE_PATH = 'steam_games.csv'
TIMESTAMP_FILE = 'tweet_each_day/timestamp_last_tweet.txt'
PARIS_TZ = pytz.timezone('Europe/Paris')
AUTHORIZED_TYPES = ["game", "dlc", 'demo', 'beta', '']

In [10]:
def init_ultimate_database(db_path='all-steampages-data.db'):
    """Open the SQLite database and make sure the ``steam_games`` table exists.

    Args:
        db_path: Path of the SQLite file to open (or ``':memory:'``). Defaults
            to the file name that was previously hard-coded, so existing
            callers are unaffected.

    Returns:
        An open ``sqlite3.Connection``. The caller is responsible for
        committing further work and closing it.

    Raises:
        sqlite3.Error: If the table cannot be created. The connection is
            closed before the exception propagates so it does not leak.
    """
    conn = sqlite3.connect(db_path)
    try:
        # CREATE TABLE IF NOT EXISTS makes this call safe to repeat on an
        # already-initialised database.
        conn.execute('''
    CREATE TABLE IF NOT EXISTS steam_games (
        game_id INTEGER PRIMARY KEY,
        add_date INTEGER,
        type TEXT,
        dev TEXT,
        publisher TEXT,
        release_date INTEGER,
        description TEXT,
        nb_reviews INTEGER,
        free INTEGER,
        dlc INTEGER,
        dlc_list TEXT,
        price TEXT,
        metacritic INTEGER,
        genres TEXT,
        singleplayer INTEGER,
        multiplayer INTEGER,
        coop INTEGER,
        online_coop INTEGER,
        lan_coop INTEGER,
        shared_split_screen_coop INTEGER,
        shared_split_screen INTEGER,
        pvp INTEGER,
        lan_pvp INTEGER,
        shared_split_screen_pvp INTEGER,
        achievements INTEGER,
        full_controller_support INTEGER,
        trading_cards INTEGER,
        steam_cloud INTEGER,
        remote_play_phone INTEGER,
        remote_play_tablet INTEGER,
        remote_play_together INTEGER,
        remote_play_tv INTEGER,
        family_sharing INTEGER,
        captions_available INTEGER,
        inapp_purchases INTEGER,
        early_access INTEGER,
        vr_only INTEGER,
        vr_supported INTEGER,
        online_pvp INTEGER,
        required_age INTEGER,
        controller_support TEXT,
        categories TEXT,
        website TEXT,
        support_mail TEXT,
        support_url TEXT,
        cd_some_nudity_or_sexual_content INTEGER,
        cd_frequent_violence_gore INTEGER,
        cd_adult_only_sexual_content INTEGER,
        cd_frequent_nudity_or_sexual_content INTEGER,
        cd_general_mature_content INTEGER,
        lg_en INTEGER,
        lg_ger INTEGER,
        lg_spa INTEGER,
        lg_jap INTEGER,
        lg_portuguese INTEGER,
        lg_russian INTEGER,
        lg_simp_chin INTEGER,
        lg_trad_chin INTEGER,
        lg_fr INTEGER,
        lg_it INTEGER,
        lg_hung INTEGER,
        lg_kor INTEGER,
        lg_turk INTEGER,
        lg_arabic INTEGER,
        lg_polish INTEGER,
        lg_thai INTEGER,
        lg_viet INTEGER,
        ai_generated INTEGER,
        ai_content TEXT,
        tags TEXT,
        steam_x_handle TEXT
    )
    ''')
        conn.commit()
    except Exception:
        # Do not hand back (or leak) a connection whose schema setup failed.
        conn.close()
        raise
    return conn


In [11]:
def get_game_ids_to_process(cursor):
    """Return the ids of the games whose ``tags`` column is NULL or empty.

    Args:
        cursor: A DB-API cursor on a database containing ``steam_games``.

    Returns:
        A list of ``game_id`` values, one per matching row.
    """
    pending_query = """
    SELECT game_id FROM steam_games 
    WHERE tags IS NULL OR tags = ''
    """
    cursor.execute(pending_query)

    pending_ids = []
    for record in cursor.fetchall():
        # Each row is a 1-tuple holding only the game_id column.
        pending_ids.append(record[0])
    return pending_ids

In [12]:
# def insert_scrap_data(cursor: sqlite3.Cursor, scrap_data: dict):
#     # Extraction et transformation des données
    
#     ai_generated = scrap_data['ai_generated']
#     ai_content = scrap_data['ai_content']
#     tags = ','.join(scrap_data['tags']) 
#     steam_x_handle = scrap_data['x_handle']

    
#     # Insertion des données dans la base de données
#     cursor.execute('''
#     INSERT INTO steam_games (
#         ai_generated, ai_content, tags, steam_x_handle
#     ) VALUES (?, ?, ?, ?)
#     ''', (ai_generated, ai_content, tags, steam_x_handle
#     ))

#     return True

def update_scrap_data(cursor: sqlite3.Cursor, game_id: int, scrap_data: dict):
    """Write the scraped fields of one game into its ``steam_games`` row.

    Args:
        cursor: Cursor on the database holding ``steam_games``.
        game_id: Primary key of the row to update.
        scrap_data: Mapping with the keys ``'ai_generated'``, ``'ai_content'``,
            ``'tags'`` (an iterable of strings) and ``'x_handle'``.

    Returns:
        True if at least one row was updated, False otherwise.
    """
    update_sql = '''
    UPDATE steam_games
    SET ai_generated = ?,
        ai_content = ?,
        tags = ?,
        steam_x_handle = ?
    WHERE game_id = ?
    '''
    params = (
        scrap_data['ai_generated'],
        scrap_data['ai_content'],
        ','.join(scrap_data['tags']),  # tags are stored as one comma-separated string
        scrap_data['x_handle'],
        game_id,
    )
    cursor.execute(update_sql, params)

    affected_rows = cursor.rowcount
    return affected_rows > 0

In [13]:

def scrap_steam_page_info(app_id, timeout=15):
    """Scrape AI disclosure, tags and X handle from a Steam store page.

    Args:
        app_id: Steam application id; used to build the store page URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection blocks the whole run.

    Returns:
        A dict with the keys ``'ai_generated'`` (bool), ``'ai_content'``
        (str or None), ``'tags'`` (list of str) and ``'x_handle'`` (str
        starting with ``'@'``, or None). Returns None if the page could not
        be fetched or parsed; the error is logged.
    """
    url = f"https://store.steampowered.com/app/{app_id}/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Matches a link whose host is x.com (or a subdomain of it) and captures
    # the first path segment as the handle. The leading boundary prevents
    # false positives on hosts that merely end in "x.com" (xbox.com, ...).
    x_link_pattern = re.compile(r'(?:^|[/.\s])x\.com/+@?([A-Za-z0-9_]+)', re.IGNORECASE)

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # AI disclosure: the flag comes from a case-insensitive text search.
        ai_disclosure = soup.find(string=re.compile("AI GENERATED CONTENT DISCLOSURE", re.IGNORECASE))
        ai_generated = ai_disclosure is not None

        # Prefer the exact <h2> heading as the anchor for the description, but
        # fall back to the matched text node so that the flag and the content
        # never disagree (flag set while content is silently missing).
        ai_section = soup.find('h2', string='AI Generated Content Disclosure')
        ai_anchor = ai_section if ai_section is not None else ai_disclosure
        ai_content = None
        if ai_anchor is not None:
            ai_paragraph = ai_anchor.find_next('i')
            if ai_paragraph is not None:
                ai_content = ai_paragraph.text.strip()

        # Tags
        tag_elements = soup.find_all('a', class_='app_tag')
        tags = [tag.text.strip() for tag in tag_elements]

        # X (Twitter) link, if any
        twitter_link = soup.find(
            'a',
            class_="ttip",
            attrs={'data-tooltip-text': lambda x: bool(x and x_link_pattern.search(x))},
        )
        x_handle = None
        if twitter_link is not None:
            handle_match = x_link_pattern.search(twitter_link['data-tooltip-text'])
            if handle_match:
                # Only the handle itself: no trailing slash, deeper path
                # segments or query string.
                x_handle = '@' + handle_match.group(1)

        return {
            'ai_generated': ai_generated,
            'ai_content': ai_content,
            'tags': tags,
            'x_handle': x_handle
        }
    except Exception as e:
        # Best effort: a failure on one page must not stop the whole run.
        logging.error(f"Erreur lors du scraping pour le jeu {app_id}: {e}")
        return None

In [14]:
def main():
    """Scrape every game that has no tags yet and store the results.

    Games whose page yields no tags are added to the exclusion set so they are
    skipped on later runs. Database commits and exclusion-file saves happen
    every 500 handled games and once more at the end.

    If the run is interrupted (including by KeyboardInterrupt) or fails with an
    unexpected error, pending database updates are committed and the exclusion
    set is saved before the connection is closed, so work done since the last
    periodic commit is not lost.

    Returns:
        A tuple ``(total_games, processed_games, newly_excluded,
        total_excluded)`` on success, or None if an unexpected error occurred
        (details are written to the log).
    """
    logging.info("Début de l'exécution de main()")
    ultimate_conn = None
    excluded = None
    finalized = False  # True once the final commit + save have succeeded
    try:
        ultimate_conn = init_ultimate_database()
        cursor = ultimate_conn.cursor()

        try:
            excluded = load_excluded_ids()
        except Exception as e:
            logging.error(f"Erreur lors du chargement des IDs exclus : {str(e)}")
            excluded = set()  # Fall back to an empty set on error

        games_to_process = get_game_ids_to_process(cursor)
        games_to_process = [game_id for game_id in games_to_process if game_id not in excluded]

        total_games = len(games_to_process)
        processed_games = 0
        newly_excluded = 0
        save_counter = 0

        for steam_game_id in tqdm(games_to_process):
            scrap_data = scrap_steam_page_info(steam_game_id)

            if scrap_data:
                if not scrap_data['tags']:
                    excluded.add(steam_game_id)
                    newly_excluded += 1
                    save_counter += 1
                    logging.info(f"Aucun tag scrappé pour {steam_game_id}. Ajouté à la liste excluded.")
                else:
                    try:
                        updated = update_scrap_data(cursor, steam_game_id, scrap_data)
                        if updated:
                            processed_games += 1
                            save_counter += 1
                        else:
                            logging.warning(f"Aucune mise à jour effectuée pour le jeu avec Steam ID {steam_game_id}")
                    except Exception as e:
                        logging.error(f"Erreur lors de la mise à jour de l'id_data {steam_game_id} dans ultimate db: {str(e)}")
            else:
                logging.info(f"Impossible de scrapper les détails pour le jeu avec Steam ID {steam_game_id}")

            # Periodic commit and save
            if save_counter >= 500:
                ultimate_conn.commit()
                save_excluded_ids(excluded)
                logging.info(f"Commit et sauvegarde effectués après {processed_games + newly_excluded} jeux traités.")
                save_counter = 0

            # Random pause between requests
            time.sleep(random.uniform(0.7, 1.4))

        # Final commit and final save of the excluded ids
        ultimate_conn.commit()
        save_excluded_ids(excluded)
        finalized = True

        logging.info(f"\nRésumé : {processed_games} jeux mis à jour sur {total_games} jeux à traiter.")
        logging.info(f"{newly_excluded} nouveaux jeux exclus (sans tags).")
        logging.info(f"Total des jeux exclus : {len(excluded)}")

        return total_games, processed_games, newly_excluded, len(excluded)

    except Exception as e:
        logging.exception(f"Une erreur inattendue s'est produite dans main(): {str(e)}")
    finally:
        if ultimate_conn is not None:
            if not finalized:
                # sqlite3 discards uncommitted changes on close(), so persist
                # what was scraped since the last periodic commit.
                try:
                    ultimate_conn.commit()
                except Exception as e:
                    logging.error(f"Erreur lors du commit final : {str(e)}")
            ultimate_conn.close()
        # Save the excluded ids even on error / interruption
        if excluded is not None and not finalized:
            try:
                save_excluded_ids(excluded)
            except Exception as e:
                logging.error(f"Erreur lors de la sauvegarde des IDs exclus : {str(e)}")

    return None



In [15]:

def handle_excluded_game_ids(filename='excluded_game_ids.txt'):
    """Build the loader/saver pair for the excluded-game-ids file.

    Args:
        filename: Path of the text file holding one game id per line.

    Returns:
        A tuple ``(load_excluded_ids, save_excluded_ids)`` of functions bound
        to ``filename``.
    """
    def load_excluded_ids():
        """Return the set of ids stored in the file (empty if it does not exist).

        Blank lines are skipped; lines that are not integers are ignored with
        a warning in the log.
        """
        excluded = set()
        if not os.path.exists(filename):
            return excluded
        with open(filename, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:  # Skip blank lines
                    continue
                try:
                    excluded.add(int(line))
                except ValueError:
                    logging.warning(f"Valeur non valide ignorée dans {filename}: {line}")
        return excluded

    def save_excluded_ids(excluded_ids):
        """Replace the file content with ``excluded_ids``, one id per line.

        The ids are first written to a temporary sibling file which is then
        moved over the target with ``os.replace``. An interruption or error
        during the write therefore leaves the previous file intact instead of
        truncated.
        """
        tmp_filename = f"{filename}.tmp"
        try:
            with open(tmp_filename, 'w') as f:
                f.writelines(f"{game_id}\n" for game_id in excluded_ids)
            os.replace(tmp_filename, filename)
        except BaseException:
            # Also covers KeyboardInterrupt: drop the partial temp file, keep
            # the last good version, and let the caller see the error.
            try:
                os.remove(tmp_filename)
            except OSError:
                pass
            raise

    return load_excluded_ids, save_excluded_ids

# Module-level helpers bound to the default file name
load_excluded_ids, save_excluded_ids = handle_excluded_game_ids()

In [16]:

# Entry point: run the scraper and print a short summary on the console.
if __name__ == "__main__":
    try:
        result = main()
        if result is None:
            # main() reports failures by returning None; details are in the log.
            logging.error("La fonction main() a retourné None")
            print("Une erreur s'est produite lors de l'exécution. Veuillez consulter le fichier de log pour plus de détails.")
        else:
            total_games, processed_games, newly_excluded, total_excluded = result
            print(f"\nRésumé : {processed_games} jeux scrappés sur {total_games} jeux à scrapper.")
            print(f"Total des jeux exclus : {total_excluded}")

    except Exception as e:
        logging.exception(f"Une erreur s'est produite lors de l'exécution : {str(e)}")
        print(f"Une erreur s'est produite. Veuillez consulter le fichier de log pour plus de détails.")

 20%|█▉        | 32011/164128 [15:59:55<66:01:48,  1.80s/it] 


KeyboardInterrupt: 