### Importar librerias

In [35]:
# Standard library imports
import time

# Third-party imports
import pandas as pd

# Selenium imports
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# WebDriver manager
from webdriver_manager.chrome import ChromeDriverManager


### Test scrapper

In [36]:
def test_scrape_yogonet():
    options = Options()
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument("--start-maximized")
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
    
    driver = webdriver.Chrome(options=options)
    
    try:
        # Go directly to the latest news page
        print("Accessing latest news page...")
        driver.get("https://www.yogonet.com/international/latest-news/")
        
        # Wait for the news container to load
        wait = WebDriverWait(driver, 10)
        news_container = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, "contenedor_modulo"))
        )
        
        # Find all article elements
        articles = driver.find_elements(By.CSS_SELECTOR, "div.contenedor_dato_modulo")
        
        print(f"\nFound {len(articles)} articles")
        if articles:
            # Print details of first article for debugging
            first_article = articles[0]
            print("\nFirst article details:")
            print(first_article.text)
            
            # Check for specific elements within articles
            for article in articles[:3]:  # Look at first 3 articles
                try:
                    title = article.find_element(By.CSS_SELECTOR, "h2.titulo").text
                    print(f"\nArticle title: {title}")
                except:
                    print("\nCouldn't find title element")
                    
        return articles
        
    finally:
        driver.quit()

# Execute the test
articles = test_scrape_yogonet()

Accessing latest news page...

Found 12 articles

First article details:
Soft2Bet wins Best iGaming Solutions provider category in Romania

Article title: Soft2Bet wins Best iGaming Solutions provider category in Romania

Article title: Skilrock-sponsored LatAm gaming industry webinar successfully concludes

Article title: DRGT gears up for ICE 2025 Barcelona with range of player-centric solutions


In [40]:
articles_df = pd.DataFrame(articles)
articles_df.head()

Unnamed: 0,0
0,<selenium.webdriver.remote.webelement.WebEleme...
1,<selenium.webdriver.remote.webelement.WebEleme...
2,<selenium.webdriver.remote.webelement.WebEleme...
3,<selenium.webdriver.remote.webelement.WebEleme...
4,<selenium.webdriver.remote.webelement.WebEleme...


### Other version that inserts into pandas

In [41]:
import pandas as pd
from datetime import datetime

def test_scrape_yogonet():
    options = Options()
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument("--start-maximized")
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
    
    driver = webdriver.Chrome(options=options)
    
    try:
        print("Accessing latest news page...")
        driver.get("https://www.yogonet.com/international/latest-news/")
        
        wait = WebDriverWait(driver, 10)
        news_container = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, "contenedor_modulo"))
        )
        
        articles = driver.find_elements(By.CSS_SELECTOR, "div.contenedor_dato_modulo")
        print(f"\nFound {len(articles)} articles")
        
        # Create lists to store article data
        article_data = []
        
        for article in articles:
            try:
                # Extract article information
                title = article.find_element(By.CSS_SELECTOR, "h2.titulo a").text
                link = article.find_element(By.CSS_SELECTOR, "h2.titulo a").get_attribute("href")
                
                try:
                    category = article.find_element(By.CSS_SELECTOR, "div.volanta").text
                except:
                    category = "N/A"
                    
                try:
                    image = article.find_element(By.CSS_SELECTOR, "img").get_attribute("src")
                except:
                    image = "N/A"
                
                # Add data to list
                article_data.append({
                    'title': title,
                    'category': category,
                    'link': link,
                    'image': image,
                    'scraped_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })
                
            except Exception as e:
                print(f"Error processing article: {str(e)}")
                continue
        
        # Create DataFrame
        df = pd.DataFrame(article_data)
        print("\nDataFrame created successfully!")
        print("\nFirst few rows of the DataFrame:")
        print(df.head())
        
        return df
        
    finally:
        driver.quit()

# Execute the test and store the DataFrame
df = test_scrape_yogonet()

# Optional: Save to CSV
# df.to_csv('yogonet_articles.csv', index=False)

Accessing latest news page...

Found 12 articles

DataFrame created successfully!

First few rows of the DataFrame:
                                               title category  \
0                                                         N/A   
1  FanDuel Casino sponsors free tolls and rides i...      N/A   
2  Spectrum Gaming Group unveils list of top 10 U...      N/A   
3  Soft2Bet wins Best iGaming Solutions provider ...      N/A   
4  Caesars Virginia sets December 17 for grand op...      N/A   

                                                link  \
0  https://www.yogonet.com/international/news/202...   
1  https://www.yogonet.com/international/news/202...   
2  https://www.yogonet.com/international/news/202...   
3  https://www.yogonet.com/international/news/202...   
4  https://www.yogonet.com/international/news/202...   

                                               image         scraped_date  
0  https://imagenesyogonet.b-cdn.net/data/imagene...  2024-12-09 16:36:49  
1  h

In [42]:
df.head()

Unnamed: 0,title,category,link,image,scraped_date
0,,,https://www.yogonet.com/international/news/202...,https://imagenesyogonet.b-cdn.net/data/imagene...,2024-12-09 16:36:49
1,FanDuel Casino sponsors free tolls and rides i...,,https://www.yogonet.com/international/news/202...,https://imagenesyogonet.b-cdn.net/data/imagene...,2024-12-09 16:36:49
2,Spectrum Gaming Group unveils list of top 10 U...,,https://www.yogonet.com/international/news/202...,https://imagenesyogonet.b-cdn.net/data/imagene...,2024-12-09 16:36:49
3,Soft2Bet wins Best iGaming Solutions provider ...,,https://www.yogonet.com/international/news/202...,https://imagenesyogonet.b-cdn.net/data/imagene...,2024-12-09 16:36:49
4,Caesars Virginia sets December 17 for grand op...,,https://www.yogonet.com/international/news/202...,https://imagenesyogonet.b-cdn.net/data/imagene...,2024-12-09 16:36:49


### Test procesamiento datos

In [43]:
def test_process_data(scraped_data):
    """
    Test version of process_scraped_data() with data validation
    """
    print("\nTesting data processing...")
    
    # Create DataFrame
    #df = pd.DataFrame(scraped_data)
    
    # Print basic information
    print("\nDataFrame Info:")
    print(df.info())
    
    print("\nNull Values Check:")
    print(df.isnull().sum())
    
    # Add processing columns
    df["word_count"] = df["title"].apply(lambda x: len(x.split()))
    df["char_count"] = df["title"].apply(len)
    df["capital_words"] = df["title"].apply(lambda x: [word for word in x.split() if word.istitle()])
    
    # Print sample statistics
    print("\nSample Statistics:")
    print(f"Average word count: {df['word_count'].mean():.2f}")
    print(f"Average character count: {df['char_count'].mean():.2f}")
    
    return df

### Celdas para ejecutar pruebas

In [48]:
scraped_data = test_scrape_yogonet()
print(f"\nTotal articles scraped: {len(scraped_data)}")
print(scraped_data.head())


Accessing latest news page...

Found 12 articles

DataFrame created successfully!

First few rows of the DataFrame:
  title category                                               link  \
0            N/A  https://www.yogonet.com/international/news/202...   
1            N/A  https://www.yogonet.com/international/news/202...   
2            N/A  https://www.yogonet.com/international/news/202...   
3            N/A  https://www.yogonet.com/international/news/202...   
4            N/A  https://www.yogonet.com/international/news/202...   

                                               image         scraped_date  
0  https://imagenesyogonet.b-cdn.net/data/imagene...  2024-12-09 16:40:46  
1  https://imagenesyogonet.b-cdn.net/data/imagene...  2024-12-09 16:40:46  
2  https://imagenesyogonet.b-cdn.net/data/imagene...  2024-12-09 16:40:46  
3  https://imagenesyogonet.b-cdn.net/data/imagene...  2024-12-09 16:40:47  
4  https://imagenesyogonet.b-cdn.net/data/imagene...  2024-12-09 16:40:47  



In [49]:

processed_df = test_process_data(scraped_data)
print("\nFirst few rows of processed data:")
display(processed_df.head())


Testing data processing...

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   title          12 non-null     object
 1   category       12 non-null     object
 2   link           12 non-null     object
 3   image          12 non-null     object
 4   scraped_date   12 non-null     object
 5   word_count     12 non-null     int64 
 6   char_count     12 non-null     int64 
 7   capital_words  12 non-null     object
dtypes: int64(2), object(6)
memory usage: 900.0+ bytes
None

Null Values Check:
title            0
category         0
link             0
image            0
scraped_date     0
word_count       0
char_count       0
capital_words    0
dtype: int64

Sample Statistics:
Average word count: 12.92
Average character count: 85.75

First few rows of processed data:


Unnamed: 0,title,category,link,image,scraped_date,word_count,char_count,capital_words
0,,,https://www.yogonet.com/international/news/202...,https://imagenesyogonet.b-cdn.net/data/imagene...,2024-12-09 16:36:49,0,0,[]
1,FanDuel Casino sponsors free tolls and rides i...,,https://www.yogonet.com/international/news/202...,https://imagenesyogonet.b-cdn.net/data/imagene...,2024-12-09 16:36:49,12,75,"[Casino, New, Jersey, Pennsylvania]"
2,Spectrum Gaming Group unveils list of top 10 U...,,https://www.yogonet.com/international/news/202...,https://imagenesyogonet.b-cdn.net/data/imagene...,2024-12-09 16:36:49,17,88,"[Spectrum, Gaming, Group]"
3,Soft2Bet wins Best iGaming Solutions provider ...,,https://www.yogonet.com/international/news/202...,https://imagenesyogonet.b-cdn.net/data/imagene...,2024-12-09 16:36:49,9,65,"[Soft2Bet, Best, Solutions, Romania]"
4,Caesars Virginia sets December 17 for grand op...,,https://www.yogonet.com/international/news/202...,https://imagenesyogonet.b-cdn.net/data/imagene...,2024-12-09 16:36:49,15,101,"[Caesars, Virginia, December, Dennis, Rodman]"
