### Deps

In [None]:
import random
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Extração dos links

In [None]:
def sleep_random(min_ms=10, max_ms=2000):
    """Pause for a random whole number of milliseconds.

    Args:
        min_ms: Lower bound of the delay, in milliseconds (inclusive).
        max_ms: Upper bound of the delay, in milliseconds (inclusive).
    """
    # randint includes both bounds; time.sleep expects seconds.
    delay_seconds = random.randint(min_ms, max_ms) / 1000
    time.sleep(delay_seconds)

In [None]:
def get_soup_nav(page, timeout=30) -> BeautifulSoup:
    """Download one page of the mcreator.net modification listing and parse it.

    Args:
        page: Page number substituted into the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole crawl.

    Returns:
        The parsed page as a ``BeautifulSoup`` object, or ``None`` when the
        request fails or the response status is not 200.
    """
    url = "https://mcreator.net/modifications?page={}#google_vignette".format(page)

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Network-level failures are reported like a bad status so the caller
        # sees a single "no page" signal (None) instead of an exception.
        print("Failed to retrieve page {}: {}".format(page, error))
        return None

    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')

    print("Failed to retrieve page {}: {}".format(page, response.status_code))
    return None

In [None]:
def get_links(soup: BeautifulSoup) -> list:
    """Return the ``href`` value of every ``<a>`` element in ``soup``.

    Anchors whose ``href`` is missing or empty are left out; the remaining
    values keep the order in which ``find_all`` yields them.
    """
    hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
    return [href for href in hrefs if href]

In [None]:
def get_all_links(max_pages=1) -> list:
    """Collect modification-page links from the paginated listing.

    Pages are fetched one at a time, starting at page 1, with a random pause
    before each request. Crawling ends when a page cannot be retrieved, when
    a page has no matching links, or when ``max_pages`` pages were processed.

    Args:
        max_pages: Maximum number of listing pages to process. The default of
            1 keeps the previous behaviour of stopping after the first page.
            Pass ``None`` to crawl until a page yields no matching links.

    Returns:
        A list of the ``href`` values that match ``modification/<digits>/``,
        in the order they were found.
    """
    # Compiled once: the pattern does not change between pages.
    pattern = re.compile(r'.*modification/\d+/.*', re.IGNORECASE)
    all_links = []
    page = 1

    while max_pages is None or page <= max_pages:
        sleep_random()

        soup = get_soup_nav(page)
        if soup is None:
            break

        links = get_links(soup)
        if not links:
            print("No links found on page {}".format(page))

        filtered_links = [link for link in links if pattern.match(link)]
        if not filtered_links:
            # A page without mod links marks the end of the listing; without
            # this break an uncapped crawl would never terminate.
            print("No more links found on page {}".format(page))
            break

        print("Found {} links on page {}".format(len(filtered_links), page))
        all_links.extend(filtered_links)
        page += 1
    else:
        # Reached only when the page cap, not a break, ended the loop.
        print("Stopping after {} pages.".format(max_pages))

    return all_links

### Extração dos meta-dados

In [None]:
def get_soup_mod(link, timeout=30) -> BeautifulSoup:
    """Download a single modification page and parse it.

    Args:
        link: The ``href`` of the modification page. Site-relative paths are
            resolved against ``https://mcreator.net``; an absolute URL is
            used as given.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole crawl.

    Returns:
        The parsed page as a ``BeautifulSoup`` object, or ``None`` when the
        request fails or the response status is not 200.
    """
    # urljoin copes with hrefs that lack a leading slash or are already
    # absolute, which plain string concatenation would turn into a bad URL.
    url = urljoin("https://mcreator.net", link)

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # One unreachable page should be skipped, not abort the whole run.
        print("Failed to retrieve mod page: {}, link {}".format(error, link))
        return None

    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')

    print("Failed to retrieve mod page: {}, link {}".format(response.status_code, link))
    return None

#### Funções de extração de atributos

In [None]:
def get_name(soup: BeautifulSoup) -> str:
    """Return the stripped text of the first ``<h1>`` element.

    Returns an empty string when the page has no ``<h1>``, matching the
    other attribute getters and the declared ``str`` return type.
    """
    h1_tag = soup.find('h1')
    if not h1_tag:
        return ''
    return h1_tag.text.strip()

In [None]:
def get_description(soup: BeautifulSoup) -> str:
    """Return the text of the first ``<div>`` whose class contains ``field--name-body``.

    The text is taken with ``get_text(strip=True)``. An empty string is
    returned when no such element exists.
    """
    def _is_body_class(css_class):
        return css_class and 'field--name-body' in css_class

    body = soup.find('div', class_=_is_body_class)
    if not body:
        return ''
    return body.get_text(strip=True)

In [None]:
def get_author_and_date_published(soup: BeautifulSoup) -> tuple:
    """Read the author link and publication timestamp from the first ``<footer>``.

    Returns:
        A ``(author, date_published)`` tuple. ``author`` is the ``href`` of
        the first ``<a>`` in the footer and ``date_published`` is the
        ``datetime`` attribute of the first ``<time>``. Each is an empty
        string when the element or attribute is absent, and both are empty
        when there is no footer.
    """
    def _attribute_or_empty(tag, attribute):
        # Missing tag and missing attribute collapse to the same result.
        if tag and tag.has_attr(attribute):
            return tag[attribute]
        return ''

    footer = soup.find('footer')
    if not footer:
        return ('', '')

    author_tag = footer.find('a')
    time_tag = footer.find('time')
    return (
        _attribute_or_empty(author_tag, 'href'),
        _attribute_or_empty(time_tag, 'datetime'),
    )

In [None]:
def get_date_updated_and_size(soup: BeautifulSoup) -> tuple:
    """Extract the upload date and file size of the first listed file.

    The values are read from the ``file-size`` span inside the first
    ``field--item`` of the ``field--name-field-modification-file`` section.
    The span text must contain both ``Uploaded on:`` and ``File size:``.

    Returns:
        An ``(upload_date, file_size)`` tuple of strings, both copied from
        the page text without conversion. Both are empty strings when the
        information cannot be found, so the size is always a ``str`` rather
        than a mix of ``str`` and ``int``.
    """
    updates_section = soup.find(
        'div', class_=lambda x: x and 'field--name-field-modification-file' in x
    )
    first_item = updates_section.find('div', class_='field--item') if updates_section else None
    file_size_tag = first_item.find('span', class_='file-size') if first_item else None
    if not file_size_tag:
        return ('', '')

    text = file_size_tag.text.strip()
    if 'Uploaded on:' not in text or 'File size:' not in text:
        return ('', '')

    # Text before the size label holds the date, text after it the size.
    parts = text.split('File size:')
    upload_date = parts[0].replace('Uploaded on:', '').strip()
    file_size = parts[1].strip()
    return (upload_date, file_size)

In [None]:
def get_downloads(soup: BeautifulSoup) -> int:
    """Return the download count shown on a mod page.

    The number is read from the first ``field--item`` inside the
    ``field--name-field-download-count`` section. ``0`` is returned when
    either element is missing or the text is not a plain integer.
    """
    wrapper = soup.find('div', class_=lambda x: x and 'field--name-field-download-count' in x)
    counter = wrapper.find('div', class_='field--item') if wrapper else None
    if not counter:
        return 0

    raw_count = counter.text.strip()
    try:
        return int(raw_count)
    except ValueError:
        return 0

In [None]:
def get_version(soup: BeautifulSoup) -> str:
    """Return the text of the Minecraft-version field of a mod page.

    The value is the stripped text of the first ``field--item`` inside the
    ``field--name-field-minecraft-version`` section, or an empty string
    when either element is missing.
    """
    section = soup.find('div', class_=lambda x: x and 'field--name-field-minecraft-version' in x)
    if not section:
        return ''

    item = section.find('div', class_='field--item')
    if not item:
        return ''
    return item.text.strip()


In [None]:
def get_category(soup: BeautifulSoup) -> str:
    """Return the category label for a mod page.

    The value is the constant ``'mod'``; ``soup`` is accepted but not read.
    """
    category = 'mod'
    return category


In [None]:
def get_tags(soup: BeautifulSoup) -> str:
    """Return the text of the first category link of a mod page.

    The link is the first ``<a>`` inside the first ``field--item`` of the
    ``field--name-field-category`` section. Only that first entry is read.

    Returns:
        The stripped link text, or an empty string when the section, the
        item or the link is missing. The result is always a ``str``, which
        is what the return annotation now declares.
    """
    category_div = soup.find('div', class_=lambda x: x and 'field--name-field-category' in x)
    if not category_div:
        return ''

    item_div = category_div.find('div', class_='field--item')
    if not item_div:
        return ''

    link = item_div.find('a')
    return link.text.strip() if link else ''

In [None]:
def get_modloader(soup: BeautifulSoup) -> str:
    """Return the text of the modification-type link of a mod page.

    The link is the first ``<a>`` inside the first ``field--item`` of the
    ``field--name-field-modification-type`` section. An empty string is
    returned when the section, the item or the link is missing.
    """
    section = soup.find('div', class_=lambda x: x and 'field--name-field-modification-type' in x)
    if not section:
        return ''

    item = section.find('div', class_='field--item')
    if not item:
        return ''

    anchor = item.find('a')
    if not anchor:
        return ''
    return anchor.text.strip()

In [None]:
def get_data_mod(soup: BeautifulSoup) -> dict:
    """Build the record for one mod page from its parsed HTML.

    Each attribute getter is called once on ``soup``. The fields ``views``,
    ``amount_updates``, ``dependencies`` and ``link`` are not extracted here
    and keep their placeholder values; ``source`` is always
    ``'mcreator.net'``.

    Returns:
        A dict with a fixed set of keys in a fixed order.
    """
    name = get_name(soup)
    description = get_description(soup)
    author, date_published = get_author_and_date_published(soup)
    date_updated, size = get_date_updated_and_size(soup)
    downloads = get_downloads(soup)
    version = get_version(soup)
    category = get_category(soup)
    tags = get_tags(soup)
    modloader = get_modloader(soup)

    # Key order is kept as-is; it becomes the column order downstream.
    return {
        'name': name,
        'description': description,
        'author': author,
        'date_published': date_published,
        'date_updated': date_updated,
        'downloads': downloads,
        'views': 0,
        'version': version,
        'category': category,
        'tags': tags,
        'modloader': modloader,
        'amount_updates': 0,
        'size': size,
        'dependencies': '',
        'link': '',
        'source': 'mcreator.net',
    }


In [None]:
def get_data_mcreator_net(links) -> pd.DataFrame:
    """Scrape every mod page in ``links`` and collect the records.

    A random pause of up to 500 ms precedes each request. Links whose page
    cannot be retrieved are skipped.

    Args:
        links: Iterable of mod-page links, as accepted by ``get_soup_mod``.

    Returns:
        A ``DataFrame`` with one row per successfully scraped page. It is
        empty when ``links`` is empty or no page could be retrieved.
    """
    mods = []
    for link in links:
        sleep_random(0, 500)

        soup = get_soup_mod(link)
        if soup is None:
            continue

        data = get_data_mod(soup)
        if data is None:
            continue

        # The record has a 'link' field that would otherwise stay empty;
        # fill it so each row can be traced back to the page it came from.
        if not data.get('link'):
            data['link'] = link
        mods.append(data)

    return pd.DataFrame(mods)

### Extração

In [None]:
# Crawl the listing and keep the links that get_all_links() returns.
links_mcreator_net = get_all_links()

In [None]:
# Fetch each collected link and gather the extracted fields into a DataFrame.
df = get_data_mcreator_net(links_mcreator_net)