### Deps

In [2]:
import re
import requests
import random
import time
import pandas as pd
import csv
import os
from bs4 import BeautifulSoup

In [3]:
start = True

### Extração dos links

In [4]:
def sleep_random(min_ms=10, max_ms=2000):
    tempo_ms = random.randint(min_ms, max_ms)
    time.sleep(tempo_ms / 1000)

In [5]:
def get_soup_nav(url: str, page: int) -> BeautifulSoup:
    # Monta o URL dinâmico
    full_url = "{}page/{}/".format(url, page)
    response = requests.get(full_url)

    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')
    
    print("Failed to retrieve page {}: {}".format(page, response.status_code))
    return None

In [6]:
def get_links(soup: BeautifulSoup) -> list:
    filtered_links = []
    for div in soup.find_all('div', class_='thumbindex'):
        a_tag = div.find('a', href=True)
        if a_tag and a_tag['href'].startswith('https://www.9minecraft.net/'):
            filtered_links.append(a_tag['href'])
    return filtered_links

In [7]:
def get_all_links(urls: list) -> list:
    all_links = []
    header_written = False
    
    for url in urls:
        page = 1
        
        while True:
            sleep_random()
            
            soup = get_soup_nav(url, page)
            if soup is None:
                break
            
            filtered_links = get_links(soup)
            
            if not filtered_links:
                print("No more links found on page {}".format(page))
                continue
            else:
                print("Found {} links on page {}".format(len(filtered_links), page))
            
            all_links.extend(filtered_links)

            #df = pd.DataFrame(filtered_links, columns=["Links"])
            #df.to_csv("9minecraft.csv", index=False, mode='a', header=not bool(page-1) and start and header_written)

            file_exists = os.path.exists("9minecraft.csv")

            # Definir o cabeçalho para a primeira execução, não repetindo nas subsequentes
            header = not file_exists and start
            
            # Escrever os links no CSV, mas apenas escrever o cabeçalho na primeira vez
            df = pd.DataFrame(filtered_links, columns=["Links"])
            df.to_csv("9minecraft.csv", index=False, mode='a', header=header and start)

            #header_written = True

            #if page == 2:
            #    print("Stopping after 2 pages.")
            #    break
            
            # Você pode configurar para parar depois de um número específico de páginas, se necessário.
            page += 1
    
    return all_links

### Extração dos meta-dados

In [8]:
def get_soup_mod(link) -> BeautifulSoup:
    url = link
    response = requests.get(url)
    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')
    
    print("Failed to retrieve mod page: {}, link {}".format(response.status_code, link))
    return None

In [9]:
def clean_line_breaks(text):
    if text:
        return text.replace('\n', ' ').replace('\r', ' ').strip()
    return ""

In [10]:
def get_headers(soup: BeautifulSoup) -> dict:
    h1_tag = soup.find('h1')
    return h1_tag.text.strip() if h1_tag else None

In [11]:
def get_description(soup: BeautifulSoup) -> dict:
    description = soup.find('div', class_='postContent')
    return clean_line_breaks(description.text.strip() if description else None)

In [12]:
def get_author(soup: BeautifulSoup) -> dict:
    external_author = soup.find('span', class_='post-author')
    if external_author:
        author_span = external_author.find('span')
        if author_span:
            return author_span.text.strip() or None
    return None

In [13]:
def get_date_published(soup: BeautifulSoup) -> dict:
    date_published = soup.find('span', class_='post-time')
    return date_published.text.strip() if date_published else None

In [14]:
def get_views(soup: BeautifulSoup) -> dict:
    views = soup.find('span',class_='post_view')
    return views.text.strip() if views else None

In [15]:
def get_version(name):
    version = re.search(r'\((.*?)\)', name.strip())
    # If found, store the version; otherwise, return None
    if version:
        version = version.group(1)
        return version.strip() if version else None
    return None

In [16]:
def get_category(soup: BeautifulSoup) -> dict:
    breadcrumb = soup.find('p', id='breadcrumbs')
    if breadcrumb:
        links = breadcrumb.find_all('a')
        if len(links) > 1:
            category = links[1].text.strip()
            return category if category else None
    return None

In [17]:
def get_tags(soup: BeautifulSoup) -> dict:
    tags_h3 = soup.find('h3', string='Tags:')
    if tags_h3:
        posttags_div = tags_h3.find_next('div', class_='posttags')
        if posttags_div:
            tags = [a.text for a in posttags_div.find_all('a')]
            return tags if tags else None
    return None

In [18]:
def get_modloader(soup: BeautifulSoup) -> dict:
    mod_type_span = soup.find('span', class_='post-mod-type')
    if mod_type_span:
        links = mod_type_span.find_all('a')
        mod_types = [a.text for a in links]
        return mod_types if mod_types else None
    return None

In [19]:
def get_amount_updates(soup: BeautifulSoup) -> dict:
    h2 = soup.find('h2', string=lambda text: text and 'Download Links' in text)
    stop_div = soup.find('div', class_='yasr-auto-insert-visitor')

    if not h2 or not stop_div:
        return None

    current = h2  # Coleta tudo entre o h2 e o div de parada
    h4_count = 0

    while current and current != stop_div:
        current = current.find_next()
        if current.name == 'h4':
            h4_count += 1

    return h4_count if h4_count else None

In [20]:
def get_dependencies(soup: BeautifulSoup) -> dict:
    requires = soup.find('h3', string='Requires:')
    if requires:
        blockquote = requires.find_next('blockquote')
        if blockquote:
            links = blockquote.find_all('a')
            dependencias = {link.text for link in links}
            return dependencias if dependencias else None
    return None

In [21]:
def get_link(soup: BeautifulSoup) -> dict:
    link_canonical = soup.find('link', rel='canonical')
    if link_canonical and 'href' in link_canonical.attrs:
        return link_canonical['href'].strip() or None
    return None

In [22]:
def get_data_mod(soup: BeautifulSoup) -> dict:
    data = {'name': '',
            'description': '',
            'author': '',
            'date_published': '',
            'date_updated': 0,
            'downloads': 0,
            'views': '',
            'version': '',
            'category': '',
            'tags': [],
            'modloader': '',
            'amount_updates': '',
            'size': 0,
            'dependencies': [],
            'link': [],
            'source':'9minecraft.net'}
    
    data['name'] = get_headers(soup)
    data['description'] = get_description(soup)
    data['author'] = get_author(soup)
    data['date_published'] = get_date_published(soup)
    data['views'] = get_views(soup)
    data['version'] = get_version(data['name'])
    data['category'] = get_category(soup)
    data['tags'] = get_tags(soup)
    data['modloader'] = get_modloader(soup)
    data['amount_updates'] = get_amount_updates(soup)
    data['dependencies'] = get_dependencies(soup)
    data['link'] = get_link(soup)

    print("Name: {}".format(data['name']))
    #print("Description: {}".format(data['description']))
    #print("Author: {}".format(data['author']))
    #print("Date Published: {}".format(data['date_published']))
    #print("Version: {}".format(data['version']))
    #print("Category: {}".format(data['category']))
    #print("Tags: {}".format(data['tags']))
    #print("Modloaders: {}".format(data['modloader']))
    #print("Amount Updates: {}".format(data['amount_updates']))
    #print("Dependencies: {}".format(data['dependencies']))
    #print("Link: {}".format(data['link']))
    return data

In [23]:
def write_csv(data: dict, header: bool = False):
    df = pd.DataFrame(data)
    df.to_csv('9minecraft_mods.csv', mode='a', index=False, header=header)
    print("Data written to CSV file.")

In [24]:
def get_data_9_minecraft(links):
    #links = get_all_links(urls)
    mods = []
    h = True
    
    for link in links:
        sleep_random(0, 500)
        soup = get_soup_mod(link)
        if soup is None:
            continue
        
        data = get_data_mod(soup)
        if data is None:
            continue
        mods.append(data)

        if len(mods) % 15 == 0:
            write_csv(mods, header=h and start)
            h = False
            mods = []
    
    if mods:
        write_csv(mods, header=h)
        mods = []
    print("All data written to CSV file.")
    
    #df = pd.DataFrame(mods)
    #return df

### Extração

In [25]:
urls = ['https://www.9minecraft.net/category/minecraft-mods/',
        'https://www.9minecraft.net/category/minecraft-modpacks/',
        'https://www.9minecraft.net/category/minecraft-resource-packs/',
        'https://www.9minecraft.net/tag/minecraft-shaders/',
        'https://www.9minecraft.net/category/minecraft-maps/',
        'https://www.9minecraft.net/category/minecraft-pe/',
        'https://www.9minecraft.net/category/minecraft-launchers/',
        'https://www.9minecraft.net/category/minecraft-clients/',
        'https://www.9minecraft.net/category/minecraft-data-packs/',
        'https://www.9minecraft.net/category/minecraft-tutorials/',
        'https://www.9minecraft.net/category/minecraft-seeds/',
        'https://www.9minecraft.net/category/minecraft-plugins/',
        'https://www.9minecraft.net/category/command-blocks/',
        'https://www.9minecraft.net/category/minecraft-skins/']

_ = get_all_links(urls)

Found 15 links on page 1
Found 15 links on page 2
Found 15 links on page 3
Found 15 links on page 4
Found 15 links on page 5
Found 15 links on page 6
Found 15 links on page 7
Found 15 links on page 8
Found 15 links on page 9
Found 15 links on page 10
Found 15 links on page 11
Found 15 links on page 12
Found 15 links on page 13
Found 15 links on page 14
Found 15 links on page 15
Found 15 links on page 16
Found 15 links on page 17
Found 15 links on page 18
Found 15 links on page 19
Found 15 links on page 20
Found 15 links on page 21
Found 15 links on page 22
Found 15 links on page 23
Found 15 links on page 24
Found 15 links on page 25
Found 15 links on page 26
Found 15 links on page 27
Found 15 links on page 28
Found 15 links on page 29
Found 15 links on page 30
Found 15 links on page 31
Found 15 links on page 32
Found 15 links on page 33
Found 15 links on page 34
Found 15 links on page 35
Found 15 links on page 36
Found 15 links on page 37
Found 15 links on page 38
Found 15 links on pag

In [26]:
df = pd.read_csv('9minecraft.csv')
links_9_minecraft = df['Links'].tolist()

In [27]:
get_data_9_minecraft(links_9_minecraft)

Name: Camera Obscura Mod (1.21.5, 1.20.1) – Take Photos of Your World
Name: Toms Mobs Mod (1.21.5, 1.20.1) – New Mobs, More Adventures
Name: Konkrete Mod (1.21.5, 1.20.1) – Library for Keksuccino’s Mods
Name: Item Counts Mod (1.21.5, 1.20.1) – Display Total Item Counts
Name: Trajectory Preview Mod (1.21.5, 1.20.1) – Never Miss A Shot Again!
Name: Filament Mod (1.21.5, 1.20.1) – Add New Items, Blocks, and Decorations
Name: Cull Clouds Mod (1.19.2, 1.18.2) – Optimizes Cloud Rendering for Improved FPS
Name: The God Mod (1.20.1, 1.19.4) – A Powerful and Observing Entity in The Sky
Name: Just Zoom Mod (1.21.5, 1.20.1) – Enhance Zoom with Advanced Customization
Name: G4mespeed Mod (1.21.5, 1.20.1) – Enhanced Reliability, Enhanced Performance
Name: Useful Hats Mod (1.21.5, 1.20.1) – Utiltity Headwears
Name: Clear Water Mod (1.21.5, 1.20.1) – Removes The Fog Underwater
Name: UUID Command Mod (1.21.5, 1.20.6) – Compact Your Redstone Circuitry
Name: Unoriginal Mod (1.21.3, 1.20.6) – Removes All 

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))