### Deps

In [None]:
import csv
import random
import re
import time
from typing import Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Extração dos links

In [None]:
def sleep_random(min_ms=10, max_ms=2000):
    """Pause for a random whole number of milliseconds.

    Args:
        min_ms: Shortest possible pause, in milliseconds (inclusive).
        max_ms: Longest possible pause, in milliseconds (inclusive).
    """
    # time.sleep expects seconds, so convert the millisecond draw.
    delay_seconds = random.randint(min_ms, max_ms) / 1000
    time.sleep(delay_seconds)

In [None]:
def get_soup_nav(page, timeout=10) -> Optional[BeautifulSoup]:
    """Download and parse one page of the 9minecraft mod category listing.

    Args:
        page: Page number substituted into the listing URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed page, or None when the request fails or the server
        answers with a status code other than 200.
    """
    url = "https://www.9minecraft.net/category/minecraft-mods/page/{}/".format(page)
    try:
        # Without a timeout a stalled connection would hang the crawl forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        print("Failed to retrieve page {}: {}".format(page, error))
        return None

    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')

    print("Failed to retrieve page {}: {}".format(page, response.status_code))
    return None

In [None]:
def get_links(soup: BeautifulSoup) -> list:
    """Collect the post links shown on a parsed listing page.

    For every <div class="thumbindex"> the first <a> carrying an href is
    inspected; its href is kept only when it points at 9minecraft.net.

    Args:
        soup: Parsed listing page.

    Returns:
        The matching hrefs, in document order.
    """
    site_prefix = 'https://www.9minecraft.net/'
    first_anchors = (
        block.find('a', href=True)
        for block in soup.find_all('div', class_='thumbindex')
    )
    return [
        anchor['href']
        for anchor in first_anchors
        if anchor and anchor['href'].startswith(site_prefix)
    ]

In [None]:
def get_all_links(max_pages=2) -> list:
    """Walk the listing pages and gather every mod link found on them.

    Crawling stops at the first page that cannot be fetched, at the first
    page that yields no links, or once ``max_pages`` pages have been read.

    Args:
        max_pages: Maximum number of listing pages to read. Pass None to
            keep going until a page fails or comes back empty.

    Returns:
        All collected links, in the order they were found.
    """
    page = 1
    all_links = []

    while True:
        # Random pause before each request to avoid hammering the site.
        sleep_random()

        soup = get_soup_nav(page)
        if soup is None:
            break

        filtered_links = get_links(soup)

        if not filtered_links:
            # An empty page marks the end of the listing. Stopping here is
            # required: retrying without advancing would loop forever.
            print("No more links found on page {}".format(page))
            break

        print("Found {} links on page {}".format(len(filtered_links), page))
        all_links.extend(filtered_links)

        if max_pages is not None and page >= max_pages:
            print("Stopping after {} pages.".format(max_pages))
            break

        page += 1
    return all_links

### Extração dos meta-dados

In [None]:
def get_soup_mod(link, timeout=10) -> Optional[BeautifulSoup]:
    """Download and parse a single mod page.

    Args:
        link: URL of the mod page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed page, or None when the request fails or the server
        answers with a status code other than 200.
    """
    try:
        # Without a timeout a stalled connection would hang the crawl forever.
        response = requests.get(link, timeout=timeout)
    except requests.RequestException as error:
        print("Failed to retrieve mod page: {}, link {}".format(error, link))
        return None

    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')

    print("Failed to retrieve mod page: {}, link {}".format(response.status_code, link))
    return None

In [None]:
def clean_line_breaks(text):
    """Flatten text onto one line.

    Every newline and carriage return becomes a single space, then leading
    and trailing whitespace is removed.

    Args:
        text: The text to flatten; may be None or empty.

    Returns:
        The flattened text, or "" when ``text`` is falsy.
    """
    if not text:
        return ""
    flattened = text
    for line_break in ('\n', '\r'):
        flattened = flattened.replace(line_break, ' ')
    return flattened.strip()

In [None]:
def get_headers(soup: BeautifulSoup) -> Optional[str]:
    """Return the stripped text of the page's first <h1>.

    Args:
        soup: Parsed mod page.

    Returns:
        The heading text, or None when the page has no <h1>.
    """
    heading = soup.find('h1')
    if heading is None:
        return None
    return heading.text.strip()

In [None]:
def get_description(soup: BeautifulSoup) -> str:
    """Return the text of <div class="postContent"> flattened to one line.

    Args:
        soup: Parsed mod page.

    Returns:
        The text with line breaks replaced by spaces, or "" when the
        container is missing (clean_line_breaks maps None to "").
    """
    content = soup.find('div', class_='postContent')
    raw_text = content.text.strip() if content else None
    return clean_line_breaks(raw_text)

In [None]:
def get_author(soup: BeautifulSoup) -> Optional[str]:
    """Return the author name found inside <span class="post-author">.

    Args:
        soup: Parsed mod page.

    Returns:
        The stripped text of the first <span> nested in the author wrapper,
        or None when the wrapper or the nested span is missing or the text
        is empty.
    """
    wrapper = soup.find('span', class_='post-author')
    if wrapper is None:
        return None

    # The wrapper may exist without the nested span; guard against that
    # instead of failing on `.text` of None.
    inner = wrapper.find('span')
    if inner is None:
        return None

    return inner.text.strip() or None

In [None]:
def get_date_published(soup: BeautifulSoup) -> Optional[str]:
    """Return the stripped text of <span class="post-time">.

    Args:
        soup: Parsed mod page.

    Returns:
        The text exactly as shown on the page (not parsed into a date),
        or None when the element is missing.
    """
    time_span = soup.find('span', class_='post-time')
    if time_span is None:
        return None
    return time_span.text.strip()

In [None]:
def get_views(soup: BeautifulSoup) -> Optional[str]:
    """Return the stripped text of <span class="post_view">.

    Args:
        soup: Parsed mod page.

    Returns:
        The text exactly as shown on the page (not converted to a number),
        or None when the element is missing.
    """
    view_span = soup.find('span', class_='post_view')
    if view_span is None:
        return None
    return view_span.text.strip()

In [None]:
def get_version(name):
    """Extract the text inside the first pair of parentheses in ``name``.

    Args:
        name: Mod title, e.g. "Some Mod (1.20.1, 1.19.4)". May be None or
            empty when the page had no title.

    Returns:
        The stripped parenthesised text, or None when ``name`` is falsy,
        contains no parentheses, or the parentheses hold only whitespace.
    """
    # The title comes from get_headers, which yields None for pages
    # without an <h1>; treat that as "no version" rather than crashing.
    if not name:
        return None

    match = re.search(r'\((.*?)\)', name)
    if match is None:
        return None

    # Strip before the emptiness test so "()" and "( )" both give None.
    return match.group(1).strip() or None

In [None]:
def get_category(soup: BeautifulSoup) -> Optional[str]:
    """Return the text of the second link in the <p id="breadcrumbs"> trail.

    Args:
        soup: Parsed mod page.

    Returns:
        The stripped link text, or None when the breadcrumb is missing,
        holds fewer than two links, or the text is empty.
    """
    breadcrumb = soup.find('p', id='breadcrumbs')
    if breadcrumb is None:
        return None

    crumbs = breadcrumb.find_all('a')
    # Index 1 is the entry read as the category; a shorter trail has none.
    if len(crumbs) < 2:
        return None

    return crumbs[1].text.strip() or None

In [None]:
def get_tags(soup: BeautifulSoup) -> Optional[list]:
    """Return the tag names listed after the "Tags:" heading.

    Looks for an <h3> whose text is exactly "Tags:", then reads every link
    inside the next <div class="posttags">.

    Args:
        soup: Parsed mod page.

    Returns:
        The link texts in document order, or None when the heading or the
        tag container is missing or no links are found.
    """
    tags_heading = soup.find('h3', string='Tags:')
    if tags_heading is None:
        return None

    tags_container = tags_heading.find_next('div', class_='posttags')
    if tags_container is None:
        return None

    tags = [anchor.text for anchor in tags_container.find_all('a')]
    return tags or None

In [None]:
def get_modloader(soup: BeautifulSoup) -> Optional[list]:
    """Return the link texts found in <span class="post-mod-type">.

    Args:
        soup: Parsed mod page.

    Returns:
        The texts of every link inside the span, in document order, or
        None when the span is missing or contains no links.
    """
    mod_type_span = soup.find('span', class_='post-mod-type')
    if mod_type_span is None:
        return None

    mod_types = [anchor.text for anchor in mod_type_span.find_all('a')]
    return mod_types or None

In [None]:
def get_amount_updates(soup: BeautifulSoup) -> Optional[int]:
    """Count the <h4> elements in the "Download Links" section.

    The section starts at the first <h2> whose text contains
    "Download Links" and ends at <div class="yasr-auto-insert-visitor">,
    or at the end of the document when that div is never reached.

    Args:
        soup: Parsed mod page.

    Returns:
        The number of <h4> elements found, or None when the heading is
        missing or the count is zero.
    """
    h2 = soup.find('h2', string=lambda text: text and 'Download Links' in text)
    # Element that marks the end of the section.
    stop_div = soup.find('div', class_='yasr-auto-insert-visitor')

    # Walk every element between the heading and the stop div.
    current = h2
    h4_count = 0

    while current and current != stop_div:
        current = current.find_next()
        if current is None:
            # Ran off the end of the document without meeting the stop
            # div; find_next() has nothing left to return.
            break
        if current.name == 'h4':
            h4_count += 1
    return h4_count if h4_count else None

In [None]:
def get_dependencies(soup: BeautifulSoup) -> Optional[set]:
    """Return the names linked under the "Requires:" heading.

    Looks for an <h3> whose text is exactly "Requires:", then reads every
    link inside the next <blockquote>.

    Args:
        soup: Parsed mod page.

    Returns:
        A set of the link texts (duplicates collapsed, order not kept), or
        None when the heading or blockquote is missing or no links exist.
    """
    requires_heading = soup.find('h3', string='Requires:')
    if requires_heading is None:
        return None

    blockquote = requires_heading.find_next('blockquote')
    if blockquote is None:
        return None

    dependencias = {anchor.text for anchor in blockquote.find_all('a')}
    return dependencias or None

In [None]:
def get_link(soup: BeautifulSoup) -> Optional[str]:
    """Return the page's canonical URL from <link rel="canonical">.

    Args:
        soup: Parsed mod page.

    Returns:
        The stripped href, or None when the element is missing, has no
        href attribute, or the href is empty.
    """
    canonical = soup.find('link', rel='canonical')
    if canonical is None:
        return None

    # .get avoids a KeyError when the tag carries no href attribute.
    href = canonical.get('href') or ""
    return href.strip() or None

In [None]:
def get_data_mod(soup: BeautifulSoup) -> dict:
    """Assemble the metadata record for one parsed mod page.

    Args:
        soup: Parsed mod page.

    Returns:
        A dict with one entry per output column. 'date_updated',
        'downloads' and 'size' are not scraped here and are always 0;
        'source' is always '9minecraft.net'. Every other value comes from
        the matching get_* helper.
    """
    # The version is parsed out of the title, so resolve the title first.
    name = get_headers(soup)

    # Key order is kept as-is because it becomes the column order of the
    # DataFrame built from these records.
    return {
        'name': name,
        'description': get_description(soup),
        'author': get_author(soup),
        'date_published': get_date_published(soup),
        'date_updated': 0,
        'downloads': 0,
        'views': get_views(soup),
        'version': get_version(name),
        'category': get_category(soup),
        'tags': get_tags(soup),
        'modloader': get_modloader(soup),
        'amount_updates': get_amount_updates(soup),
        'size': 0,
        'dependencies': get_dependencies(soup),
        'link': get_link(soup),
        'source': '9minecraft.net',
    }

In [None]:
def get_data_9_minecraft() -> pd.DataFrame:
    """Scrape every discovered mod page into a DataFrame.

    Links come from get_all_links. Each page is fetched after a short
    random pause; pages that cannot be fetched or parsed are reported
    and skipped so one bad page does not discard the rows already
    collected.

    Returns:
        A DataFrame with one row per successfully scraped mod.
    """
    mods = []
    for link in get_all_links():
        sleep_random(0, 500)
        soup = get_soup_mod(link)
        if soup is None:
            continue

        try:
            data = get_data_mod(soup)
        except (AttributeError, IndexError, KeyError, TypeError) as error:
            # These are the errors raised when a page lacks an element
            # the extractors expect; skip the page and keep crawling.
            print("Failed to parse mod page {}: {!r}".format(link, error))
            continue

        mods.append(data)

    return pd.DataFrame(mods)

In [None]:
# Run the full scrape and keep the result available as `df`.
df = get_data_9_minecraft()

# Persist the scraped table as UTF-8 CSV without the DataFrame index column.
df.to_csv(
    path_or_buf='extract_9minecraft.csv',
    index=False,
    encoding='utf-8',
)