In [None]:
import os
import pandas as pd
import kagglehub
from tqdm import tqdm
import numpy as np
import zipfile
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import torch
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import gc
from pathlib import Path
from collections import defaultdict

from data_loading.tools import reduce_mem_usage

In [None]:
# Root URL of the dataset's directory listing; the crawl below starts here.
BASE_URL = "http://malnet.cc.gatech.edu/image-data/"
# Local directory (relative to the current working directory) that receives
# the downloaded files.
SAVE_DIR = "data/MalNet_Dataset"

def ensure_directory(path):
    """Create ``path`` (including missing parents) if it does not exist yet.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True makes the call idempotent and avoids the race between a
    # separate "exists?" check and the actual creation.
    os.makedirs(path, exist_ok=True)

def get_links(url):
    """Return the absolute URLs of the entries linked from a directory page.

    Args:
        url: URL of an HTML directory-listing page.

    Returns:
        A list of absolute URLs, one per usable ``<a href>`` found on the
        page, in document order. The list is empty when the page cannot be
        fetched (network error or non-200 status); a message is printed in
        that case.
    """
    try:
        # A timeout keeps one unresponsive server from hanging the whole crawl.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to access {url}: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to access {url}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    links = []
    for link in soup.find_all("a"):
        href = link.get("href")
        # Skip anchors without a target and parent-directory links. Also skip
        # query-only / fragment-only hrefs: they resolve back to this same
        # page (not to an entry), so a recursive caller would loop on them.
        if not href or href in ("../", "/") or href.startswith(("?", "#")):
            continue
        links.append(urljoin(url, href))
    return links

def download_file(url, save_path):
    """Download ``url`` to ``save_path`` while showing a progress bar.

    The body is streamed into ``<save_path>.part`` and moved to ``save_path``
    only once the transfer has finished, so an interrupted or failed download
    never leaves a truncated file at ``save_path``.

    Args:
        url: URL of the file to fetch.
        save_path: Destination path; its parent directory must already exist.

    Returns:
        None. On a non-200 response or a network error a message is printed
        and nothing is written to ``save_path``.
    """
    part_path = f"{save_path}.part"
    try:
        # The context manager releases the connection even on early exit.
        with requests.get(url, stream=True, timeout=30) as response:
            if response.status_code != 200:
                # Do not store an HTTP error page as if it were the data file.
                print(f"Failed to download {url}")
                return

            # 0 (header missing) leaves the progress bar without a total.
            total_size = int(response.headers.get("content-length", 0))
            with open(part_path, "wb") as file, tqdm(
                desc=os.path.basename(save_path),
                total=total_size,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for data in response.iter_content(chunk_size=64 * 1024):
                    file.write(data)
                    bar.update(len(data))

        # Publish the finished file in a single step.
        os.replace(part_path, save_path)
    except requests.RequestException as exc:
        print(f"Failed to download {url}: {exc}")
    finally:
        # After a successful os.replace the partial file is gone; otherwise
        # (error, interrupt) discard whatever was written so far.
        if os.path.exists(part_path):
            os.remove(part_path)

def crawl_and_download(url, save_path):
    """Recursively mirror a directory-listing URL into a local directory.

    Every link whose path ends in ``/`` is treated as a subdirectory and
    crawled recursively (except directories with ``6GB`` in their path, which
    are excluded); every other link is treated as a file and downloaded unless
    a file of that name already exists locally.

    Args:
        url: URL of the directory listing to crawl. It should end with ``/``
            so that relative links on the page resolve beneath it.
        save_path: Local directory that receives the contents; created if
            missing.
    """
    ensure_directory(save_path)
    base = urlparse(url)

    for link in get_links(url):
        parsed = urlparse(link)

        # Only ever descend. A link that is not strictly below the current
        # directory (parent, self/sort link, other host) would take the crawl
        # outside the dataset or into an endless loop.
        is_descendant = (
            parsed.netloc == base.netloc
            and parsed.path.startswith(base.path)
            and parsed.path != base.path
        )
        if not is_descendant:
            continue

        if parsed.path.endswith("/"):  # Directory
            if "6GB" in parsed.path:  # Exclude the 6GB directory
                continue
            subdir_name = os.path.basename(os.path.normpath(parsed.path))
            crawl_and_download(link, os.path.join(save_path, subdir_name))
        else:  # File
            filename = os.path.basename(parsed.path)
            file_path = os.path.join(save_path, filename)
            if os.path.exists(file_path):  # Avoid re-downloading
                continue
            download_file(link, file_path)

        # Be polite: pause after each link that caused network traffic.
        # Skipped links cost the server nothing, so they do not wait.
        time.sleep(1)

# Start crawling and downloading: mirror everything reachable from BASE_URL
# into SAVE_DIR. This runs immediately when the cell/module is executed.
crawl_and_download(BASE_URL, SAVE_DIR)

# Reached once crawl_and_download has returned.
print("Download complete!")