In [None]:
import ftplib
import numpy as np
import os
from pathlib import Path
from tqdm import tqdm
import time
import json
from huggingface_hub import HfApi, HfFolder, Repository
from io import BytesIO
from tqdm import tqdm

This notebook is used to download the files from the FTP server and upload them to Hugging Face. It was run across multiple sessions for faster downloading and uploading. The uploaded repo is later downloaded to Kaggle as well.

In [None]:
%%capture
# Install python-dotenv; %%capture hides the installer output for this cell.
!pip install python-dotenv

In [None]:
from dotenv import load_dotenv
import os

# Load environment variables (e.g. HF_API_KEY, read in the next cell) from the
# .env file under the Kaggle input directory.
load_dotenv("/kaggle/input/codefiles/.env")

In [None]:
# Hugging Face access token, taken from the environment (None when unset).
HF_API_KEY = os.getenv("HF_API_KEY")
# Show only whether the token is present: cell outputs are stored in the notebook
# file, so echoing the value itself would leak the secret.
"HF_API_KEY loaded" if HF_API_KEY else "HF_API_KEY is NOT set"

In [None]:
# Load the training annotations. get_organ reads the "id" and "report" keys of
# each entry. JSON is read explicitly as UTF-8 so the result does not depend on
# the platform's default text encoding.
with open("/kaggle/input/codefiles/train.json", "r", encoding="utf-8") as f:
    data = json.load(f)

def get_organ(filename):
    """Return the organ named in the report of the first entry matching ``filename``.

    The module-level ``data`` list is scanned in order; an entry matches when
    ``filename`` is a substring of its ``"id"``. The organ is the text before the
    first comma of that entry's ``"report"``.

    Returns:
        The organ string, or ``"No organ"`` when no entry matches or the
        extracted organ is empty.
    """
    for entry in data:
        if filename not in entry["id"]:
            continue
        # Only the first matching entry is ever considered.
        first_field = entry["report"].split(",")[0]
        return first_field if first_field else "No organ"

    return "No organ"



In [None]:

print("🚀 REG2025 Dataset Downloader")
print("="*70)

# FTP mirrors, tried in order until one accepts the login.
SERVERS = ["211.54.28.103", "211.54.28.104", "211.54.28.105"]
# NOTE(review): credentials belong in the environment/.env, not in the notebook.
# FTP_USERNAME / FTP_PASSWORD override the values below when set; the literals
# remain only as fallbacks so existing sessions keep working.
USERNAME = os.getenv("FTP_USERNAME", "ftpuser")
PASSWORD = os.getenv("FTP_PASSWORD", "ftp12#$User")

DOWNLOAD_DIR = "/kaggle/working/files"

ftp = None
for server in SERVERS:
    # Build the client under a temporary name: `ftp` must stay None unless the
    # login succeeds, otherwise the failure check below could never trigger.
    candidate = ftplib.FTP()
    try:
        candidate.connect(server, 21)
        candidate.login(USERNAME, PASSWORD)
    except Exception as e:
        print(f"❌ Failed to connect to {server}: {e}")
        candidate.close()  # release any half-open socket before trying the next mirror
        continue
    ftp = candidate
    print(f"✅ Connected to {server}")
    break

if ftp is None:
    print("❌ Could not connect to any FTP server")
    raise Exception("FTP connection failed")


In [None]:

# Cell 5: Get All TIFF Files Information
print("\n" + "="*70)
print("📊 ANALYZING TIFF FILE SIZES")
print("="*70)

# One dict per TIFF found on the server: filename plus size in bytes, MB and GB.
files_info = []


def parse_tiff_line(line):
    """Callback for ``ftp.retrlines('LIST', ...)``: record one TIFF entry.

    Appends a dict with ``filename``, ``size_bytes``, ``size_mb`` and ``size_gb``
    to the module-level ``files_info`` when ``line`` describes a ``.tif``/``.tiff``
    file. Lines with fewer than nine whitespace-separated fields and other file
    types are ignored; an unparsable size is reported and skipped.

    Assumes the listing has eight columns before the name, with the size in the
    fifth column (Unix ``ls -l`` style) — confirm against the server's output.
    """
    # Split into at most nine fields so a file name containing spaces stays whole
    # instead of being reduced to its last word.
    parts = line.split(None, 8)
    if len(parts) < 9:
        return

    filename = parts[8].strip()
    if not filename.lower().endswith(('.tif', '.tiff')):
        return

    try:
        size_bytes = int(parts[4])
    except ValueError:
        print(f"⚠️  Could not parse size for {filename}")
        return

    files_info.append({
        'filename': filename,
        'size_bytes': size_bytes,
        'size_mb': size_bytes / (1024 * 1024),
        'size_gb': size_bytes / (1024 * 1024 * 1024)
    })


try:
    ftp.cwd('/REG_train')
    print("🔍 Scanning /REG_train directory...")

    print("📋 Fetching file list...")
    ftp.retrlines('LIST', parse_tiff_line)

    print(f"✅ Found {len(files_info)} TIFF files")

    # Sort by size (increasing order)
    files_info.sort(key=lambda x: x['size_gb'])

except Exception as e:
    # Best effort: report the failure and continue with an empty listing.
    print(f"❌ Error getting file info: {e}")
    files_info = []


In [None]:
# Maximum number of slides to keep for each organ.
thresholds = {
    **dict.fromkeys(["Breast", "Lung", "Prostate", "Urinary bladder", "Stomach"], 850),
    "Uterine cervix": 600,
    "Colon": 600,
    "Rectum": 200,
}

# Slides selected so far per organ, and the selected file-info dicts.
target = dict.fromkeys(thresholds, 0)
files_needed = []

# files_info is walked in its current order, so whichever entries come first
# for an organ fill that organ's quota.
for file in files_info:
    organ = get_organ(file["filename"])
    quota = thresholds.get(organ)

    # Skip organs without a quota and organs whose quota is already filled.
    if quota is None or target[organ] >= quota:
        continue

    files_needed.append(file)
    target[organ] += 1



In [None]:
# Sanity check: per-organ caps, per-organ counts selected, number of selected
# files, and the totals of caps and counts.
thresholds , target , len(files_needed) , sum(thresholds.values()) , sum(target.values())

In [None]:
# Names of the selected slides, in the same order as files_needed.
file_names = [selected["filename"] for selected in files_needed]
len(file_names)

In [None]:
# Running totals of the selected files' sizes.
mb = 0
gb = 0

# Per-file size in GB, in selection order.
l = []
for i, x in enumerate(files_needed):
    l.append(x["size_gb"])
    mb += x["size_mb"]
    gb += x["size_gb"]

mb , gb

In [None]:
# Recorded notes: one sum per consecutive 500-element slice (0-499, 500-999, ...).
# NOTE(review): presumably the GB totals of `l` per slice, pasted from an earlier
# run — confirm. The string is not read by any other cell shown here.
n = """ 

Sum of elements 0 to 499: 7.7228741915896535
Sum of elements 500 to 999: 12.840961865149438
Sum of elements 1000 to 1499: 22.48379500117153
Sum of elements 1500 to 1999: 31.311513166874647
Sum of elements 2000 to 2499: 40.36451400630176
Sum of elements 2500 to 2999: 50.51780404802412
Sum of elements 3000 to 3499: 64.21510020084679
Sum of elements 3500 to 3999: 81.95702343713492
Sum of elements 4000 to 4499: 100.29196056723595
Sum of elements 4500 to 4999: 120.24520295485854
Sum of elements 5000 to 5499: 146.38445267640054
Sum of elements 5500 to 5999: 58.87968944571912

"""

In [None]:
# Slice of file_names handled by this session: file_names[start:end].
start,end =  3500 , 3550
# Suffix of the dataset repo name built below as f"reg{i}".
i = 3

In [None]:
from PIL import Image

In [None]:
from huggingface_hub import snapshot_download, login
import os

# Authenticate with the Hugging Face Hub. Fail fast when the token is missing:
# without a token, login() falls back to an interactive prompt, which stalls an
# unattended run.
if not HF_API_KEY:
    raise RuntimeError("HF_API_KEY is not set; check the .env file")
login(token=HF_API_KEY)

In [None]:
from huggingface_hub import create_repo , upload_folder

# Target dataset repo for this session: "<username>/reg<i>".
username = "aneeshm44"
model_repo_name = "reg{}".format(i)

dataset_repo_id = "/".join((username, model_repo_name))

# exist_ok=True makes the call a no-op when the repo already exists, so the
# cell can be re-run safely.
create_repo(
    repo_id=dataset_repo_id,
    repo_type="dataset",
    private=False,
    exist_ok=True,
)


In [None]:
import os
import shutil
import tqdm
import time
import ftplib

# Directory that files are downloaded into and uploaded from.
DOWNLOAD_DIR = "/kaggle/working"

# Once the directory reaches this size it is uploaded and emptied.
STORAGE_LIMIT_GB = 18
STORAGE_LIMIT_BYTES = STORAGE_LIMIT_GB * 1024 ** 3

# Wall-clock budget for the download loop: TIME_LIMIT_HOURS plus 45 minutes.
TIME_LIMIT_HOURS = 11
TIME_LIMIT_SECONDS = TIME_LIMIT_HOURS * 60 * 60 + 45 * 60

# Number of upload commits attempted so far.
commit_counter = 0

# Lists to track failed operations
failed_uploads = []
failed_downloads = []

def get_directory_size(directory):
    """Return the combined size in bytes of all files below ``directory``.

    The tree is walked recursively with ``os.walk``; entries that no longer
    exist when they are inspected are left out of the total.
    """
    candidate_paths = (
        os.path.join(root, name)
        for root, _subdirs, names in os.walk(directory)
        for name in names
    )
    return sum(os.path.getsize(path) for path in candidate_paths if os.path.exists(path))

def _reconnect_ftp():
    """Open a fresh, logged-in FTP session positioned in /REG_train.

    Tries each address in the module-level SERVERS list in order, logging in
    with the module-level USERNAME and PASSWORD (defined in the connection
    cell), so the credentials are not repeated here.

    Returns:
        The connected ``ftplib.FTP`` object.

    Raises:
        Exception: when no server accepts the login, or when changing to
            /REG_train fails.
    """
    connection = None
    for server in SERVERS:
        # Keep the client in a temporary name so `connection` stays None unless
        # the login really succeeded; otherwise the check below is dead code.
        candidate = ftplib.FTP()
        try:
            candidate.connect(server, 21)
            candidate.login(USERNAME, PASSWORD)
        except Exception as e:
            print(f"❌ Failed to reconnect to {server}: {e}")
            candidate.close()
            continue
        connection = candidate
        print(f"✅ Reconnected to {server}")
        break

    if connection is None:
        print("❌ Could not reconnect to any FTP server")
        raise Exception("FTP reconnection failed")

    # Change to the correct directory
    try:
        connection.cwd('/REG_train')
        print("📁 Changed to /REG_train directory")
    except Exception as e:
        print(f"❌ Failed to change to /REG_train directory: {e}")
        raise Exception("FTP directory change failed") from e

    return connection


def _clear_download_dir():
    """Delete every file, symlink and sub-directory inside DOWNLOAD_DIR."""
    for filename in os.listdir(DOWNLOAD_DIR):
        file_path = os.path.join(DOWNLOAD_DIR, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f'Failed to delete {file_path}. Reason: {e}')


def cleanup_and_upload():
    """Upload DOWNLOAD_DIR to the dataset repo, reconnect FTP, then empty the directory.

    Steps, in order:
      1. Increment ``commit_counter`` and measure the directory.
      2. Remove the Kaggle notebook copy and upload the folder with
         ``upload_folder`` (notebook files are excluded from the upload).
         An upload failure is reported and every non-notebook entry is recorded
         in ``failed_uploads``; it does not abort the run.
      3. Replace the global ``ftp`` session with a fresh one in /REG_train.
      4. Delete everything inside DOWNLOAD_DIR. This happens even when the
         upload failed, so those files are only traceable via ``failed_uploads``.

    Raises:
        Exception: when the FTP session cannot be re-established. In that case
            the directory is left untouched.
    """
    global commit_counter, ftp
    commit_counter += 1

    current_size = get_directory_size(DOWNLOAD_DIR)
    current_size_gb = current_size / (1024 * 1024 * 1024)
    file_count = len([f for f in os.listdir(DOWNLOAD_DIR) if os.path.isfile(os.path.join(DOWNLOAD_DIR, f))])

    print(f"Storage limit reached. Uploading files and cleaning up...")

    try:
        os.remove("/kaggle/working/__notebook__.ipynb")
    except Exception as e:
        print(f"Could not remove notebook file: {e}")

    try:
        upload_folder(
            folder_path=DOWNLOAD_DIR,
            repo_id=dataset_repo_id,
            repo_type="dataset",
            commit_message=f"Commit {commit_counter}: {current_size_gb:.2f}GB with {file_count} files {start} to {end}",
            ignore_patterns=["*.ipynb", "__notebook__.ipynb"],
        )
        print(f"Upload successful for commit {commit_counter}")
    except Exception as e:
        print(f"Upload failed for commit {commit_counter}: {e}")
        # Add all files in directory to failed uploads list
        for filename in os.listdir(DOWNLOAD_DIR):
            if not filename.endswith(".ipynb"):
                failed_uploads.append(f"commit_{commit_counter}_{filename}")

    # Re-establish FTP connection after upload (connection might be lost during upload)
    print("🔄 Re-establishing FTP connection after upload...")

    # Close existing connection if it exists
    if ftp:
        try:
            ftp.quit()
        except Exception:
            # The old session is usually already dead here, so a polite QUIT
            # fails; just release the socket instead of hiding every error.
            ftp.close()

    ftp = None
    ftp = _reconnect_ftp()

    _clear_download_dir()

    print(f"Cleanup completed. Directory cleared.")

start_time = time.time()

# The loop uses its own index name so it does not overwrite the notebook-level
# `i` that the repo-name cell reads (f"reg{i}").
for file_index, file in enumerate(tqdm.tqdm(file_names[start:end]), start):

    # Stop before the session's wall-clock budget runs out.
    if time.time() - start_time >= TIME_LIMIT_SECONDS:
        remaining_files = len(file_names[start:end]) - (file_index - start)
        print(f"Time limit of {TIME_LIMIT_HOURS} hours reached. Stopping at index {file_index}.")
        print(f"Files remaining: {remaining_files}")
        break

    current_size = get_directory_size(DOWNLOAD_DIR)
    current_size_gb = current_size / (1024 * 1024 * 1024)

    # Flush the directory to the Hub once it reaches the storage limit.
    if current_size >= STORAGE_LIMIT_BYTES:
        cleanup_and_upload()

    local_path = os.path.join(DOWNLOAD_DIR, file)
    try:
        with open(local_path, 'wb') as file_handle:
            ftp.retrbinary(f'RETR {file}', file_handle.write)
        print(f"Downloaded: {file} (Index: {file_index}, Directory size: {current_size_gb:.2f}GB)")
    except Exception as e:
        print(f"Failed to download {file}: {e}")
        failed_downloads.append(file)
        # Remove the truncated file; otherwise the next upload would publish it
        # as if it were a complete slide.
        try:
            if os.path.exists(local_path):
                os.remove(local_path)
        except OSError as cleanup_error:
            print(f"Could not remove partial file {local_path}: {cleanup_error}")

# Final upload: push whatever is still in the download directory.
if len(os.listdir(DOWNLOAD_DIR)) > 0:
    commit_counter += 1
    current_size = get_directory_size(DOWNLOAD_DIR)
    current_size_gb = current_size / 1024 ** 3
    # Count regular files only (sub-directories are not included in the count).
    file_count = sum(
        1
        for entry in os.listdir(DOWNLOAD_DIR)
        if os.path.isfile(os.path.join(DOWNLOAD_DIR, entry))
    )

    print("Uploading final batch of files...")

    # The Kaggle notebook copy must not end up in the dataset repo.
    try:
        os.remove("/kaggle/working/__notebook__.ipynb")
    except Exception as e:
        print(f"Could not remove notebook file: {e}")

    try:
        upload_folder(
            repo_id=dataset_repo_id,
            repo_type="dataset",
            folder_path=DOWNLOAD_DIR,
            ignore_patterns=["*.ipynb", "__notebook__.ipynb"],
            commit_message=f"Final commit {commit_counter}: {current_size_gb:.2f}GB with {file_count} files {start} to {end}",
        )
    except Exception as e:
        print(f"Final upload failed: {e}")
        # Record every non-notebook entry still in the directory as failed.
        for filename in os.listdir(DOWNLOAD_DIR):
            if filename == "__notebook__.ipynb" or filename.endswith(".ipynb"):
                continue
            failed_uploads.append(f"final_commit_{filename}")
    else:
        print(f"Final upload successful")

# Report which downloads and uploads failed, and how many commits were attempted.
print("\n" + "="*50, "OPERATION SUMMARY", "="*50, sep="\n")

if not failed_downloads:
    print("\nAll downloads completed successfully!")
else:
    print(f"\nFAILED DOWNLOADS ({len(failed_downloads)} files):")
    for filename in failed_downloads:
        print(f"  - {filename}")

if not failed_uploads:
    print("\nAll uploads completed successfully!")
else:
    print(f"\nFAILED UPLOADS ({len(failed_uploads)} files):")
    for filename in failed_uploads:
        print(f"  - {filename}")

print(f"\nTotal commits attempted: {commit_counter}", "="*50, sep="\n")