In [None]:
# !pip install requests

In [None]:
import pandas as pd
import numpy as np
import ast
import re
import os
import requests
from urllib.parse import urlparse


# Load the raw Digimon table and discard the 'type' column, which the
# original author noted contains nothing but NaN.
df = (
    pd.read_csv('../../public/data/digimons.csv')
    .drop(columns=['type'])
)

# Names use underscores as word separators; turn them into spaces.
df['name'] = df['name'].str.replace('_', ' ', regex=False)

def normalize_name(name):
    """Return a cleaned-up Digimon name.

    Rules, applied in order:

    1. A non-string value (for example the float NaN of a missing cell) is
       returned unchanged instead of being fed to the regex functions.
    2. If the name contains "(Black)" in any casing, every parenthesised
       group is removed and " Black" is appended. The word "Core" is NOT
       removed in this case.
    3. Otherwise the standalone word "Core" is removed, followed by every
       parenthesised group.

    Whitespace runs left behind by a removal are collapsed to one space.

    Args:
        name: Raw name; expected to be a string.

    Returns:
        The normalised name, or ``name`` itself when it is not a string.
    """
    if not isinstance(name, str):
        # re.search / re.sub raise TypeError on floats such as NaN.
        return name

    # 1. "(Black)" variants are handled before any other cleanup.
    if re.search(r"\(Black\)", name, re.IGNORECASE):
        base = re.sub(r"\s*\(.*?\)", "", name)
        return " ".join(base.split()) + " Black"

    # 2. Drop the standalone word "Core".
    name = re.sub(r"\bCore\b", "", name)

    # 3. Drop anything in parentheses (and the whitespace before it).
    name = re.sub(r"\s*\(.*?\)", "", name)

    # split/join trims the ends and squeezes the gap a mid-name removal leaves.
    return " ".join(name.split())

# Clean every name with normalize_name.
df['name'] = df['name'].map(normalize_name)

# Drop the 'Agumon Black' row, then keep only the first row per name and
# renumber the index from zero.
df = (
    df.loc[df['name'] != 'Agumon Black']
    .drop_duplicates(subset='name', keep='first')
    .reset_index(drop=True)
)

# Normalizando os valores da coluna 'level'
def normalize_level(value):
    """Reduce a raw 'level' cell to a single word.

    Processing steps:

    * NaN becomes 'Unknown'.
    * A value wrapped in braces ("{A, B}") is unwrapped and only the part
      before the first comma is kept.
    * Only the part before the first '/' is kept, and of that only the first
      whitespace-separated word.
    * If nothing is left, or the remaining word is "unidentified" in any
      casing, 'Unknown' is returned.

    Args:
        value: Raw cell value (string or NaN).

    Returns:
        The first level word, or 'Unknown'.
    """
    if pd.isna(value):
        return 'Unknown'

    text = str(value).strip()

    # Unwrap "{...}" and keep the first comma-separated member.
    if text.startswith('{') and text.endswith('}'):
        text = text[1:-1].split(',')[0].strip()

    # Keep what precedes the first '/', then split it into words.
    words = text.split('/')[0].split()

    # An empty cell such as "" or "{}" yields no words; indexing [0] blindly
    # would raise IndexError. "Unidentified" is checked here, after unwrapping,
    # so "{Unidentified}" is treated the same as "Unidentified".
    if not words or words[0].lower() == 'unidentified':
        return 'Unknown'

    return words[0]

# Replace each raw level with its normalised single-word form.
df['level'] = df['level'].map(normalize_level)

# Normalizando a coluna 'attribute':
PRIMARY = ['Vaccine', 'Virus', 'Data']
SECONDARY = ['Free', 'Variable', 'No Attribute', 'Unknown']


def normalize_attribute(value):
    """Collapse a raw 'attribute' cell into "Primary", "Secondary",
    "Primary / Secondary" or 'Unknown'.

    NaN, and the whole-cell values "No Data", "None" and "Unidentified"
    (compared case-insensitively), map to 'Unknown'. Otherwise surrounding
    braces are removed, the text is split on commas, and the first token
    found in PRIMARY and the first token found in SECONDARY (other than
    'Unknown') are combined. Token matching is case-sensitive.
    """
    if pd.isna(value):
        return 'Unknown'

    text = str(value).strip()
    if text.upper() in ('NO DATA', 'NONE', 'UNIDENTIFIED'):
        return 'Unknown'

    # Strip a surrounding "{ ... }" if present.
    if text.startswith('{') and text.endswith('}'):
        text = text[1:-1]

    tokens = [piece.strip() for piece in text.split(',')]

    # First primary attribute, in the order the cell lists them.
    primary = next((tok for tok in tokens if tok in PRIMARY), None)

    # First secondary attribute that is not a placeholder.
    secondary = next(
        (
            tok
            for tok in tokens
            if tok in SECONDARY and tok not in ['Unidentified', 'Unknown']
        ),
        None,
    )

    chosen = [part for part in (primary, secondary) if part]
    return ' / '.join(chosen) if chosen else 'Unknown'

# Replace each raw attribute cell with its normalised label.
df['attribute'] = df['attribute'].map(normalize_attribute)

# Normalizando a coluna family:
def normalize_family(value):
    """Reduce a raw 'family' cell to a single family name.

    * NaN or an empty list literal ("[]") gives 'Unknown'.
    * A string that looks like a list literal ("[...]") is parsed with
      ``ast.literal_eval`` and its first element is returned; a literal that
      cannot be parsed gives 'Unknown'.
    * Any other value is returned as text with surrounding double quotes
      removed.

    Args:
        value: Raw cell value; normally a string or NaN. Other scalars are
            converted with ``str`` rather than crashing on ``.strip()``.

    Returns:
        The first family name, or 'Unknown'.
    """
    if pd.isna(value):
        return 'Unknown'

    text = str(value).strip()

    # A list written out as a string.
    if text.startswith('[') and text.endswith(']'):
        try:
            families = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Only the errors literal_eval documents for malformed input are
            # caught; a bare except would also swallow KeyboardInterrupt.
            return 'Unknown'
        return families[0] if families else 'Unknown'

    # A single word, possibly wrapped in double quotes.
    return text.strip('"')

# Replace each raw family cell with a single family name.
df['family'] = df['family'].map(normalize_family)

# Normalizando 'prior_forms':
def normalize_prior_forms(val):
    """Return the first entry of a 'prior_forms' cell, or None.

    A string is parsed with ``ast.literal_eval``. The result is the first
    element when the (parsed) value is a non-empty list; in every other case
    (unparseable string, non-list value such as NaN, empty list) the result
    is None.

    Args:
        val: Raw cell value: a list, its string representation, or NaN.

    Returns:
        The first list element, or None.
    """
    if isinstance(val, str):
        try:
            val = ast.literal_eval(val)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Catch only what literal_eval documents for malformed input, so
            # interrupts and other BaseExceptions still propagate.
            return None

    if isinstance(val, list) and val:
        return val[0]
    return None

# Keep only the first prior form of each row.
df["prior_forms"] = df["prior_forms"].map(normalize_prior_forms)

# Normalizando as colunas 'next_forms' e 'lateral_next_forms':
def format_list_column(val):
    """Flatten a list-valued cell into a single value.

    A string is parsed with ``ast.literal_eval``. Then:

    * an unparseable string, a non-list value (e.g. NaN) or an empty list
      gives None;
    * a one-element list gives that element unchanged;
    * a longer list gives its elements joined with " | ". Elements are passed
      through ``str`` first so a non-string element does not make ``join``
      raise TypeError.

    Args:
        val: Raw cell value: a list, its string representation, or NaN.

    Returns:
        None, the single element, or the " | "-joined string.
    """
    if isinstance(val, str):
        try:
            val = ast.literal_eval(val)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Only the errors literal_eval documents for malformed input.
            return None

    if not isinstance(val, list) or not val:
        return None
    if len(val) == 1:
        return val[0]
    return " | ".join(str(item) for item in val)

# Aplicando nas colunas
# Both evolution-list columns get the same flattening.
for list_column in ("next_forms", "lateral_next_forms"):
    df[list_column] = df[list_column].apply(format_list_column)


# Normalizando a coluna 'digifuse_forms':
pattern = re.compile(r'(?i)\bdigifuse[s]?\s*chart[s]?\b')  # matches "DigiFuse Chart" / "Charts", case-insensitive


def clean_digifuse(val):
    """Flatten a 'digifuse_forms' cell and drop "DigiFuse Chart(s)" entries.

    Accepted inputs:

    * a list or tuple of items;
    * a string holding a list literal ('["A", "B"]'); if it cannot be parsed
      with ``ast.literal_eval`` the text between the brackets is split on
      commas instead;
    * a string already joined with '|';
    * any other string, treated as a single item;
    * any other non-missing value, converted with ``str``.

    Each item is stripped of whitespace and surrounding quotes; items that
    are None, empty, or match ``pattern`` are discarded.

    Args:
        val: Raw cell value.

    Returns:
        None when the value is missing or nothing survives the filter,
        otherwise the remaining items joined with " | ".
    """
    # Lists/tuples are handled first: pd.isna() returns an array for a list,
    # and using that array in a boolean test raises ValueError.
    if isinstance(val, (list, tuple)):
        items = list(val)
    elif pd.isna(val):
        return None
    elif isinstance(val, str):
        s = val.strip()
        if s.startswith('[') and s.endswith(']'):
            try:
                items = ast.literal_eval(s)
            except Exception:
                # Fallback: split the text between the brackets on commas.
                inner = s[1:-1]
                items = [x.strip().strip('"').strip("'") for x in inner.split(',') if x.strip()]
        elif '|' in s:
            # Already joined with " | ".
            items = [x.strip() for x in s.split('|') if x.strip()]
        else:
            items = [s]
    else:
        items = [str(val)]

    cleaned = []
    for item in items:
        if item is None:
            continue
        text = str(item).strip().strip('"').strip("'").strip()
        if text and not pattern.search(text):
            cleaned.append(text)

    if not cleaned:
        return None
    # join() on a single item returns that item unchanged.
    return " | ".join(cleaned)


# Flatten the DigiFuse list and filter out "DigiFuse Chart(s)" entries.
df['digifuse_forms'] = df['digifuse_forms'].map(clean_digifuse)

# Normalizando a coluna 'attacks':
def normalize_attacks(val):
    """Turn an 'attacks' cell into a " | "-joined string of attack names.

    The cell is expected to be a list of dicts (or the string form of one),
    each carrying a 'name' key. Names are converted with ``str``, stripped,
    and empty ones are skipped.

    Args:
        val: Raw cell value: a list, its string representation, or NaN.

    Returns:
        * None for a missing (NaN/None) value;
        * 'Unknown' when the value is not a list / list literal, cannot be
          parsed, yields no names, or yields exactly one name containing
          "Digimon Story";
        * otherwise the names joined with " | ".
    """
    # Lists are checked before pd.isna(): isna() returns an array for a list,
    # and using that array in a boolean test raises ValueError.
    if isinstance(val, list):
        entries = val
    elif pd.isna(val):
        return None
    elif isinstance(val, str):
        text = val.strip()
        if not (text.startswith('[') and text.endswith(']')):
            return 'Unknown'
        try:
            entries = ast.literal_eval(text)
        except Exception:
            return 'Unknown'
    else:
        return 'Unknown'

    # Keep only the non-empty 'name' of each dict entry.
    names = []
    for entry in entries:
        if isinstance(entry, dict) and 'name' in entry:
            name = str(entry['name']).strip()
            if name:
                names.append(name)

    if not names:
        return 'Unknown'

    # A lone entry mentioning "Digimon Story" is treated as unknown.
    if len(names) == 1 and "Digimon Story" in names[0]:
        return "Unknown"

    return " | ".join(names)


df['attacks'] = df['attacks'].apply(normalize_attacks)

#------------------------------------------------------------------------------------------------------------------------------------------------------#
# Flat list of every individual attack name across all rows.
attacks_list = [
    attack.strip()                       # splitting on '|' leaves the padding of " | "
    for cell in df['attacks']
    # normalize_attacks returns None for missing cells (calling .split on it
    # would raise) and the placeholder 'Unknown' when no attack is known.
    if isinstance(cell, str) and cell != 'Unknown'
    for attack in cell.split('|')
    if attack.strip()
]

df.to_csv('../../public/data/digimons_filtered.csv', index=False)


In [5]:
import os

# Count the regular files directly inside the image folder.
folder_path = "imagens_digimon"
count = len([entry for entry in os.scandir(folder_path) if entry.is_file()])
print("Number of files:", count)

Number of files: 1386


In [None]:
# # Caminho do seu CSV
# csv_path = '../../public/data/digimons_filtered.csv'

# # Pasta onde as imagens serão salvas
# output_dir = "imagens_digimon"
# os.makedirs(output_dir, exist_ok=True)

# # Carregar CSV
# df = pd.read_csv(csv_path)

# # Percorrer cada linha
# for idx, row in df.iterrows():
#     url = row["image"]
#     name = row["name"]

#     if pd.isna(url) or not isinstance(url, str):
#         print(f"[SKIP] {name} não tem URL válida")
#         continue

#     try:
#         # Fazer requisição
#         response = requests.get(url, timeout=10)
#         response.raise_for_status()

#         # Descobrir extensão do arquivo
#         path = urlparse(url).path
#         ext = os.path.splitext(path)[1]
#         if not ext:
#             ext = ".jpg"

#         # Nome do arquivo = nome do Digimon sem espaços
#         file_name = f"{name.replace(' ', '_')}{ext}"
#         file_path = os.path.join(output_dir, file_name)

#         # Salvar arquivo
#         with open(file_path, "wb") as f:
#             f.write(response.content)

#         print(f"[OK] {name} salvo em {file_path}")

#     except Exception as e:
#         print(f"[ERRO] Não consegui baixar {name} ({url}): {e}")
