Remove irrelevant columns, apply logarithmic transformation to columns with long-tail distribution

In [None]:
import pandas as pd
import numpy as np
import requests
import os
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# ================= Configure paths =================
# Everything lives under BASE_DIR: the raw CSV that is read, the directory the
# downloaded images are stored in, and the preprocessed CSV that is written.
BASE_DIR = '/data1/jiazy/anime'
FILE_PATH, IMAGE_SAVE_DIR, OUTPUT_PATH = (
    os.path.join(BASE_DIR, leaf)
    for leaf in ('anime-dataset-2023.csv', 'images', 'anime_preprocessed.csv')
)
# ===========================================

def download_image(url, save_path):
    """Download a single image to ``save_path``.

    The response body is first written to ``<save_path>.part`` and renamed to
    ``save_path`` only once the write has finished, so a failed or interrupted
    write never leaves a truncated file at ``save_path`` (callers that treat
    an existing file as "already downloaded" would otherwise keep it).

    Args:
        url: Image URL. NaN/None or the literal 'Unknown' mean "no image".
        save_path: Destination path of the image file.

    Returns:
        True if the image was downloaded and stored; False for a missing URL,
        a non-200 response, an empty body, or any error along the way.
    """
    tmp_path = f"{save_path}.part"
    try:
        if pd.isna(url) or url == 'Unknown':
            return False
        # (connect, read) timeouts in seconds
        response = requests.get(url, timeout=(5, 10))
        if response.status_code != 200 or not response.content:
            return False
        with open(tmp_path, 'wb') as f:
            f.write(response.content)
        os.replace(tmp_path, save_path)
        return True
    except Exception as exc:
        # Best effort: one bad URL must not abort a long batch, but report why.
        print(f"Failed to download {url}: {exc}")

    # Only reached after an error: discard a partially written temp file.
    try:
        os.remove(tmp_path)
    except OSError:
        pass  # nothing was written, or it cannot be removed; nothing more to do
    return False

# 1. Load data
if not os.path.exists(FILE_PATH):
    print(f"Error: File not found {FILE_PATH}")
    # `exit()` is an interactive convenience added by the `site` module and
    # exits with status 0; raise SystemExit directly and signal the failure.
    raise SystemExit(1)

print(f"Reading file: {FILE_PATH} ...")
df = pd.read_csv(FILE_PATH)
# Row count before any filtering, used later to report how many rows were removed.
original_count = len(df)

# ================= Step A: Download images and filter failed samples =================
print(f"Preparing to download images to: {IMAGE_SAVE_DIR}")
# exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(IMAGE_SAVE_DIR, exist_ok=True)

# Index labels of the rows that have an image file on disk after this step.
valid_indices = []

if 'anime_id' in df.columns and 'Image URL' in df.columns:
    print("Starting to check/download images (samples with failed downloads will be removed)...")

    # Walk the two needed columns directly: iterrows() builds a Series per row
    # and does not preserve column dtypes, which could change how the id is
    # formatted into the file name.
    id_url_rows = zip(df.index, df['anime_id'], df['Image URL'])
    for index, anime_id, img_url in tqdm(id_url_rows, total=len(df), desc="Processing Images"):
        save_path = os.path.join(IMAGE_SAVE_DIR, f"{anime_id}.jpg")

        # An existing file counts as already downloaded; otherwise try to fetch it.
        if os.path.exists(save_path) or download_image(img_url, save_path):
            valid_indices.append(index)

    # Filter valid samples
    df = df.loc[valid_indices].reset_index(drop=True)
    print(f"\nImage processing completed! Remaining samples: {len(df)} (removed {original_count - len(df)} failed samples)")
else:
    print("Warning: 'anime_id' or 'Image URL' column not found, skipping download.")

# ================= Step B: Remove irrelevant columns =================
# Genres is deliberately not in this list, so it survives this step.
cols_to_drop = [
    'Name', 'English name', 'Other name', 'Synopsis',
    'Aired', 'Premiered', 'Producers', 'Licensors', 'Studios', 'Rank',
    'Image URL',
]

print("Removing irrelevant columns like Name, Image URL...")
# errors='ignore' tolerates listed columns that this CSV does not contain.
df = df.drop(columns=cols_to_drop, errors='ignore')

# ================= Step C: Process Genres (keep only the first) =================
if 'Genres' in df.columns:
    print("Processing Genres column (keep only the first genre)...")
    # Missing values become the text 'Unknown' so every entry is a string.
    genre_text = df['Genres'].fillna('Unknown').astype(str)
    # Keep the part before the first comma, without surrounding whitespace.
    df['Genres'] = genre_text.str.split(',').str[0].str.strip()

    print(f"Genres processing example: {df['Genres'].head(3).tolist()}")

# ================= Step D: Logarithmic transformation =================
cols_to_log = ['Favorites', 'Popularity', 'Members']
print(f"Applying logarithmic transformation (Log1p) to high-variance columns {cols_to_log}...")

for col in cols_to_log:
    if col not in df.columns:
        continue
    # Non-numeric entries are coerced to NaN and then treated as 0 before log1p.
    numeric_values = pd.to_numeric(df[col], errors='coerce').fillna(0)
    df[col] = np.log1p(numeric_values)

# ================= Step E: Categorical feature encoding (including Genres) =================
# Genres is encoded together with the other categorical columns.
cols_to_encode = ['Type', 'Status', 'Source', 'Rating', 'Genres']
le = LabelEncoder()

print("Encoding categorical columns (Label Encoding)...")
for col in cols_to_encode:
    if col not in df.columns:
        continue

    # The encoder is re-fitted for every column; missing values get their own 'Unknown' class.
    labels = df[col].fillna('Unknown').astype(str)
    df[col] = le.fit_transform(labels)

    # Show the first few class -> code pairs of the encoder just fitted.
    mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    preview = list(mapping.items())[:3]
    print(f" - {col} (first 3 categories): {preview}")

# ================= Step F: Save results =================
print("\nProcessing completed! Preview of first 5 rows:")
first_rows = df.head(5)
print(first_rows)

# index=False keeps the DataFrame index out of the written file.
df.to_csv(OUTPUT_PATH, index=False)
print(f"\nSaved preprocessed CSV to: {OUTPUT_PATH}")

正在读取文件：/mnt/hdd/jiazy/anime/anime-dataset-2023.csv ...
准备下载图片到: /mnt/hdd/jiazy/anime/images
开始检查/下载图片 (下载失败的样本将被删除)...


Processing Images: 100%|██████████| 24905/24905 [7:19:00<00:00,  1.06s/it]  


图片处理完成！剩余样本数: 24775 (已删除 130 条失败样本)
正在删除 Name, Image URL 等无关列...
正在处理 Genres 列 (只保留第一个题材)...
Genres 处理示例: ['Action', 'Action', 'Action']
正在对高方差列 ['Favorites', 'Popularity', 'Members'] 进行对数变换(Log1p)...
正在对分类列进行编码 (Label Encoding)...
 - Type (前3个类别): [('Movie', 0), ('Music', 1), ('ONA', 2)]
 - Status (前3个类别): [('Currently Airing', 0), ('Finished Airing', 1), ('Not yet aired', 2)]
 - Source (前3个类别): [('4-koma manga', 0), ('Book', 1), ('Card game', 2)]
 - Rating (前3个类别): [('G - All Ages', 0), ('PG - Children', 1), ('PG-13 - Teens 13 or older', 2)]
 - Genres (前3个类别): [('Action', 0), ('Adventure', 1), ('Avant Garde', 2)]

处理完成！前5行数据预览：
   anime_id Score  Genres  Type Episodes  Status  Source       Duration  \
0         1  8.75       0     5     26.0       1       9  24 min per ep   
1         5  8.38       0     0      1.0       1       9    1 hr 55 min   
2         6  8.22       0     5     26.0       1       5  24 min per ep   
3         7  7.25       0     5     26.0       1       9  25 




In [None]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from PIL import Image
from tqdm import tqdm

# Keep compatible across PIL versions: take LANCZOS from Image.Resampling when
# that attribute is available, otherwise from the Image module itself.
if hasattr(Image, "Resampling") and hasattr(Image.Resampling, "LANCZOS"):
    LANCZOS_RESAMPLE = Image.Resampling.LANCZOS
else:
    LANCZOS_RESAMPLE = Image.LANCZOS

# ================= 1. Configure paths and constants (modify according to your environment) =================
# Root directory
BASE_DIR = "/data1/jiazy/anime"

# Input file (use the preprocessed CSV generated in the previous step)
METADATA_FILE = os.path.join(BASE_DIR, "anime_preprocessed.csv")
IMAGE_DIR = os.path.join(BASE_DIR, "images")

# Output directory (feature save location), relative to the working directory
OUTPUT_DIR = "./features"
# exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ================= 2. Define feature columns =================
# Continuous features, standardized later with StandardScaler.
# Only Members, Favorites and Popularity were log1p-transformed by the previous
# step; Episodes, Duration and Scored By arrive untransformed.
CONTINUOUS_COLS = [
    'Members', 'Favorites', 'Popularity', 'Episodes', 'Duration', 'Scored By'
]

# Categorical features (already label-encoded by the previous step; re-coded
# again in the pipeline to obtain contiguous integer codes)
CATEGORICAL_COLS = [
    'Genres', 'Type', 'Status', 'Source', 'Rating'
]

# Label column (regression target)
LABEL_COL = 'Score'

# ID column (for associating images)
ID_COL = 'anime_id'

# Non-feature columns to exclude (if any)
NON_FEATURE_COLS = []

# ================= 3. Image processing helper function =================
def process_and_save_image(img_path_info):
    """Convert one JPG into a 224x224 RGB array stored as a .npy file.

    The array is written to a temporary file next to the target and renamed
    only after the write has finished, so an interrupted run cannot leave a
    truncated .npy that the "already exists" shortcut would later accept.

    Args:
        img_path_info: Tuple ``(jpg_path, npy_path)`` of the source image path
            and the target .npy path.

    Returns:
        ``npy_path`` if the file already existed or was written successfully;
        None if the source image is missing or processing failed.
    """
    jpg_path, npy_path = img_path_info

    # If .npy already exists, return path directly (skip processing)
    if os.path.exists(npy_path):
        return npy_path

    if not os.path.exists(jpg_path):
        return None  # Original image missing

    # np.save appends ".npy" to names that lack it, so the temp name keeps the suffix.
    tmp_path = f"{npy_path}.tmp.npy"
    try:
        # The context manager closes the underlying image file handle.
        with Image.open(jpg_path) as img:
            # Force RGB so every saved array has three channels.
            rgb_img = img if img.mode == 'RGB' else img.convert('RGB')
            img_resized = rgb_img.resize((224, 224), resample=LANCZOS_RESAMPLE)

        np.save(tmp_path, np.array(img_resized))
        os.replace(tmp_path, npy_path)
        return npy_path
    except Exception as e:
        print(f"Failed to process image {jpg_path}: {e}")

    # Only reached after an error: discard a partially written temp file.
    try:
        os.remove(tmp_path)
    except OSError:
        pass  # nothing was written, or it cannot be removed; nothing more to do
    return None

def _duration_to_minutes(values):
    """Convert a Series of durations to minutes (float).

    Understands text built from ``<n> hr``, ``<n> min`` and ``<n> sec`` parts
    (for example "24 min per ep" or "1 hr 55 min") as well as values that are
    already numeric. Anything else becomes NaN.
    """
    text = values.astype(str)
    hours = pd.to_numeric(text.str.extract(r'(\d+)\s*hr', expand=False), errors='coerce')
    minutes = pd.to_numeric(text.str.extract(r'(\d+)\s*min', expand=False), errors='coerce')
    seconds = pd.to_numeric(text.str.extract(r'(\d+)\s*sec', expand=False), errors='coerce')

    has_unit = hours.notna() | minutes.notna() | seconds.notna()
    total_minutes = hours.fillna(0) * 60 + minutes.fillna(0) + seconds.fillna(0) / 60
    # Plain numbers are kept as they are; text without any unit stays NaN.
    return pd.to_numeric(values, errors='coerce').fillna(total_minutes.where(has_unit))


def preprocess_anime_flow():
    """Build train/val/test feature files from the preprocessed anime CSV.

    Loads METADATA_FILE, drops rows whose label is not numeric, converts and
    fills the continuous columns, re-codes the categorical columns, splits
    80/10/10 with a fixed seed, standardizes the continuous columns with
    statistics fitted on the training split, converts every image to a .npy
    array and drops rows whose image could not be produced. For each split it
    writes a header-less features CSV, a float32 label tensor and a list of
    .npy paths into OUTPUT_DIR, plus ``tabular_lengths.pt``.

    Raises:
        FileNotFoundError: If METADATA_FILE does not exist.
    """
    print("=== Starting Anime dataset preprocessing pipeline ===")

    # --- 1. Load data ---
    if not os.path.exists(METADATA_FILE):
        raise FileNotFoundError(f"Input file not found: {METADATA_FILE}")

    print(f"Loading data {METADATA_FILE}...")
    df = pd.read_csv(METADATA_FILE)
    print(f"Initial record count: {len(df)}")

    # Construct image paths
    df['image_path'] = df[ID_COL].apply(lambda x: os.path.join(IMAGE_DIR, f"{x}.jpg"))
    # splitext swaps only the extension; str.replace would rewrite ".jpg"
    # anywhere in the path, including directory names.
    df['npy_path_target'] = df['image_path'].apply(lambda x: os.path.splitext(x)[0] + ".npy")

    # --- 2. Data cleaning ---
    print("Cleaning data...")

    # === A. Clean label column (LABEL_COL) ===
    print(f"Cleaning label column {LABEL_COL} (force convert to numeric, remove invalid labels)...")

    # Non-numeric labels become NaN ...
    df[LABEL_COL] = pd.to_numeric(df[LABEL_COL], errors='coerce')

    # ... and rows without a label are dropped. The explicit copy makes the
    # later column assignments write to an independent frame.
    original_len = len(df)
    df = df.dropna(subset=[LABEL_COL]).copy()
    print(f" - Removed {original_len - len(df)} samples due to missing labels")

    df[LABEL_COL] = df[LABEL_COL].astype(float)

    # === B. Clean continuous feature columns (CONTINUOUS_COLS) ===
    print(f"Cleaning continuous features (fill UNKNOWN with mode): {CONTINUOUS_COLS} ...")

    # Duration is free text such as "24 min per ep"; coercing it directly
    # would turn every value into NaN and leave a constant feature.
    duration_col = 'Duration'
    if duration_col in CONTINUOUS_COLS and duration_col in df.columns:
        df[duration_col] = _duration_to_minutes(df[duration_col])

    for col in CONTINUOUS_COLS:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        # Fill gaps with the most frequent value, or 0 when the column has no valid value at all.
        modes = df[col].mode()
        mode_val = modes.iloc[0] if not modes.empty else 0
        df[col] = df[col].fillna(mode_val).astype(float)

    # --- 3. Process categorical features ---
    print("Processing categorical features...")
    cat_dims = []
    for col in CATEGORICAL_COLS:
        # Categorical features also need to guard against dirty data
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

        # Re-code to contiguous integers 0..k-1 over the rows that remain.
        df[col] = df[col].astype('category')
        df[col] = df[col].cat.codes

        dim = int(df[col].max() + 1)
        cat_dims.append(dim)
        print(f" - {col}: {dim} classes")

    # --- 4. Dataset splitting ---
    print("Performing 80:10:10 random split...")
    train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
    # Independent copies: the splits are modified in place below.
    train_df, val_df, test_df = train_df.copy(), val_df.copy(), test_df.copy()

    print(f"Split results: Train={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}")

    # --- 5. Continuous feature standardization ---
    if CONTINUOUS_COLS:
        print(f"Standardizing continuous features: {CONTINUOUS_COLS} ...")
        # Fit on the training split only, then apply the same scaling everywhere.
        scaler = StandardScaler()
        scaler.fit(train_df[CONTINUOUS_COLS])

        train_df[CONTINUOUS_COLS] = scaler.transform(train_df[CONTINUOUS_COLS])
        val_df[CONTINUOUS_COLS] = scaler.transform(val_df[CONTINUOUS_COLS])
        test_df[CONTINUOUS_COLS] = scaler.transform(test_df[CONTINUOUS_COLS])

    # --- 6. Image processing ---
    print("Processing images (Resize 224x224 -> .npy)...")
    for split_name, split_df in zip(['Train', 'Val', 'Test'], [train_df, val_df, test_df]):
        path_pairs = zip(split_df['image_path'], split_df['npy_path_target'])
        # process_and_save_image returns None for images that could not be produced.
        split_df['final_npy_path'] = [
            process_and_save_image(pair)
            for pair in tqdm(path_pairs, total=len(split_df), desc=f"Processing {split_name}")
        ]

    # --- 7. Filter again: drop rows without a usable image ---
    train_df = train_df.dropna(subset=['final_npy_path'])
    val_df = val_df.dropna(subset=['final_npy_path'])
    test_df = test_df.dropna(subset=['final_npy_path'])

    # --- 8. Save output ---
    print(f"Saving final features to {OUTPUT_DIR} ...")

    # One entry per feature column, in the order written to the features CSV:
    # the class count for each categorical column, then 1 for each continuous column.
    tabular_lengths = cat_dims + [1] * len(CONTINUOUS_COLS)
    torch.save(tabular_lengths, os.path.join(OUTPUT_DIR, "tabular_lengths.pt"))

    cols_to_save = CATEGORICAL_COLS + CONTINUOUS_COLS
    for split_name, df_split in zip(["train", "val", "test"], [train_df, val_df, test_df]):
        # A. Features CSV (no header, no index)
        features_path = os.path.join(OUTPUT_DIR, f"{split_name}_features.csv")
        df_split[cols_to_save].to_csv(features_path, index=False, header=False)

        # B. Labels tensor (the label column was cast to float during cleaning)
        labels_path = os.path.join(OUTPUT_DIR, f"{split_name}_labels.pt")
        labels_tensor = torch.tensor(df_split[LABEL_COL].values, dtype=torch.float32)
        torch.save(labels_tensor, labels_path)

        # C. List of .npy paths, in the same row order as features and labels
        paths_path = os.path.join(OUTPUT_DIR, f"{split_name}_paths.pt")
        npy_path_list = df_split['final_npy_path'].tolist()
        torch.save(npy_path_list, paths_path)

    print("\n=== Anime preprocessing completed! ===")
    print(f"Output directory: {os.path.abspath(OUTPUT_DIR)}")


# Run the whole pipeline only when this code is executed as the main module.
if __name__ == "__main__":
    preprocess_anime_flow()

  from .autonotebook import tqdm as notebook_tqdm


=== 开始执行 Anime 数据集预处理流程 ===
正在加载数据 /data1/jiazy/anime/anime_preprocessed.csv...
初始记录数: 24775
正在清洗数据...
正在清洗标签列 Score (强制转数字，删除无效标签)...
 - 因标签缺失删除了 9133 条样本
正在清洗连续特征 (将 UNKNOWN 填充为众数): ['Members', 'Favorites', 'Popularity', 'Episodes', 'Duration', 'Scored By'] ...
正在处理类别特征...
 - Genres: 22 类
 - Type: 7 类
 - Status: 2 类
 - Source: 17 类
 - Rating: 7 类
正在进行 80:10:10 随机划分...
划分结果: Train=12513, Val=1564, Test=1565
正在标准化连续特征: ['Members', 'Favorites', 'Popularity', 'Episodes', 'Duration', 'Scored By'] ...
正在处理图像 (Resize 224x224 -> .npy)...


Processing Train: 100%|██████████| 12513/12513 [00:00<00:00, 346088.78it/s]
Processing Val: 100%|██████████| 1564/1564 [00:00<00:00, 330539.73it/s]
Processing Test: 100%|██████████| 1565/1565 [00:00<00:00, 330384.83it/s]

正在保存最终特征到 ./features ...






=== Anime 预处理全部完成！ ===
输出目录: /data1/jiazy/anime/features
