In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
# Configs
BASE_PATH = '/Users/AnshulSrivastava/Desktop/Fall24/CMSE 492/Project/isic-2024-challenge/'
# PROJECT_PATH = BASE_PATH + 'V4/'

In [18]:
# Load the data
train_df = pd.read_csv(f'{BASE_PATH}/train-metadata.csv')
train_df = train_df.drop(columns=['lesion_id', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5', 'mel_mitotic_index', 'mel_thick_mm', 
                                  'tbp_lv_location','attribution', 'copyright_license', 'iddx_1', 'anatom_site_general', 'image_type',
                                  'iddx_full', 'tbp_tile_type', 'tbp_lv_dnn_lesion_confidence'])
train_df.head()

  train_df = pd.read_csv(f'{BASE_PATH}/train-metadata.csv')


Unnamed: 0,isic_id,target,patient_id,age_approx,sex,clin_size_long_diam_mm,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,...,tbp_lv_norm_color,tbp_lv_perimeterMM,tbp_lv_radial_color_std_max,tbp_lv_stdL,tbp_lv_stdLExt,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_x,tbp_lv_y,tbp_lv_z
0,ISIC_0015670,0,IP_1235828,60.0,male,3.04,20.244422,16.261975,26.922447,23.954773,...,0.0,9.307003,0.0,2.036195,2.63778,0.590476,85,-182.703552,613.493652,-42.427948
1,ISIC_0015845,0,IP_8170065,60.0,male,1.1,31.71257,25.36474,26.331,24.54929,...,0.0,3.354148,0.0,0.853227,3.912844,0.285714,55,-0.078308,1575.687,57.1745
2,ISIC_0015864,0,IP_6724798,60.0,male,3.4,22.57583,17.12817,37.97046,33.48541,...,0.0,8.886309,0.0,1.743651,1.950777,0.361905,105,123.6497,1472.01,232.9089
3,ISIC_0015902,0,IP_4111386,65.0,male,3.22,14.242329,12.164757,21.448144,21.121356,...,1.771705,9.514499,0.66469,1.258541,1.573733,0.209581,130,-141.02478,1442.185791,58.359802
4,ISIC_0024200,0,IP_8313778,55.0,male,2.73,24.72552,20.05747,26.4649,25.71046,...,0.0,6.467562,0.0,2.085409,2.480509,0.313433,20,-72.31564,1488.72,21.42896


In [19]:
# Separate numerical and categorical columns
num_cols = train_df.select_dtypes(include=[np.number]).columns
cat_cols = train_df.select_dtypes(include=[object]).columns
cat_cols = cat_cols.drop(['isic_id', 'patient_id'])
num_cols = num_cols.drop('target')

In [20]:
# Functions
class SexImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.male_proportion = X['sex'].value_counts(normalize=True)['male']
        self.female_proportion = X['sex'].value_counts(normalize=True)['female']
        return self
    
    def transform(self, X):
        X = X.copy()
        X['sex'] = X['sex'].apply(lambda x: np.random.choice(['male', 'female'], p=[self.male_proportion, self.female_proportion]) if pd.isna(x) else x)
        return X

class AgeApproxImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.mean_age_male = X[X['sex'] == 'male']['age_approx'].mean()
        self.mean_age_female = X[X['sex'] == 'female']['age_approx'].mean()
        return self
    
    def transform(self, X):
        X = X.copy()
        X.loc[(X['sex'] == 'male') & (X['age_approx'].isna()), 'age_approx'] = self.mean_age_male
        X.loc[(X['sex'] == 'female') & (X['age_approx'].isna()), 'age_approx'] = self.mean_age_female
        return X

In [21]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder

# Custom transformer for label encoding multiple columns
class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns  # List of columns to encode
        self.encoders = {}

    def fit(self, X, y=None):
        for col in self.columns:
            le = LabelEncoder()
            le.fit(X[col])
            self.encoders[col] = le
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X[col] = self.encoders[col].transform(X[col])
        return X

    def inverse_transform(self, X):
        X = X.copy()
        for col in self.columns:
            X[col] = self.encoders[col].inverse_transform(X[col])
        return X

In [22]:
# Check for missing values: Print the number of missing values in each column (> 0)
print(train_df.isna().sum()[train_df.isna().sum() > 0])

age_approx     2798
sex           11517
dtype: int64


In [23]:
# Define bins and labels for age discretization
age_bins = [0, 20, 40, 60, 80, 100]
age_labels = ['0-20', '21-40', '41-60', '61-80', '81-100']

In [27]:
# Create a preprocessing pipeline

# Pipeline for numeric columns
numeric_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),  # Scaling numeric features
])

# Pipeline for categorical columns
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))  # One-hot encoding categorical features
])

# Full preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, num_cols),  # Apply numeric pipeline to numeric columns
    ('cat', MultiColumnLabelEncoder(columns=cat_cols), cat_cols)  # Label encode categorical columns
])

# Main pipeline that handles all preprocessing steps
main_pipeline = Pipeline(steps=[
    ('sex_imputer', SexImputer()),  # Custom transformer for imputing 'sex'
    ('age_imputer', AgeApproxImputer()),  # Custom transformer for imputing 'age_approx'
    ('preprocessor', preprocessor)  # Apply preprocessing pipeline to the remaining columns
])

# Fit the pipeline to the training data
train_df_preprocessed = main_pipeline.fit_transform(train_df)

# Convert the transformed data into a DataFrame
numeric_transformed = pd.DataFrame(
    train_df_preprocessed[:, :len(num_cols)], 
    columns=num_cols
)
# Categorical columns passthrough
categorical_transformed = pd.DataFrame(
    train_df_preprocessed[:, len(num_cols):], 
    columns=cat_cols
)

#Add new features
numeric_transformed['lv_size_ratio'] = numeric_transformed['clin_size_long_diam_mm']/numeric_transformed['tbp_lv_minorAxisMM']
numeric_transformed['hue_contrast'] = np.abs(numeric_transformed['tbp_lv_H'] - numeric_transformed['tbp_lv_Hext'])
numeric_transformed['border_complexity'] = numeric_transformed['tbp_lv_norm_border'] + numeric_transformed['tbp_lv_symm_2axis']
numeric_transformed['color_uniformity'] = numeric_transformed['tbp_lv_color_std_mean'] / numeric_transformed['tbp_lv_radial_color_std_max']
numeric_transformed['log_lesion_area'] =  np.log(numeric_transformed['tbp_lv_areaMM2'] + 1)
numeric_transformed['normalized_lesion_size'] = numeric_transformed['clin_size_long_diam_mm'] / numeric_transformed['age_approx']
numeric_transformed['age_normalized_nevi_confidence'] = numeric_transformed['tbp_lv_nevi_confidence'] / numeric_transformed['age_approx']
numeric_transformed['age_size_symmetry_index'] = numeric_transformed['age_approx'] * numeric_transformed['clin_size_long_diam_mm'] * numeric_transformed['tbp_lv_symm_2axis']
categorical_transformed['age_group'] = pd.cut(train_df['age_approx'], bins=age_bins, labels=age_labels)
# # features_to_drop = ['tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext', 'tbp_lv_C', 'tbp_lv_Cext',
# #                     'tbp_lv_Lext', 'tbp_lv_L', 'tbp_lv_deltaLB', 'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z',
# #                     'tbp_lv_radial_color_std_max', 'tbp_lv_color_std_mean', 'tbp_lv_symm_2axis', 'tbp_lv_area_perim_ratio',
# #                     'tbp_lv_areaMM2', 'tbp_lv_perimeterMM', 'clin_size_long_diam_mm', 'tbp_lv_minorAxisMM', 
# #                     'tbp_lv_H', 'tbp_lv_Hext']

# #numeric_transformed = numeric_transformed.drop(columns=features_to_drop)

In [28]:
# Combine the transformed numeric and categorical features
train_metadata = pd.concat([numeric_transformed, categorical_transformed], axis=1)

# Add back target, isic_id, patient_id
train_metadata['target'] = train_df['target']
train_metadata['isic_id'] = train_df['isic_id']
train_metadata['patient_id'] = train_df['patient_id']

train_metadata.head()

Unnamed: 0,age_approx,clin_size_long_diam_mm,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,tbp_lv_Bext,tbp_lv_C,tbp_lv_Cext,tbp_lv_H,tbp_lv_Hext,...,log_lesion_area,normalized_lesion_size,age_normalized_nevi_confidence,age_size_symmetry_index,sex,tbp_lv_location_simple,age_group,target,isic_id,patient_id
0,0.147966,-0.511069,0.067613,0.380443,-0.2575,-0.659882,-0.192995,-0.407543,-0.288931,-0.91762,...,-0.813263,-3.453968,-6.275529,-0.171549,1.0,4.0,41-60,0,ISIC_0015670,IP_1235828
1,0.147966,-1.62405,2.935019,2.959583,-0.369545,-0.527265,1.126869,0.906538,-2.708061,-3.006591,...,-1.547855,-10.97585,-6.275957,0.040567,1.0,0.0,41-60,0,ISIC_0015845,IP_8170065
2,0.147966,-0.304536,0.65054,0.625867,1.835454,1.466075,1.644677,1.385391,0.835409,0.339647,...,-0.787368,-2.058154,-6.275909,-0.01985,1.0,5.0,41-60,0,ISIC_0015864,IP_6724798
3,0.516904,-0.407803,-1.433104,-0.780446,-1.294561,-1.291919,-1.583639,-1.355725,0.318926,-0.166282,...,-0.293236,-0.788934,-0.770968,0.163935,1.0,6.0,61-80,0,ISIC_0015902,IP_4111386
4,-0.220972,-0.688917,1.188031,1.455843,-0.344179,-0.268249,0.250792,0.349419,-1.396095,-1.590171,...,-1.094097,3.117664,4.202313,0.008048,1.0,6.0,41-60,0,ISIC_0024200,IP_8313778


In [29]:
# Save the preprocessed data
train_metadata.to_csv(f'{BASE_PATH}/train_processed.csv', index=False)