# In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
import numpy as np

# path_ADNI = 'C:/Users/Ali/Code/TransTAB/TransTab-Repo3/Dataset3/data_processed.csv'
# path2_Credit = 'C:\Users\Ali\Code\TransTAB\TransTab-Repo3\credit-approval\data_processed.csv'

# Pipeline: load the processed CSV, impute missing values (KNN for numerical
# features, most-frequent for categorical ones), one-hot encode the categorical
# features, balance the classes with SMOTE, and save the result to a new CSV.

# Load your dataset
df = pd.read_csv(r'C:/Users/Ali/Code/TransTAB/TransTab-Repo3/Dataset3/data_processed.csv')

# Name of the label column. It is defined before anything else uses it so that
# changing it here is the only edit needed for a differently named target.
target = 'target_label'  # Update this if your target column has a different name

# Separate target and features
X = df.drop(columns=[target])
y = df[target]

# Identify numerical and categorical columns. They are selected from X (not df)
# so the target can never end up in either feature list, whatever its dtype.
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
# categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()

# KNNImputer drops features that are entirely missing, which would leave the
# imputed array narrower than numerical_columns; exclude such columns up front
# so names and data stay aligned when the DataFrame is rebuilt below.
numerical_columns = [col for col in numerical_columns if X[col].notna().any()]

# Encode the target as integers and fail early, with a readable message, if the
# column holds anything other than the two expected labels (unmapped values
# become NaN, which SMOTE would otherwise reject with a less helpful error).
label_to_int = {'AD': 1, 'CN': 0}
int_to_label = {code: label for label, code in label_to_int.items()}
y_encoded = y.map(label_to_int)
unmapped_mask = y_encoded.isna()
if unmapped_mask.any():
    raise ValueError(
        f"{int(unmapped_mask.sum())} row(s) in '{target}' are not one of "
        f"{sorted(label_to_int)}; found values: {y[unmapped_mask].unique().tolist()}"
    )

# Impute missing values for numerical data
knn_imputer = KNNImputer(n_neighbors=5)
if numerical_columns:
    X_numerical = knn_imputer.fit_transform(X[numerical_columns])
else:
    # The imputer refuses 0-feature input; use an empty block instead.
    X_numerical = np.empty((len(X), 0))

# Impute missing values for categorical data
simple_imputer = SimpleImputer(strategy='most_frequent')

# Encode categorical features. The dense-output keyword was renamed from
# `sparse` to `sparse_output` in scikit-learn 1.2 (and `sparse` was later
# removed), so try the new name first and fall back for older installs.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

if categorical_columns:
    X_categorical = simple_imputer.fit_transform(X[categorical_columns])
    X_categorical_encoded = encoder.fit_transform(X_categorical)
    encoded_columns = list(encoder.get_feature_names_out(categorical_columns))
else:
    # No categorical features: contribute zero columns rather than crashing.
    X_categorical = np.empty((len(X), 0), dtype=object)
    X_categorical_encoded = np.empty((len(X), 0))
    encoded_columns = []

# Combine numerical and encoded categorical features
X_combined = np.hstack((X_numerical, X_categorical_encoded))

# Apply SMOTE to the combined dataset to balance classes.
# random_state is fixed so the saved file is reproducible between runs.
# NOTE(review): SMOTE interpolates between neighbouring samples, so synthetic
# rows may carry fractional values in the one-hot columns. If strict 0/1
# indicators are required downstream, consider imbalanced-learn's SMOTENC.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_res, y_res = smote.fit_resample(X_combined, y_encoded)

# Convert resampled X_res back to a DataFrame
columns = numerical_columns + encoded_columns
resampled_df = pd.DataFrame(X_res, columns=columns)

# Add the resampled target back to your DataFrame. Going through a plain array
# re-indexed on resampled_df avoids depending on whatever index y_res carries.
resampled_df[target] = pd.Series(
    np.asarray(y_res), index=resampled_df.index
).map(int_to_label)


# Save the balanced dataset to a new CSV file
resampled_file_path = 'resampled_dataset.csv'
resampled_df.to_csv(resampled_file_path, index=False)

