In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
# 1. Load the training data and preprocess it
csv_file_path = './input/train.csv'
df = pd.read_csv(csv_file_path)

# Replace every 'WT' cell with 0 across the whole frame
df = df.replace('WT', 0)

# Keep 'SUBCLASS' as the target; the features are everything except the
# 'ID' and 'SUBCLASS' columns
y = df['SUBCLASS']
X = df.drop(['ID', 'SUBCLASS'], axis=1)

# 데이터 타입 확인 및 혼합된 열 처리
def convert_mixed_columns_to_string(X):
    """Cast every object-dtype column of ``X`` to strings, in place.

    Columns whose dtype is not ``object`` are left untouched. The same
    DataFrame that was passed in is returned.
    """
    object_columns = [name for name in X.columns if X[name].dtype == 'object']
    for name in object_columns:
        X[name] = X[name].astype(str)
    return X

X = convert_mixed_columns_to_string(X)

# Encode every remaining non-numeric column as integer category codes and
# keep the fitted encoder per column so the mapping can be reused later.
# The test is "not numeric" rather than "dtype == 'object'" because newer
# pandas releases (3.0+) store strings -- including the result of
# astype(str) -- under a dedicated string dtype that does not compare equal
# to 'object'. Presumably every other column loaded from the CSV is
# numeric/bool, in which case the same columns are selected on older pandas.
label_encoders = {}
for col in X.columns:
    if not pd.api.types.is_numeric_dtype(X[col]):
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le

# Encode the SUBCLASS target as integer class ids as well
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


# 2. Train/test split (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# 3. 특성 선택 방법 적용 및 성능 평가 함수
def evaluate_features(X_train, X_test, y_train, y_test, selected_features):
    """Train on the selected features only and report the test accuracy.

    A 100-tree random forest (``random_state=42``) is fitted on
    ``X_train[selected_features]`` and scored on
    ``X_test[selected_features]``. The accuracy is printed and returned.
    """
    model = RandomForestClassifier(random_state=42, n_estimators=100).fit(
        X_train[selected_features], y_train
    )
    predictions = model.predict(X_test[selected_features])
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy with selected features: {accuracy}")
    return accuracy

## 방법 1: 통계 기반 특성 선택 (ANOVA)
def select_kbest_anova(X_train, X_test, y_train, y_test, k=50):
    """Select the ``k`` features with the highest ANOVA F-scores.

    ``SelectKBest(f_classif)`` is fitted on the training data only; the
    names of the chosen columns are printed and returned.

    Args:
        X_train: Training feature DataFrame; its column labels name the
            features.
        X_test: Unused; accepted so every selector shares one signature.
        y_train: Training targets passed to the selector.
        y_test: Unused; accepted so every selector shares one signature.
        k: Number of features to keep, or ``"all"``. An integer larger than
            the number of columns is clamped to the number of columns.

    Returns:
        The column labels of ``X_train`` chosen by the selector.
    """
    # Clamp so a narrow feature matrix keeps every feature instead of
    # tripping the selector's out-of-range handling for k.
    if k != "all":
        k = min(k, X_train.shape[1])
    anova_selector = SelectKBest(score_func=f_classif, k=k)
    # Only the support mask is needed, so fit without transforming the
    # train/test matrices (the transformed arrays were never used).
    anova_selector.fit(X_train, y_train)
    selected_feature_indices = anova_selector.get_support(indices=True)
    selected_feature_names = X_train.columns[selected_feature_indices]
    print(f"Selected features (ANOVA): {selected_feature_names}")
    return selected_feature_names

## 방법 2: Lasso (L1 Regularization)
def select_lasso(X_train, X_test, y_train, y_test, alpha=0.01, max_iter=1000):
    """Select the features that receive a non-zero Lasso coefficient.

    The training features are standardised, a ``Lasso`` model is fitted on
    them, and the columns whose coefficient is not exactly zero are printed
    and returned.

    NOTE(review): ``Lasso`` is a regression model, so ``y_train`` is treated
    as a numeric target. In this file the caller passes label-encoded class
    ids, whose numeric order carries no meaning -- confirm this is intended.

    Args:
        X_train: Training feature DataFrame; its column labels name the
            features.
        X_test: Unused; accepted so every selector shares one signature.
        y_train: Training targets the Lasso model is fitted against.
        y_test: Unused; accepted so every selector shares one signature.
        alpha: L1 regularisation strength passed to ``Lasso``.
        max_iter: Iteration cap passed to ``Lasso``. The default matches
            scikit-learn's own default; raise it if the solver warns that
            it did not converge.

    Returns:
        The column labels of ``X_train`` with a non-zero coefficient.
    """
    # Only the training matrix is needed to fit the model; the scaled test
    # matrix was computed but never used.
    X_train_scaled = StandardScaler().fit_transform(X_train)
    lasso = Lasso(alpha=alpha, max_iter=max_iter)
    lasso.fit(X_train_scaled, y_train)
    selected_features = X_train.columns[lasso.coef_ != 0]
    print(f"Selected features (Lasso): {selected_features}")
    return selected_features

## 방법 3: 랜덤 포레스트 기반 특성 중요도
def select_random_forest(X_train, X_test, y_train, y_test, k=50):
    """Select the ``k`` features a random forest ranks as most important.

    A 100-tree ``RandomForestClassifier`` (``random_state=42``) is fitted on
    the training data and its ``feature_importances_`` are used to rank the
    columns. The chosen column names are printed and returned.

    Args:
        X_train: Training feature DataFrame; its column labels name the
            features.
        X_test: Unused; accepted so every selector shares one signature.
        y_train: Training targets.
        y_test: Unused; accepted so every selector shares one signature.
        k: Number of top-ranked features to keep (default 50, the value
            that used to be hard-coded). Values outside
            ``[0, number of columns]`` are clamped into that range.

    Returns:
        The selected column labels, ordered from least to most important
        within the selection.
    """
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    importances = rf.feature_importances_
    # Clamp k so that k=0 yields an empty selection (a bare [-0:] slice
    # would return every feature) and an oversized k keeps all features.
    k = max(0, min(k, len(importances)))
    indices = np.argsort(importances)[len(importances) - k:]
    selected_features = X_train.columns[indices]
    print(f"Selected features (Random Forest): {selected_features}")
    return selected_features

## 방법 4: PCA (주성분 분석)
def apply_pca(X_train, X_test, n_components=50, random_state=42):
    """Standardise the features and project them onto principal components.

    The scaler and the PCA are both fitted on the training data only and
    then applied to the test data. The explained variance ratio of each
    component is printed.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix with the same columns as ``X_train``.
        n_components: Passed to ``PCA`` as the number of components to keep.
        random_state: Seed passed to ``PCA``. It is fixed by default,
            like the other estimators in this file, so that runs are
            reproducible when PCA uses a randomised SVD solver.

    Returns:
        A ``(X_train_pca, X_test_pca)`` tuple holding the transformed
        training and test data.
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    pca = PCA(n_components=n_components, random_state=random_state)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)
    print(f"Explained variance ratio (PCA): {pca.explained_variance_ratio_}")
    return X_train_pca, X_test_pca

# Example run and performance evaluation: each selection method is applied to
# the training split and then scored with evaluate_features on the test split.

# ANOVA approach
selected_features_anova = select_kbest_anova(X_train, X_test, y_train, y_test)
evaluate_features(X_train, X_test, y_train, y_test, selected_features_anova)

# Lasso approach
selected_features_lasso = select_lasso(X_train, X_test, y_train, y_test)
evaluate_features(X_train, X_test, y_train, y_test, selected_features_lasso)

# Random Forest approach
selected_features_rf = select_random_forest(X_train, X_test, y_train, y_test)
evaluate_features(X_train, X_test, y_train, y_test, selected_features_rf)

# PCA reduces dimensionality instead of picking original columns, so it cannot
# be scored through evaluate_features (which indexes columns by name); a
# random forest is trained directly on the projected components instead.
X_train_pca, X_test_pca = apply_pca(X_train, X_test)
rf_pca = RandomForestClassifier(n_estimators=100, random_state=42)
rf_pca.fit(X_train_pca, y_train)
y_pred_pca = rf_pca.predict(X_test_pca)
accuracy_pca = accuracy_score(y_test, y_pred_pca)
print(f"Accuracy with PCA components: {accuracy_pca}")


  df.replace('WT', 0, inplace=True)
  357  358  359  360  361  362  363  364  365  366  470  561  570  571
  572  600  601  602  733  856  896  943  948 1007 1058 1099 1140 1167
 1229 1233 1286 1291 1320 1337 1349 1427 1510 1521 1577 1639 1661 1664
 1674 1679 1702 1703 1707 1716 1717 1718 1719 1720 1721 1722 1723 1724
 1725 1726 1738 1873 1876 2012 2072 2077 2152 2153 2185 2214 2278 2304
 2328 2329 2330 2331 2394 2395 2396 2400 2415 2445 2472 2478 2489 2525
 2620 2621 2622 2623 2661 2662 2665 2691 2739 2764 2815 2829 2906 2952
 3005 3011 3031 3032 3071 3138 3159 3179 3240 3257 3263 3294 3325 3343
 3362 3391 3477 3526 3533 3534 3535 3538 3541 3542 3609 3687 3696 3697
 3794 3810 3824 3865 3934 3936 3937 3964 3965 3966 4025 4097 4101 4292
 4296 4332] are constant.
  f = msb / msw


Selected features (ANOVA): Index(['ABCC8', 'ALMS1', 'APC', 'ATRX', 'BRAF', 'BTG1', 'C8B', 'CDKN2A', 'CKB',
       'COL11A1', 'CTNNB1', 'DCC', 'DPYD', 'FBXW7', 'IDH1', 'IDUA', 'KMT2D',
       'LRIG1', 'MXRA5', 'MYH1', 'MYH2', 'MYH4', 'MYH8', 'NFKB2', 'NPM1',
       'NUDT19', 'OSMR', 'PCLO', 'PEX6', 'PGLS', 'PIK3CA', 'PLCB4', 'PTEN',
       'PTGIR', 'PTPRD', 'RELN', 'RYR1', 'RYR2', 'SAMD9', 'SCN10A', 'SCN9A',
       'SOWAHC', 'SPTA1', 'SRD5A1', 'SYNE1', 'THEM4', 'TM7SF2', 'TP53', 'TP63',
       'VHL'],
      dtype='object')
Accuracy with selected features: 0.273972602739726


  model = cd_fast.enet_coordinate_descent(


Selected features (Lasso): Index(['A2M', 'AAAS', 'ABAT', 'ABCA1', 'ABCA4', 'ABCA5', 'ABCA8', 'ABCA9',
       'ABCB1', 'ABCB4',
       ...
       'ZNF185', 'ZNF277', 'ZNF365', 'ZNF639', 'ZNF707', 'ZNFX1', 'ZNRF4',
       'ZPBP', 'ZW10', 'ZYX'],
      dtype='object', length=2893)
Accuracy with selected features: 0.273972602739726
Selected features (Random Forest): Index(['FBN1', 'COL12A1', 'MTOR', 'AHNAK', 'PTPRD', 'RELN', 'COL6A3', 'SCN10A',
       'FBN2', 'PEG3', 'DOCK2', 'COL11A1', 'TG', 'MYH2', 'PABPC1', 'LAMA1',
       'PKHD1', 'MYH4', 'FBXW7', 'NOTCH1', 'PLEC', 'DMD', 'DST', 'HRAS', 'KIT',
       'NF1', 'MXRA5', 'IDH2', 'RYR1', 'SPTA1', 'CDKN2A', 'KMT2D', 'MAP3K1',
       'RYR2', 'SPOP', 'SYNE1', 'PCLO', 'EGFR', 'CTNNB1', 'GATA3', 'CDH1',
       'NPM1', 'PTEN', 'ATRX', 'APC', 'PIK3CA', 'TP53', 'VHL', 'IDH1', 'BRAF'],
      dtype='object')
Accuracy with selected features: 0.27961321514907334
Explained variance ratio (PCA): [0.05730761 0.01007562 0.00947473 0.00927449 0.00851876 0.00