In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

train_df = pd.read_csv('./input/train.csv')
test_df = pd.read_csv('./input/test.csv')


In [None]:
train_df

In [None]:

# Assuming the target variable is named 'label'
X_train = train_df.drop(columns=['SUBCLASS'])
y_train = train_df['SUBCLASS']
X_test = test_df
y_test = test_df['SUBCLASS']


In [None]:

# Step 1: Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


In [None]:

# Step 2: Standardize the data
scaler = StandardScaler()
X_train_res_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# Step 3: Dimensionality Reduction using PCA
pca = PCA(n_components=50)  # Adjust the number of components as needed
X_train_pca = pca.fit_transform(X_train_res_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Step 4: Train LightGBM model
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train_pca, y_train_res)

# Evaluate the model
y_pred = lgbm.predict(X_test_pca)
print(classification_report(y_test, y_pred))

# Step 5: Hyperparameter tuning (Optional)
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'num_leaves': [31, 50]
}
grid_search = GridSearchCV(LGBMClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train_pca, y_train_res)

# Best model evaluation
best_lgbm = grid_search.best_estimator_
y_pred_best = best_lgbm.predict(X_test_pca)
print(classification_report(y_test, y_pred_best))


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import classification_report

# Load the dataset
train_sample_path = './input/train.csv'
train_df = pd.read_csv(train_sample_path)

# Step 1: Encode the 'SUBCLASS' column (target labels)
le = LabelEncoder()
train_df['SUBCLASS'] = le.fit_transform(train_df['SUBCLASS'])

# Step 2: Convert mutation columns from 'WT' and mutation strings to binary (0 for WT, 1 for mutation)
mutation_cols = train_df.columns[2:]  # Exclude 'ID' and 'SUBCLASS'
train_df[mutation_cols] = train_df[mutation_cols].applymap(lambda x: 0 if x == 'WT' else 1)

# Step 3: Standardize the mutation features
scaler = StandardScaler()
train_df[mutation_cols] = scaler.fit_transform(train_df[mutation_cols])

# Step 4: Dimensionality Reduction using PCA (Optional, but recommended for high-dimensional data)
pca = PCA(n_components=100)  # Reduce to 100 components (tune this number as needed)
X_pca = pca.fit_transform(train_df[mutation_cols])


In [None]:

# Step 5: Prepare final dataset
X = X_pca  # Features after PCA
y = train_df['SUBCLASS']  # Target labels

# Step 6: Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Build the MLP model
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))  # Input and first hidden layer
model.add(Dropout(0.5))  # Dropout to prevent overfitting
model.add(Dense(64, activation='relu'))  # Second hidden layer
model.add(Dropout(0.3))  # Dropout layer
model.add(Dense(32, activation='relu'))  # Third hidden layer
model.add(Dense(len(le.classes_), activation='softmax'))  # Output layer (softmax for multiclass)


In [3]:

# Step 8: Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:

# Step 9: Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))



In [None]:

# Step 10: Evaluate the model
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)


In [None]:

# Step 11: Print the classification report
print(classification_report(y_test, y_pred_classes, target_names=le.classes_))

# Optional: Plot the training and validation accuracy and loss
import matplotlib.pyplot as plt

# Plot accuracy
plt.plot(history.history['accuracy'], label='train accuracy')
plt.plot(history.history['val_accuracy'], label='validation accuracy')
plt.legend()
plt.title('Training and Validation Accuracy')
plt.show()

# Plot loss
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='validation loss')
plt.legend()
plt.title('Training and Validation Loss')
plt.show()

In [None]:
train_df

In [None]:
# Reload the training dataset to get the mutation columns if necessary
train_sample_path = './input/train.csv'
train_df = pd.read_csv(train_sample_path)

# Step 1: Define the mutation columns (excluding 'ID' and 'SUBCLASS')
mutation_cols = train_df.columns[2:]

# Step 2: Load the test dataset
test_path = './input/test.csv'
test_df = pd.read_csv(test_path)

# Step 3: Encode the mutation columns from 'WT' and mutation strings to binary (0 for WT, 1 for mutation)
test_df[mutation_cols] = test_df[mutation_cols].applymap(lambda x: 0 if x == 'WT' else 1)

# Step 4: Standardize the mutation features (using the same scaler fit on training data)
test_df[mutation_cols] = scaler.transform(test_df[mutation_cols])

# Step 5: Dimensionality Reduction using PCA (using the same PCA fit on training data)
X_test_pca = pca.transform(test_df[mutation_cols])

# Step 6: Predict using the trained model
y_test_pred = model.predict(X_test_pca)
y_test_pred_classes = y_test_pred.argmax(axis=1)

# Step 7: Decode the predicted class labels back to their original subclass names
y_test_pred_labels = le.inverse_transform(y_test_pred_classes)

# Step 8: Create a DataFrame to store the predictions
test_df['Predicted_SUBCLASS'] = y_test_pred_labels

# Step 9: Display the first few rows of predictions
test_df[['ID', 'Predicted_SUBCLASS']].head()



In [17]:
test_df[['ID', 'Predicted_SUBCLASS']].to_csv('submit01.csv',index=False)
test_df[['Predicted_SUBCLASS']].value_counts()

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load dataset
train_sample_path = './input/train.csv'
train_df = pd.read_csv(train_sample_path)

# Step 1: Encode the 'SUBCLASS' column (target labels)
le = LabelEncoder()
train_df['SUBCLASS'] = le.fit_transform(train_df['SUBCLASS'])

# Step 2: Convert mutation columns from 'WT' and mutation strings to binary (0 for WT, 1 for mutation)
mutation_cols = train_df.columns[2:]
train_df[mutation_cols] = train_df[mutation_cols].applymap(lambda x: 0 if x == 'WT' else 1)

# Step 3: Standardize the mutation features
scaler = StandardScaler()
train_df[mutation_cols] = scaler.fit_transform(train_df[mutation_cols])

# Step 4: Feature Selection - Variance Threshold
# Remove low variance features (those that do not vary much)
threshold = 0.01  # Adjust threshold based on data characteristics
selector = VarianceThreshold(threshold=threshold)
X_reduced = selector.fit_transform(train_df[mutation_cols])

print(f"Reduced feature set after Variance Threshold: {X_reduced.shape[1]} features")

# Step 5: Train XGBoost to find important features
X = X_reduced
y = train_df['SUBCLASS']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an XGBoost model
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# Get feature importances
feature_importances = xgb_model.feature_importances_
sorted_idx = feature_importances.argsort()

# Select top 2000 features based on importance
top_n = 2000  # Adjust this value to select the top N important features
X_train_reduced = X_train[:, sorted_idx[-top_n:]]
X_test_reduced = X_test[:, sorted_idx[-top_n:]]

# Step 6: Train a model on the reduced feature set (after selecting important features)
model = xgb.XGBClassifier(n_estimators=100, random_state=42)
model.fit(X_train_reduced, y_train)

# Step 7: Evaluate the model
y_pred = model.predict(X_test_reduced)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the reduced feature set: {accuracy}")

# Optional: Get feature importances
important_features = sorted_idx[-top_n:]
print(f"Selected Top {top_n} features: {important_features}")


In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Load dataset
train_sample_path = './input/train.csv'
train_df = pd.read_csv(train_sample_path)

# Step 1: Encode the 'SUBCLASS' column (target labels)
le = LabelEncoder()
train_df['SUBCLASS'] = le.fit_transform(train_df['SUBCLASS'])

# Step 2: Convert mutation columns from 'WT' and mutation strings to binary (0 for WT, 1 for mutation)
mutation_cols = train_df.columns[2:]
train_df[mutation_cols] = train_df[mutation_cols].applymap(lambda x: 0 if x == 'WT' else 1)

# Step 3: Standardize the mutation features
scaler = StandardScaler()
train_df[mutation_cols] = scaler.fit_transform(train_df[mutation_cols])

# Step 4: Split the data into training and testing sets
X = train_df[mutation_cols]
y = train_df['SUBCLASS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train an XGBoost model to find important features
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# Get feature importances
feature_importances = xgb_model.feature_importances_
sorted_idx = feature_importances.argsort()

# Select top 2000 features based on importance
top_n = 2000  # Adjust this value to select the top N important features
top_features_idx = sorted_idx[-top_n:]

# Step 6: Reduce training and testing data to the top N features
X_train_reduced = X_train.iloc[:, top_features_idx]
X_test_reduced = X_test.iloc[:, top_features_idx]

# Step 7: Train a new XGBoost model on the reduced feature set
best_xgb = xgb.XGBClassifier(n_estimators=100, random_state=42)
best_xgb.fit(X_train_reduced, y_train)

# Step 8: Evaluate the model on the test set
y_pred = best_xgb.predict(X_test_reduced)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the reduced feature set: {accuracy}")

# Step 9: Load test.csv for prediction
test_path = './input/test.csv'
test_df = pd.read_csv(test_path)

# Preprocess the test data (same as train data preprocessing)
test_df[mutation_cols] = test_df[mutation_cols].applymap(lambda x: 0 if x == 'WT' else 1)
test_df[mutation_cols] = scaler.transform(test_df[mutation_cols])

# Reduce test data to the top N important features
X_test_final = test_df[mutation_cols].iloc[:, top_features_idx]

# Step 10: Predict using the trained model
y_test_pred = best_xgb.predict(X_test_final)

# Step 11: Decode the predicted class labels back to their original subclass names
y_test_pred_labels = le.inverse_transform(y_test_pred)

# Step 12: Store the predictions in the test DataFrame
test_df['Predicted_SUBCLASS'] = y_test_pred_labels

# Display the first few rows of predictions
print(test_df[['ID', 'Predicted_SUBCLASS']].head())


FileNotFoundError: [Errno 2] No such file or directory: './input/train_sample.csv'

0.239796172

In [None]:
test_df[['ID', 'Predicted_SUBCLASS']].to_csv('submit02.csv',index=False)
test_df[['Predicted_SUBCLASS']].value_counts()

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score

# 데이터 로드 및 전처리 (이전 코드와 동일)
train_sample_path = '/mnt/data/train_sample.csv'
train_df = pd.read_csv(train_sample_path)

# Step 1: Encode the 'SUBCLASS' column (target labels)
le = LabelEncoder()
train_df['SUBCLASS'] = le.fit_transform(train_df['SUBCLASS'])

# Step 2: Convert mutation columns from 'WT' and mutation strings to binary (0 for WT, 1 for mutation)
mutation_cols = train_df.columns[2:]
train_df[mutation_cols] = train_df[mutation_cols].applymap(lambda x: 0 if x == 'WT' else 1)

# Step 3: Standardize the mutation features
scaler = StandardScaler()
train_df[mutation_cols] = scaler.fit_transform(train_df[mutation_cols])

# Step 4: Split the data into training and testing sets
X = train_df[mutation_cols]
y = train_df['SUBCLASS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost 기본 모델 설정
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# GridSearchCV를 사용한 하이퍼파라미터 튜닝
param_grid = {
    'n_estimators': [100, 200, 300],  # 트리 개수
    'learning_rate': [0.01, 0.1, 0.2],  # 학습률
    'max_depth': [3, 5, 7],  # 트리의 최대 깊이
    'min_child_weight': [1, 3, 5],  # 리프 노드에서 필요한 최소 가중치 합
    'subsample': [0.6, 0.8, 1.0],  # 샘플링 비율
    'colsample_bytree': [0.6, 0.8, 1.0]  # 각 트리에서 사용할 특징의 비율
}

# GridSearchCV를 사용해 최적의 하이퍼파라미터 탐색
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# 최적의 하이퍼파라미터와 정확도 출력
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best accuracy: {grid_search.best_score_}")

# 최적의 하이퍼파라미터로 모델을 다시 학습
best_xgb = grid_search.best_estimator_
best_xgb.fit(X_train, y_train)

# 테스트 세트에서 모델 평가
y_pred = best_xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {accuracy}")


In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import numpy as np

# 데이터 로드 및 전처리 (이전 코드와 동일)
train_sample_path = './input/train.csv'
train_df = pd.read_csv(train_sample_path)

# Step 1: 'ID' 컬럼 제거 (학습에 불필요한 열)
train_df = train_df.drop(columns=['ID'])

# Step 2: Encode the 'SUBCLASS' column (target labels)
le = LabelEncoder()
train_df['SUBCLASS'] = le.fit_transform(train_df['SUBCLASS'])

# Step 3: Convert mutation columns from 'WT' and mutation strings to binary (0 for WT, 1 for mutation)
mutation_cols = train_df.columns[1:]  # 'SUBCLASS' 이후 모든 열이 변이 정보
train_df[mutation_cols] = train_df[mutation_cols].applymap(lambda x: 0 if x == 'WT' else 1)

# Step 4: Standardize the mutation features
scaler = StandardScaler()
train_df[mutation_cols] = scaler.fit_transform(train_df[mutation_cols])

# Step 5: 상관계수 기반으로 특징 선택 (상관계수가 낮은 특징 제거)
correlation_matrix = np.abs(train_df.corr())
correlation_with_target = correlation_matrix['SUBCLASS'].drop('SUBCLASS')
top_features = correlation_with_target[correlation_with_target > 0.1].index
X = train_df[top_features]
y = train_df['SUBCLASS']

# Step 6: SMOTE 적용하여 클래스 불균형 해결
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 7: 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Step 8: XGBoost 모델 생성 (클래스 가중치 적용)
xgb_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, scale_pos_weight=1.5, random_state=42)
xgb_model.fit(X_train, y_train)

# Step 9: 모델 평가
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

# Step 10: Load test.csv for prediction
test_path = './input/test.csv'
test_df = pd.read_csv(test_path)

# Step 11: 'ID' 컬럼 제거 후 테스트 데이터 전처리
test_df = test_df.drop(columns=['ID'])
test_df[mutation_cols] = test_df[mutation_cols].applymap(lambda x: 0 if x == 'WT' else 1)
test_df[mutation_cols] = scaler.transform(test_df[mutation_cols])

# Step 12: 테스트 데이터에서도 상위 중요 피처만 선택
X_test_final = test_df[top_features]

# Step 13: Predict using the trained model
y_test_pred = xgb_model.predict(X_test_final)

# Step 14: Decode the predicted class labels back to their original subclass names
y_test_pred_labels = le.inverse_transform(y_test_pred)

# Step 15: Store the predictions in the test DataFrame
test_df['Predicted_SUBCLASS'] = y_test_pred_labels

# Display the first few rows of predictions
print(test_df[['Predicted_SUBCLASS']].head())


In [None]:
test_df[['ID', 'Predicted_SUBCLASS']].to_csv('submit03.csv',index=False)
test_df[['Predicted_SUBCLASS']].value_counts()

In [None]:
test_df[['Predicted_SUBCLASS']].value_counts()

In [27]:
test_df[['Predicted_SUBCLASS']].to_csv('submit03.csv')

In [None]:

root_test = pd.read_csv(test_path)
pd.concat(root_test['ID'],test_df[['Predicted_SUBCLASS']], axis=1)

In [None]:
# Load original test dataset to retrieve 'ID' column
root_test = pd.read_csv(test_path)

# Ensure 'ID' is preserved as a DataFrame
id_column = root_test[['ID']]

# Concatenate the 'ID' column with the predicted results
final_result = pd.concat([id_column, test_df[['Predicted_SUBCLASS']]], axis=1)

# Display the first few rows of the final result
print(final_result.head())


In [31]:
final_result.to_csv('Submit03.csv', index=False)

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# 데이터 로드 및 전처리
train_sample_path = './input/train.csv'
train_df = pd.read_csv(train_sample_path)

# Step 1: 'ID' 컬럼 제거 (학습에 불필요한 열)
train_df = train_df.drop(columns=['ID'])

# Step 2: Encode the 'SUBCLASS' column (target labels)
le = LabelEncoder()
train_df['SUBCLASS'] = le.fit_transform(train_df['SUBCLASS'])

# Step 3: Convert mutation columns from 'WT' and mutation strings to binary (0 for WT, 1 for mutation)
mutation_cols = train_df.columns[1:]  # 'SUBCLASS' 이후 모든 열이 변이 정보
train_df[mutation_cols] = train_df[mutation_cols].applymap(lambda x: 0 if x == 'WT' else 1)

# Step 4: Standardize the mutation features
scaler = StandardScaler()
train_df[mutation_cols] = scaler.fit_transform(train_df[mutation_cols])

# Step 5: 상관계수 기반으로 특징 선택 (상관계수가 낮은 특징 제거)
correlation_matrix = abs(train_df.corr())
correlation_with_target = correlation_matrix['SUBCLASS'].drop('SUBCLASS')
top_features = correlation_with_target[correlation_with_target > 0.1].index

# 피처 수를 줄임: 상위 1000개 또는 500개 피처만 사용
top_n_features = 1000  # 조정 가능
top_features = correlation_with_target.sort_values(ascending=False).head(top_n_features).index

X = train_df[top_features]
y = train_df['SUBCLASS']

# Step 6: 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: XGBoost 모델 생성 및 하이퍼파라미터 튜닝
xgb_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.05, max_depth=5, subsample=0.8, colsample_bytree=0.8, random_state=42)
xgb_model.fit(X_train, y_train)

# Step 8: 모델 평가
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with reduced features: {accuracy}")
print(classification_report(y_test, y_pred))

# 테스트 데이터에 대해 예측
test_path = './input/test.csv'
test_df = pd.read_csv(test_path)

# Step 9: 'ID' 컬럼 제거 후 테스트 데이터 전처리
test_df = test_df.drop(columns=['ID'])
test_df[mutation_cols] = test_df[mutation_cols].applymap(lambda x: 0 if x == 'WT' else 1)
test_df[mutation_cols] = scaler.transform(test_df[mutation_cols])

# 테스트 데이터에서도 상위 중요한 피처만 선택
X_test_final = test_df[top_features]

# Step 10: Predict using the trained model
y_test_pred = xgb_model.predict(X_test_final)

# Step 11: Decode the predicted class labels back to their original subclass names
y_test_pred_labels = le.inverse_transform(y_test_pred)

# Step 12: Store the predictions in the test DataFrame
test_df['Predicted_SUBCLASS'] = y_test_pred_labels

# Display the first few rows of predictions
print(test_df[['Predicted_SUBCLASS']].head())


In [None]:
test_df[['Predicted_SUBCLASS']].value_counts()

In [None]:
test_df[['ID','Predicted_SUBCLASS']].to_csv('submit04.csv', index=False)

In [None]:
# Load original test dataset to retrieve 'ID' column
root_test = pd.read_csv(test_path)

# Ensure 'ID' is preserved as a DataFrame
id_column = root_test[['ID']]

# Concatenate the 'ID' column with the predicted results
final_result = pd.concat([id_column, test_df[['Predicted_SUBCLASS']]], axis=1)

# Display the first few rows of the final result
print(final_result.head())


In [37]:
final_result[['ID','Predicted_SUBCLASS']].to_csv('submit04.csv', index=False)

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report

# 데이터 로드 및 전처리
train_sample_path = './input/train.csv'
train_df = pd.read_csv(train_sample_path)

# Step 1: 'ID' 컬럼 제거 (학습에 불필요한 열)
train_df = train_df.drop(columns=['ID'])

# Step 2: Encode the 'SUBCLASS' column (target labels)
le = LabelEncoder()
train_df['SUBCLASS'] = le.fit_transform(train_df['SUBCLASS'])

# Step 3: Convert mutation columns from 'WT' and mutation strings to binary (0 for WT, 1 for mutation)
mutation_cols = train_df.columns[1:]  # 'SUBCLASS' 이후 모든 열이 변이 정보
train_df[mutation_cols] = train_df[mutation_cols].applymap(lambda x: 0 if x == 'WT' else 1)

# Step 4: Standardize the mutation features
scaler = StandardScaler()
train_df[mutation_cols] = scaler.fit_transform(train_df[mutation_cols])

# Step 5: 상관계수 기반으로 특징 선택 (상관계수가 낮은 특징 제거)
correlation_matrix = abs(train_df.corr())
correlation_with_target = correlation_matrix['SUBCLASS'].drop('SUBCLASS')
top_n_features = 2000  # 조정 가능
top_features = correlation_with_target.sort_values(ascending=False).head(top_n_features).index

X = train_df[top_features]
y = train_df['SUBCLASS']

# Step 6: 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: LightGBM 모델 생성 및 학습
lgb_model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.05, max_depth=5, subsample=0.8, colsample_bytree=0.8, random_state=42)
lgb_model.fit(X_train, y_train)

# Step 8: 모델 평가
y_pred = lgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with LightGBM: {accuracy}")
print(classification_report(y_test, y_pred))



In [None]:
# 테스트 데이터에 대해 예측
test_path = './input/test.csv'
test_df = pd.read_csv(test_path)

# Step 9: 'ID' 컬럼 제거 후 테스트 데이터 전처리
test_df = test_df.drop(columns=['ID'])
test_df[mutation_cols] = test_df[mutation_cols].applymap(lambda x: 0 if x == 'WT' else 1)
test_df[mutation_cols] = scaler.transform(test_df[mutation_cols])

# 테스트 데이터에서도 상위 중요한 피처만 선택
X_test_final = test_df[top_features]

# Step 10: Predict using the trained model
y_test_pred = lgb_model.predict(X_test_final)

# Step 11: Decode the predicted class labels back to their original subclass names
y_test_pred_labels = le.inverse_transform(y_test_pred)

# Step 12: Store the predictions in the test DataFrame
test_df['Predicted_SUBCLASS'] = y_test_pred_labels

# Display the first few rows of predictions
print(test_df[['Predicted_SUBCLASS']].head())


In [None]:
# Load original test dataset to retrieve 'ID' column
root_test = pd.read_csv(test_path)

# Ensure 'ID' is preserved as a DataFrame
id_column = root_test[['ID']]

# Concatenate the 'ID' column with the predicted results
final_result = pd.concat([id_column, test_df[['Predicted_SUBCLASS']]], axis=1)

# Display the first few rows of the final result
print(final_result.head())


In [42]:
final_result[['ID','Predicted_SUBCLASS']].to_csv('submit04.csv', index=False)

In [None]:
final_result['Predicted_SUBCLASS'].value_counts()

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# 데이터 로드 및 전처리
train_sample_path = './input/train.csv'
train_df = pd.read_csv(train_sample_path)

# Step 1: 'ID' 컬럼 제거 (학습에 불필요한 열)
train_df = train_df.drop(columns=['ID'])

# Step 2: Encode the 'SUBCLASS' column (target labels)
le = LabelEncoder()
train_df['SUBCLASS'] = le.fit_transform(train_df['SUBCLASS'])

# Step 3: Convert mutation columns from 'WT' and mutation strings to binary (0 for WT, 1 for mutation)
mutation_cols = train_df.columns[1:]  # 'SUBCLASS' 이후 모든 열이 변이 정보
train_df[mutation_cols] = train_df[mutation_cols].applymap(lambda x: 0 if x == 'WT' else 1)

# Step 4: Standardize the mutation features
scaler = StandardScaler()
train_df[mutation_cols] = scaler.fit_transform(train_df[mutation_cols])


# Step 5: 상관계수 기반으로 특징 선택 (상관계수가 낮은 특징 제거)
correlation_matrix = abs(train_df.corr())
correlation_with_target = correlation_matrix['SUBCLASS'].drop('SUBCLASS')
top_n_features = 2000  # 조정 가능
top_features = correlation_with_target.sort_values(ascending=False).head(top_n_features).index

X = train_df[top_features]
y = train_df['SUBCLASS']

# Step 6: 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: 하이퍼파라미터 튜닝을 위한 GridSearchCV 설정
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}

xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# 최적의 하이퍼파라미터로 학습
best_xgb = grid_search.best_estimator_

# Step 8: 최적의 하이퍼파라미터로 학습된 모델 평가
y_pred = best_xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))


In [None]:

# Step 9: 테스트 데이터에 대한 예측
test_path = './input/test.csv'
test_df = pd.read_csv(test_path)

# Step 10: 'ID' 컬럼 제거 후 테스트 데이터 전처리
test_df = test_df.drop(columns=['ID'])
test_df[mutation_cols] = test_df[mutation_cols].applymap(lambda x: 0 if x == 'WT' else 1)
test_df[mutation_cols] = scaler.transform(test_df[mutation_cols])

# 테스트 데이터에서도 상위 중요한 피처만 선택
X_test_final = test_df[top_features]

# Step 11: Predict using the best model
y_test_pred = best_xgb.predict(X_test_final)

# Step 12: Decode the predicted class labels back to their original subclass names
y_test_pred_labels = le.inverse_transform(y_test_pred)

# Step 13: Store the predictions in the test DataFrame
test_df['Predicted_SUBCLASS'] = y_test_pred_labels

# Display the first few rows of predictions
print(test_df[['Predicted_SUBCLASS']].head())


In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# 데이터 로드 및 전처리
train_sample_path = './input/train.csv'
train_df = pd.read_csv(train_sample_path)

# Step 1: 'ID' 컬럼 제거 (학습에 불필요한 열)
train_df = train_df.drop(columns=['ID'])

# Step 2: Encode the 'SUBCLASS' column (target labels)
le_subclass = LabelEncoder()
train_df['SUBCLASS'] = le_subclass.fit_transform(train_df['SUBCLASS'])

# Step 3: 변이 값 처리 (WT는 0, 나머지는 LabelEncoder로 변환)
mutation_cols = train_df.columns[1:]  # 'SUBCLASS' 이후 모든 열이 변이 정보
le_mutations = LabelEncoder()

# WT는 0으로 변환하고, 나머지는 LabelEncoder로 변환
def encode_mutations(value):
    if value == 'WT':
        return 0
    else:
        return le_mutations.fit_transform([value])[0]

for col in mutation_cols:
    train_df[col] = train_df[col].apply(encode_mutations)

# Step 4: Standardize the mutation features
scaler = StandardScaler()
train_df[mutation_cols] = scaler.fit_transform(train_df[mutation_cols])

# Step 5: 상관계수 기반으로 특징 선택 (상관계수가 낮은 특징 제거)
correlation_matrix = abs(train_df.corr())
correlation_with_target = correlation_matrix['SUBCLASS'].drop('SUBCLASS')
top_n_features = 2000  # 조정 가능
top_features = correlation_with_target.sort_values(ascending=False).head(top_n_features).index

X = train_df[top_features]
y = train_df['SUBCLASS']

# Step 6: 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: 하이퍼파라미터 튜닝을 위한 GridSearchCV 설정
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}

xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='mlogloss')

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# 최적의 하이퍼파라미터로 학습
best_xgb = grid_search.best_estimator_

# Step 8: 최적의 하이퍼파라미터로 학습된 모델 평가
y_pred = best_xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 729 candidates, totalling 2187 fits




Optimized Accuracy: 0.1200644641418211
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        14
           1       0.00      0.00      0.00        27
           2       0.12      1.00      0.21       149
           3       0.00      0.00      0.00        31
           4       0.00      0.00      0.00        30
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00       116
           7       0.00      0.00      0.00        51
           8       0.00      0.00      0.00        86
           9       0.00      0.00      0.00        54
          10       0.00      0.00      0.00        39
          11       0.00      0.00      0.00        49
          12       0.00      0.00      0.00        34
          13       0.00      0.00      0.00        37
          14       0.00      0.00      0.00        39
          15       0.00      0.00      0.00        58
          16       0.00      0.00      0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [3]:

# Step 9: 테스트 데이터에 대한 예측
test_path = './input/test.csv'
test_df = pd.read_csv(test_path)

# Step 10: 'ID' 컬럼 제거 후 테스트 데이터 전처리
test_df = test_df.drop(columns=['ID'])
for col in mutation_cols:
    test_df[col] = test_df[col].apply(encode_mutations)
test_df[mutation_cols] = scaler.transform(test_df[mutation_cols])

# 테스트 데이터에서도 상위 중요한 피처만 선택
X_test_final = test_df[top_features]

# Step 11: Predict using the best model
y_test_pred = best_xgb.predict(X_test_final)

# Step 12: Decode the predicted class labels back to their original subclass names
y_test_pred_labels = le_subclass.inverse_transform(y_test_pred)

# Step 13: Store the predictions in the test DataFrame
test_df['Predicted_SUBCLASS'] = y_test_pred_labels

# Display the first few rows of predictions
print(test_df[['Predicted_SUBCLASS']].head())


  Predicted_SUBCLASS
0               BRCA
1               BRCA
2               BRCA
3               BRCA
4               BRCA


  test_df['Predicted_SUBCLASS'] = y_test_pred_labels


In [4]:
# Load original test dataset to retrieve 'ID' column
root_test = pd.read_csv(test_path)

# Ensure 'ID' is preserved as a DataFrame
id_column = root_test[['ID']]

# Concatenate the 'ID' column with the predicted results
final_result = pd.concat([id_column, test_df[['Predicted_SUBCLASS']]], axis=1)

# Display the first few rows of the final result
print(final_result.head())
final_result[['ID','Predicted_SUBCLASS']].to_csv('submit04.csv', index=False)

          ID Predicted_SUBCLASS
0  TEST_0000               BRCA
1  TEST_0001               BRCA
2  TEST_0002               BRCA
3  TEST_0003               BRCA
4  TEST_0004               BRCA


In [6]:
final_result[['Predicted_SUBCLASS']].value_counts()

Predicted_SUBCLASS
BRCA                  2546
Name: count, dtype: int64