In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, balanced_accuracy_score, roc_auc_score, make_scorer

In [2]:
dll = pd.read_csv('DLLs_Imported.csv')
dll = dll.iloc[:,1:]

In [13]:
X = dll.iloc[:,2:]
y = dll['Type']

In [14]:
X = X.to_numpy()
y = y.to_numpy()

In [19]:
np.unique(y)

array([0, 1, 2, 3, 4, 5, 6], dtype=int64)

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import StratifiedKFold
import numpy as np

# 데이터셋 클래스 정의
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# 신경망 모델 정의
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# 데이터셋 및 데이터로더 생성 함수
def create_dataloader(X, y, batch_size):
    dataset = CustomDataset(X, y)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return loader

# GPU 사용 여부 확인 및 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 데이터 및 모델 설정

input_size = 628  # 입력 특성의 개수
hidden_size = 128  # 은닉층의 크기
output_size = len(np.unique(y))  # 출력의 크기 (이진 분류의 경우 2)
batch_size = 32  # 배치 크기

# Stratified K-Fold 교차 검증을 이용하여 데이터를 분할
skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)
for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    print(f"Fold {fold+1}")
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # X_train = X_train.to_numpy()
    # X_test = X_test.to_numpy()
    # y_train = y_train.to_numpy()
    # y_test = y_test.to_numpy()


    train_loader = create_dataloader(X_train, y_train, batch_size)
    val_loader = create_dataloader(X_val, y_val, batch_size)

    # 모델 초기화 및 GPU로 이동
    model = SimpleNN(input_size, hidden_size, output_size).to(device)

    # 손실 함수 및 옵티마이저 설정
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # 학습
    num_epochs = 100
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)  # 데이터를 GPU로 이동
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}") if epoch%10 == 0 else None

    # 폴드별 검증 정확도 계산 등 추가 작업 수행 가능


Fold 1
Epoch 1, Loss: 1.139126307819646
Epoch 11, Loss: 0.8734306344190229
Epoch 21, Loss: 0.8632647809955173
Epoch 31, Loss: 0.8570050107016123
Epoch 41, Loss: 0.8538913385455135
Epoch 51, Loss: 0.8521940768443227
Epoch 61, Loss: 0.8505168033459998
Epoch 71, Loss: 0.8499151549658793
Epoch 81, Loss: 0.848803097695376
Epoch 91, Loss: 0.8478706127925106
Fold 2
Epoch 1, Loss: 1.1494617277542047
Epoch 11, Loss: 0.8718433653095127
Epoch 21, Loss: 0.8629931204689739
Epoch 31, Loss: 0.8573056002030933
Epoch 41, Loss: 0.8536831901588874
Epoch 51, Loss: 0.8512023060044507
Epoch 61, Loss: 0.8507606195363625
Epoch 71, Loss: 0.8482061929897171
Epoch 81, Loss: 0.849473301099917
Epoch 91, Loss: 0.8468996636597758
Fold 3
Epoch 1, Loss: 1.1471419566500503
Epoch 11, Loss: 0.8810268183499009
Epoch 21, Loss: 0.8679434070940566
Epoch 31, Loss: 0.8632686009675302
Epoch 41, Loss: 0.8600029456705872
Epoch 51, Loss: 0.8584673672952182
Epoch 61, Loss: 0.85689519780927
Epoch 71, Loss: 0.8546408981216692
Epoch 8

In [23]:
model.eval()  # 모델을 평가 모드로 설정
val_predictions = []
val_true_labels = []
with torch.no_grad():
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        val_predictions.extend(predicted.cpu().numpy())
        val_true_labels.extend(labels.cpu().numpy())
val_accuracy = accuracy_score(val_true_labels, val_predictions)
print(f"Fold {fold+1} Validation Accuracy: {val_accuracy}")

Fold 7 Validation Accuracy: 0.6165163739914571


In [7]:
from sklearn.ensemble import RandomForestClassifier
# StratifiedKFold 객체 생성
skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# 교차 검증 및 평가
scores = []
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    rf = RandomForestClassifier()

# Fit the model to the training data
    rf.fit(X_train, y_train)

# Predict on the test data
    y_pred = rf.predict(X_test)
    print(classification_report(y_test, y_pred))
# Evaluate the model
    score = accuracy_score(y_test, y_pred)
    scores.append(score)
    
print(f"Accuracy: {max(scores)}")


              precision    recall  f1-score   support

           0       0.92      0.35      0.51       269
           1       0.88      0.70      0.78       717
           2       0.94      0.91      0.93       663
           3       0.90      0.42      0.57       708
           4       0.66      0.75      0.70       726
           5       1.00      0.12      0.21       603
           6       0.31      0.97      0.46       528

    accuracy                           0.62      4214
   macro avg       0.80      0.60      0.60      4214
weighted avg       0.80      0.62      0.62      4214

              precision    recall  f1-score   support

           0       0.95      0.36      0.52       268
           1       0.86      0.69      0.76       718
           2       0.94      0.91      0.92       663
           3       0.91      0.39      0.55       708
           4       0.65      0.73      0.69       725
           5       0.99      0.12      0.22       604
           6       0.31 

In [18]:
# StratifiedKFold 객체 생성
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 교차 검증 및 평가
scores = []
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # SVM 모델 생성 및 학습
    svm = SVC(kernel='linear', probability=True)
    svm.fit(X_train, y_train)
    
    # 테스트 데이터 예측 및 평가
    y_pred = svm.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    scores.append(score)
    
    # 분류 보고서 출력
    

# 교차 검증 결과 출력
print(f"Stratified k-fold cross-validation scores: {scores}")
print(f"Mean accuracy: {sum(scores) / len(scores)}")

              precision    recall  f1-score   support

           0       0.88      0.32      0.47       376
           1       0.86      0.65      0.74      1004
           2       0.92      0.89      0.91       929
           3       0.90      0.38      0.54       991
           4       0.61      0.75      0.68      1015
           5       0.31      0.89      0.46       845
           6       0.00      0.00      0.00       740

    accuracy                           0.59      5900
   macro avg       0.64      0.56      0.54      5900
weighted avg       0.65      0.59      0.57      5900



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.97      0.37      0.53       376
           1       0.87      0.68      0.76      1004
           2       0.91      0.91      0.91       929
           3       0.92      0.37      0.53       991
           4       0.64      0.79      0.71      1015
           5       0.32      0.92      0.48       845
           6       0.00      0.00      0.00       740

    accuracy                           0.61      5900
   macro avg       0.66      0.58      0.56      5900
weighted avg       0.67      0.61      0.59      5900

              precision    recall  f1-score   support

           0       0.89      0.30      0.45       375
           1       0.88      0.65      0.75      1005
           2       0.93      0.89      0.91       928
           3       0.91      0.38      0.54       992
           4       0.64      0.79      0.71      1015
           5       0.31      0.88      0.46       845
           6       0.37 

In [19]:
from sklearn.naive_bayes import MultinomialNB

# StratifiedKFold 객체 생성
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 교차 검증 및 평가
nb_scores = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    

    # Naive Bayes 모델 생성 및 학습
    nb = MultinomialNB()
    nb.fit(X_train, y_train)
    
    # 테스트 데이터 예측 및 평가

    nb_pred = nb.predict(X_test)
    nb_score = accuracy_score(y_test, nb_pred)
    nb_scores.append(nb_score)

# 분류 보고서 출력
print("Naive Bayes Classification Report:")
print(classification_report(y_test, nb_pred))

# 교차 검증 결과 출력
print(f"Naive Bayes: {nb_scores}")
print(f"\nMean accuracy:")
print(f"Naive Bayes: {sum(nb_scores) / len(nb_scores)}")

Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.29      0.45       375
           1       0.76      0.64      0.70      1004
           2       0.78      0.84      0.81       929
           3       0.76      0.38      0.51       991
           4       0.59      0.63      0.61      1016
           5       0.95      0.09      0.16       844
           6       0.30      0.94      0.46       740

    accuracy                           0.56      5899
   macro avg       0.73      0.54      0.53      5899
weighted avg       0.72      0.56      0.54      5899

Naive Bayes: [0.5584745762711865, 0.5664406779661016, 0.5644067796610169, 0.5656890998474318, 0.5626377352093576]

Mean accuracy:
Naive Bayes: 0.5635297737910189


In [26]:
from sklearn.ensemble import VotingClassifier

# SVM, XGBoost 모델 객체 생성
svm = SVC()

# Voting 앙상블
ensemble = VotingClassifier(estimators=[('svm', svm), ('xgb', xgb_model)])

# 데이터로 앙상블 모델 학습
ensemble.fit(X_train, y_train)

# 예측
y_pred = ensemble.predict(X_test)


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



In [27]:
ensemble.score(X_test,y_test)

0.6168842176640108

In [34]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler.transform(y_test.values.reshape(-1, 1))

In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# 데이터 준비
# X_train, X_test, y_train, y_test = ... # 실제 데이터로 대체

# 모델 정의
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = torch.sigmoid(out)
        return out

# 모델 인스턴스 생성
input_dim = X_train.shape[1]
model = MLP(input_dim, 64, 1)

# 손실 함수와 옵티마이저 정의
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

# 모델 학습
num_epochs = 100
for epoch in range(num_epochs):
    inputs = torch.from_numpy(X_train.values).float()
    labels = torch.from_numpy(y_train_scaled).float().unsqueeze(1)  # 차원 추가

    # 모델 계산
    outputs = model(inputs)
    loss = criterion(outputs, labels)

    # 역전파 및 가중치 업데이트
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # 진행 상황 출력
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [10/100], Loss: 0.0918
Epoch [20/100], Loss: 0.0912
Epoch [30/100], Loss: 0.0909
Epoch [40/100], Loss: 0.0908
Epoch [50/100], Loss: 0.0908
Epoch [60/100], Loss: 0.0908
Epoch [70/100], Loss: 0.0908
Epoch [80/100], Loss: 0.0907
Epoch [90/100], Loss: 0.0907
Epoch [100/100], Loss: 0.0907


In [38]:
with torch.no_grad():
    inputs = torch.from_numpy(X_test.values).float()
    labels = torch.from_numpy(y_test.values).float()
    outputs = model(inputs)
    predictions = (outputs > 0.5).float()
    accuracy = (predictions == labels).float().mean()
    print(f'Test Accuracy: {accuracy:.4f}')

Test Accuracy: 0.1691


In [29]:
X_train.shape

(23599, 628)

In [6]:
xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',
    tree_method='gpu_hist',
    sampling_method='gradient_based',
    device="gpu"
    )

In [7]:
params = {
    'max_depth':[3,5,7,9],
    'max_leaves':[32, 64, 128, 256],
    'learning_rate':[0.01, 0.05, 0.1, 0.3]    
}
gs = GridSearchCV(xgb_model, params, n_jobs=-1)

gs.fit(X_train, y_train)


    E.g. tree_method = "hist", device = "cuda"



In [8]:
dt = gs.best_estimator_
print(dt.score(X_train, y_train))


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




0.6281697780590336


In [9]:
print(gs.best_params_)

{'learning_rate': 0.3, 'max_depth': 9, 'max_leaves': 64}


In [12]:
y_pred = gs.predict(X_test)

In [13]:
accuracy = gs.score(X_test,y_test)
accuracy

0.6193898305084746

In [15]:
pe_h = pd.read_csv('PE_Header.csv')
pe_h = pe_h.iloc[:,1:]

X = pe_h.iloc[:,2:]
Y = pe_h['Type']
X_train,X_test,y_train,y_test = train_test_split(X, Y, stratify=Y, random_state=42, shuffle=True)


xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',
    tree_method='gpu_hist',
    sampling_method='gradient_based',
    device="gpu"
    )


params = {
    'max_depth':[3,5,7,9],
    'max_leaves':[32, 64, 128, 256],
    'learning_rate':[0.01, 0.05, 0.1, 0.3]    
}
gs_h = GridSearchCV(xgb_model, params, n_jobs=-1)

gs_h.fit(X_train, y_train)


    E.g. tree_method = "hist", device = "cuda"



In [16]:
dt = gs_h.best_estimator_
print(dt.score(X_train, y_train))
print(gs_h.best_params_)
print(gs_h.score(X_test,y_test))

0.9767389845672109
{'learning_rate': 0.3, 'max_depth': 9, 'max_leaves': 64}
0.8844605475040258



    E.g. tree_method = "hist", device = "cuda"



In [17]:
pe_s = pd.read_csv('PE_Section.csv')
pe_s = pe_s.iloc[:,1:]

X = pe_s.iloc[:,2:]
Y = pe_s['Type']
X_train,X_test,y_train,y_test = train_test_split(X, Y, stratify=Y, random_state=42, shuffle=True)


xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',
    tree_method='gpu_hist',
    sampling_method='gradient_based',
    device="gpu"
    )


params = {
    'max_depth':[3,5,7,9],
    'max_leaves':[32, 64, 128, 256],
    'learning_rate':[0.01, 0.05, 0.1, 0.3]    
}
gs_s = GridSearchCV(xgb_model, params, n_jobs=-1)

gs_s.fit(X_train, y_train)


    E.g. tree_method = "hist", device = "cuda"



In [18]:
dt = gs_s.best_estimator_
print(dt.score(X_train, y_train))
print(gs_s.best_params_)
print(gs_s.score(X_test,y_test))

0.9399641577060932
{'learning_rate': 0.3, 'max_depth': 9, 'max_leaves': 128}
0.8161290322580645



    E.g. tree_method = "hist", device = "cuda"



In [21]:
import joblib


# 모델 저장
joblib.dump(gs_h, 'gs_h.pkl')
joblib.dump(gs_s, 'gs_s.pkl')
joblib.dump(dll, 'dll.pkl')


['dll.pkl']

In [22]:
model = joblib.load('gs_s.pkl')
model.score(X_test,y_test)


    E.g. tree_method = "hist", device = "cuda"



0.8161290322580645

Improved

In [2]:
pe_h = pd.read_csv('PE_Header.csv')
pe_h = pe_h.iloc[:,1:]

X = pe_h.iloc[:,2:]
Y = pe_h['Type']
X_train,X_test,y_train,y_test = train_test_split(X, Y, stratify=Y, random_state=42, shuffle=True)


xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',
    tree_method='gpu_hist',
    sampling_method='gradient_based',
    device="gpu",
    # 최적 파라미터 반영
    learning_rate= 0.3, 
    max_depth= 9, 
    max_leaves= 64
    )

# min_child_weight and gamma.
params = {
    'gamma':[3,5,7,9,15,20]
}
gs_h = GridSearchCV(xgb_model, params, n_jobs=-1)

gs_h.fit(X_train, y_train)


    E.g. tree_method = "hist", device = "cuda"



In [3]:
dt = gs_h.best_estimator_
print(dt.score(X_train, y_train))
print(gs_h.best_params_)
print(gs_h.score(X_test,y_test))

0.8913442182956833
{'gamma': 3}
0.8666129898013956



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [4]:
pe_s = pd.read_csv('PE_Section.csv')
pe_s = pe_s.iloc[:,1:]

X = pe_s.iloc[:,2:]
Y = pe_s['Type']
X_train,X_test,y_train,y_test = train_test_split(X, Y, stratify=Y, random_state=42, shuffle=True)


xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',
    tree_method='gpu_hist',
    sampling_method='gradient_based',
    device="gpu",
    # 최적 파라미터 반영
    learning_rate= 0.3, 
    max_depth= 9, 
    max_leaves= 128
    )


params = {
    'gamma':[3,5,7,9,15,20]
}
gs_s = GridSearchCV(xgb_model, params, n_jobs=-1)

gs_s.fit(X_train, y_train)


    E.g. tree_method = "hist", device = "cuda"



In [5]:
dt = gs_s.best_estimator_
print(dt.score(X_train, y_train))
print(gs_s.best_params_)
print(gs_s.score(X_test,y_test))

0.8261200716845878
{'gamma': 3}
0.7935483870967742



    E.g. tree_method = "hist", device = "cuda"

