In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_recall_fscore_support
from sklearn.preprocessing import MultiLabelBinarizer

# 1. Load the raw data set
data = pd.read_csv('azdigar nav sartirovka+0 delete.csv')

# Target columns to predict: 'w08chronic_a' .. 'w08chronic_m'
# (the list deliberately mirrors the original one, which has no '_j' entry)
columns = [f'w08chronic_{suffix}' for suffix in 'abcdefghiklm']

# 2. Cleaning: discard every row whose 'w08chronic_m' value equals 3
cleaned_data = data.loc[data['w08chronic_m'].ne(3)]

# 3. Separate the target columns from the remaining (feature) columns
y_all = cleaned_data[columns]
X = cleaned_data.drop(columns=columns)

# 4. Standardise the feature columns
# NOTE(review): the scaler is fitted on all rows, i.e. before the later
# train/test split, so test rows contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 5. Turn the targets into a 0/1 indicator matrix:
#    raw value 5 -> 1, any other value -> 0
y = y_all.to_numpy(copy=True)
y_binarized = (y == 5).astype(int)

# 6. Genetic-algorithm hyper-parameters
population_size, num_generations = 100, 100  # individuals per generation, number of generations
mutation_rate = 0.01  # per-gene probability of a bit flip

# Create the initial population
def initialize_population(n_features=None, size=None):
    """Build a random initial population of binary feature masks.

    Args:
        n_features: Length of every mask (one bit per feature). When omitted,
            the column count of the module-level ``X_scaled`` is used.
        size: Number of individuals to generate. When omitted, the
            module-level ``population_size`` is used.

    Returns:
        A list of ``size`` individuals; each individual is a plain Python
        list of ``n_features`` ints drawn uniformly from {0, 1}.
    """
    # Fall back to the script-level settings so existing zero-argument
    # calls behave exactly as before.
    if n_features is None:
        n_features = X_scaled.shape[1]
    if size is None:
        size = population_size
    return [np.random.randint(2, size=n_features).tolist() for _ in range(size)]

# Fitness evaluation
def fitness_function(individual, model, X_train, y_train, X_test, y_test):
    """Score one feature mask by the weighted F1 of a model trained on it.

    Args:
        individual: Sequence of truthy/falsy flags, one per feature column.
        model: Estimator exposing ``fit`` and ``predict``; it is refitted here.
        X_train, X_test: 2-D arrays indexed as ``X[:, column_indices]``.
        y_train, y_test: Targets passed to ``fit`` and ``f1_score``.

    Returns:
        The weighted F1 score on ``X_test``/``y_test``, or 0 when the mask
        selects no feature or when fitting/predicting/scoring raises.
    """
    chosen = [pos for pos, flag in enumerate(individual) if flag]
    if not chosen:
        # An empty mask cannot train a model: worst possible fitness.
        return 0

    # Column sub-selection stays outside the try block, so indexing errors
    # propagate to the caller instead of being reported as fitness 0.
    train_subset = X_train[:, chosen]
    test_subset = X_test[:, chosen]

    try:
        model.fit(train_subset, y_train)
        predictions = model.predict(test_subset)
        return f1_score(y_test, predictions, average='weighted')
    except Exception as e:
        # Best effort: a failing candidate is logged and treated as worthless.
        print(f"적합도 평가 중 오류 발생: {e}")
        return 0
    

# Survivor selection
def selection(population, model, X_train, y_train, X_test, y_test):
    """Return the best-scoring part of ``population``.

    Every individual is scored with ``fitness_function``; the individuals
    are then ranked by ascending score and the tail of that ranking is kept.
    The tail length comes from the module-level ``population_size``, not
    from ``len(population)``.

    Returns:
        A list of the retained individuals, ordered from lowest to highest
        fitness among the survivors.
    """
    fitness_values = []
    for candidate in population:
        fitness_values.append(
            fitness_function(candidate, model, X_train, y_train, X_test, y_test)
        )

    ranking = np.argsort(fitness_values)
    # Parsed as (-population_size) // 2, i.e. floor division of the negated
    # size: for an odd size the larger half is kept.
    tail_start = -population_size // 2
    return [population[pos] for pos in ranking[tail_start:]]

# Crossover
def crossover(parent1, parent2):
    """Produce two children by single-point crossover of two parents.

    A cut position is drawn uniformly from ``1 .. len(parent1) - 1`` so that
    each child receives at least one gene from each parent. The parents are
    not modified; the children are new lists.

    Args:
        parent1: First parent chromosome (a list of genes).
        parent2: Second parent chromosome, same length as ``parent1``.

    Returns:
        ``(child1, child2)`` where ``child1`` is the head of ``parent1``
        followed by the tail of ``parent2`` and ``child2`` is the reverse.
        For chromosomes with fewer than two genes no cut position exists,
        so copies of the parents are returned unchanged.
    """
    n_genes = len(parent1)
    if n_genes < 2:
        # random.randint(1, 0) would raise ValueError; return copies so
        # later in-place mutation of a child never touches a parent.
        return list(parent1), list(parent2)

    cut = random.randint(1, n_genes - 1)
    child1 = parent1[:cut] + parent2[cut:]
    child2 = parent2[:cut] + parent1[cut:]
    return child1, child2

# Mutation
def mutate(individual, rate=None):
    """Flip each bit of ``individual`` independently with probability ``rate``.

    The list is modified in place and also returned, so both
    ``mutate(x)`` and ``x = mutate(x)`` work.

    Args:
        individual: Mutable sequence of 0/1 genes.
        rate: Per-gene flip probability. When omitted, the module-level
            ``mutation_rate`` is used.

    Returns:
        The same ``individual`` object after mutation.
    """
    if rate is None:
        rate = mutation_rate
    for position, gene in enumerate(individual):
        # One random draw per gene, in gene order.
        if random.random() < rate:
            individual[position] = 1 - gene  # bit flip
    return individual

# Genetic-algorithm driver
def run_genetic_algorithm(model, X_train, y_train, X_test, y_test):
    """Search for a feature mask that maximises ``fitness_function``.

    Each generation: keep the survivors chosen by ``selection``, refill the
    population with mutated crossover children of randomly paired survivors,
    then score the whole new population and remember the best individual
    seen so far. Generation count and population size come from the
    module-level ``num_generations`` and ``population_size``.

    Args:
        model: Estimator handed through to ``fitness_function``.
        X_train, y_train: Data used to fit the model for each candidate.
        X_test, y_test: Data used to score each candidate.

    Returns:
        The individual with the highest fitness observed in any generation,
        or ``None`` if no generation is run.
    """
    population = initialize_population()
    champion, champion_score = None, -1

    for gen_no in range(1, num_generations + 1):
        print(f"Generation {gen_no}/{num_generations}")
        survivors = selection(population, model, X_train, y_train, X_test, y_test)

        # Number of children needed to restore the population size.
        open_slots = population_size - len(survivors)
        offspring = []
        while len(offspring) < open_slots:
            mother, father = random.sample(survivors, 2)
            first, second = crossover(mother, father)
            offspring.append(mutate(first))
            offspring.append(mutate(second))

        # Children are produced in pairs, so trim a possible surplus child.
        population = survivors + offspring[:open_slots]

        for candidate in population:
            score = fitness_function(candidate, model, X_train, y_train, X_test, y_test)
            # Strict comparison: on ties the earlier individual is kept.
            if score > champion_score:
                champion, champion_score = candidate, score

    return champion

# 7. Train/test split, GA feature selection, final fit and per-target evaluation

# Hold out 20% of the rows as the final test set.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_binarized, test_size=0.2, random_state=42)

# Carve a validation set out of the training rows for the genetic algorithm.
# The GA picks the feature mask with the best fitness on whatever data it is
# scored on; scoring it on the test set would make the metrics reported
# below optimistically biased, so the test set is kept untouched until the
# final evaluation.
X_ga_train, X_ga_val, y_ga_train, y_ga_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

# Model: one random forest per target column
rf = RandomForestClassifier(n_estimators=100, random_state=42)
multi_target_rf = MultiOutputClassifier(rf, n_jobs=-1)

# Feature selection with the genetic algorithm (fitness measured on validation data)
best_individual = run_genetic_algorithm(multi_target_rf, X_ga_train, y_ga_train, X_ga_val, y_ga_val)
selected_features = [i for i, bit in enumerate(best_individual) if bit]

# Restrict both splits to the selected feature columns
X_train_selected = X_train[:, selected_features]
X_test_selected = X_test[:, selected_features]

# Final fit on the full training split, prediction on the untouched test split
multi_target_rf.fit(X_train_selected, y_train)
y_pred = multi_target_rf.predict(X_test_selected)

# Per-target metrics. Rows are collected in a list and turned into a
# DataFrame once, instead of concatenating onto an empty frame per iteration.
result_rows = []
for idx, target_column in enumerate(columns):
    accuracy = accuracy_score(y_test[:, idx], y_pred[:, idx])
    precision, recall, f1, _ = precision_recall_fscore_support(y_test[:, idx], y_pred[:, idx], average='binary')
    result_rows.append({
        'Target': target_column,
        'Model': 'Random Forest',
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Accuracy': accuracy,
    })

result_table = pd.DataFrame(result_rows, columns=[
    'Target', 'Model', 'Precision', 'Recall', 'F1-Score',
    'Accuracy'
])

# Print the result table
print("\n전체 모델 평가 결과:")
print(result_table)



# Pool all targets into one long vector (column after column) so that a
# single set of F1 scores can be computed over every prediction.
all_y_test = np.concatenate(list(y_test.T), axis=0)
all_y_pred = np.concatenate(list(y_pred.T), axis=0)

# Mean of the per-target accuracies from the result table
average_accuracy = result_table['Accuracy'].mean()
print(f"\n전체 타겟에 대한 평균 정확도: {average_accuracy}")

# NOTE(review): the pooled vectors are plain 1-D label arrays, so the
# micro/macro/weighted averages below are taken over the label values found
# in the pooled vector, not over the individual target columns.
overall_micro_f1, overall_macro_f1, overall_weighted_f1 = (
    f1_score(all_y_test, all_y_pred, average=mode)
    for mode in ('micro', 'macro', 'weighted')
)
print(f"\n전체 타겟에 대한 종합 Micro F1 점수: {overall_micro_f1}")
print(f"\n전체 타겟에 대한 종합 Macro F1 점수: {overall_macro_f1}")
print(f"\n전체 타겟에 대한 종합 Weighted F1 점수: {overall_weighted_f1}")


Generation 1/5
Generation 2/5
Generation 3/5
Generation 4/5
Generation 5/5

전체 모델 평가 결과:
          Target          Model  Precision    Recall  F1-Score  Accuracy
0   w08chronic_a  Random Forest   0.800525  0.644820  0.714286  0.730684
1   w08chronic_b  Random Forest   0.773138  0.982783  0.865445  0.764901
2   w08chronic_c  Random Forest   0.927152  1.000000  0.962199  0.927152
3   w08chronic_d  Random Forest   0.970199  1.000000  0.984874  0.970199
4   w08chronic_e  Random Forest   0.971302  1.000000  0.985442  0.971302
5   w08chronic_f  Random Forest   0.894040  1.000000  0.944056  0.894040
6   w08chronic_g  Random Forest   0.945916  1.000000  0.972206  0.945916
7   w08chronic_h  Random Forest   0.950331  1.000000  0.974533  0.950331
8   w08chronic_i  Random Forest   0.777490  0.926040  0.845288  0.757174
9   w08chronic_k  Random Forest   0.985651  1.000000  0.992774  0.985651
10  w08chronic_l  Random Forest   0.986755  1.000000  0.993333  0.986755
11  w08chronic_m  Random Forest   0