In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# 데이터 CSV 파일 읽기
file_path = r"C:\Users\yale3\Downloads\car_evaluation.csv"
df = pd.read_csv(file_path)

# 데이터프레임 확인
display(df.head())
print(df.columns)


Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


Index(['vhigh', 'vhigh.1', '2', '2.1', 'small', 'low', 'unacc'], dtype='object')


In [3]:
# 결측치 확인
df.isnull().sum()

vhigh      0
vhigh.1    0
2          0
2.1        0
small      0
low        0
unacc      0
dtype: int64

In [6]:
# 레이블 확인인
df['unacc'].value_counts()

unacc
unacc    1209
acc       384
good       69
vgood      65
Name: count, dtype: int64

In [20]:
# 숫자로 보이지만 실제로는 문자열인 컬럼이 있는지 확인
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Label Encoding 적용
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # 나중에 역변환을 위해 저장

In [21]:
# 레이블 확인
display(df.head())

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,3,3,0,0,2,2,2
1,3,3,0,0,2,0,2
2,3,3,0,0,1,1,2
3,3,3,0,0,1,2,2
4,3,3,0,0,1,0,2


In [22]:
# 'Survived'를 예측하기 위한 특성과 레이블 분리
X = df.drop('unacc', axis=1)  # 특성 (X)
y = df['unacc']               # 레이블 (y)

# 데이터 나누기 (학습 80%, 테스트 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree 모델
print("Decision Tree 코드")
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
print("Decision Tree 결과: Accuracy =", accuracy_score(y_test, y_pred_dt))
print("\n", classification_report(y_test, y_pred_dt))
print("Confusion Matrix:")
cm_dt = confusion_matrix(y_test, y_pred_dt)
print(cm_dt)

Decision Tree 코드
Decision Tree 결과: Accuracy = 0.9710982658959537

               precision    recall  f1-score   support

           0       0.96      0.95      0.95        77
           1       0.75      0.80      0.77        15
           2       1.00      1.00      1.00       237
           3       0.83      0.88      0.86        17

    accuracy                           0.97       346
   macro avg       0.89      0.91      0.90       346
weighted avg       0.97      0.97      0.97       346

Confusion Matrix:
[[ 73   2   0   2]
 [  2  12   0   1]
 [  1   0 236   0]
 [  0   2   0  15]]


In [38]:
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

# 데이터 분리
X = df.drop(columns=['unacc'])  # 타겟 컬럼 제거
y = df['unacc']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 클래스 분포 확인
print("클래스 분포:\n", y_train.value_counts())

# 클래스 불균형 해결 (SMOTE 적용)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# 데이터 정규화 (필요할 경우에만 적용)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression 모델 학습
lr_model = LogisticRegression(max_iter=2000, solver='liblinear', class_weight='balanced')
lr_model.fit(X_train_scaled, y_train_resampled)

# 예측 및 평가
y_pred_lr = lr_model.predict(X_test_scaled)

# 성능 평가
print("Logistic Regression 결과: Accuracy =", accuracy_score(y_test, y_pred_lr))
print("\n", classification_report(y_test, y_pred_lr, zero_division=1))  # zero_division 추가
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))

클래스 분포:
 unacc
2    967
0    307
1     55
3     52
Name: count, dtype: int64
Logistic Regression 결과: Accuracy = 0.5289017341040463

               precision    recall  f1-score   support

           0       0.27      0.21      0.23        77
           1       0.07      0.29      0.11        14
           2       0.84      0.62      0.71       242
           3       0.27      1.00      0.43        13

    accuracy                           0.53       346
   macro avg       0.36      0.53      0.37       346
weighted avg       0.66      0.53      0.57       346

Confusion Matrix:
[[ 16  21  22  18]
 [  3   4   7   0]
 [ 41  34 150  17]
 [  0   0   0  13]]


In [25]:
from sklearn.svm import SVC

# Support Vector Machine 모델
print("SVM 코드")
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print("SVM 결과: Accuracy =", accuracy_score(y_test, y_pred_svm))
print("\n", classification_report(y_test, y_pred_svm))
print("Confusion Matrix:")
cm_svm = confusion_matrix(y_test, y_pred_svm)
print(cm_svm)

SVM 코드
SVM 결과: Accuracy = 0.8988439306358381

               precision    recall  f1-score   support

           0       0.78      0.79      0.79        77
           1       1.00      0.13      0.24        15
           2       0.94      1.00      0.97       237
           3       0.86      0.71      0.77        17

    accuracy                           0.90       346
   macro avg       0.89      0.66      0.69       346
weighted avg       0.90      0.90      0.88       346

Confusion Matrix:
[[ 61   0  16   0]
 [ 11   2   0   2]
 [  1   0 236   0]
 [  5   0   0  12]]


In [39]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest 모델
print("Random Forest 코드")
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("Random Forest 결과: Accuracy =", accuracy_score(y_test, y_pred_rf))
print("\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
cm_rf = confusion_matrix(y_test, y_pred_rf)
print(cm_rf)

Random Forest 코드
Random Forest 결과: Accuracy = 0.9826589595375722

               precision    recall  f1-score   support

           0       0.95      0.97      0.96        77
           1       1.00      0.93      0.96        14
           2       0.99      0.99      0.99       242
           3       1.00      0.92      0.96        13

    accuracy                           0.98       346
   macro avg       0.99      0.95      0.97       346
weighted avg       0.98      0.98      0.98       346

Confusion Matrix:
[[ 75   0   2   0]
 [  1  13   0   0]
 [  2   0 240   0]
 [  1   0   0  12]]


In [40]:
from sklearn.neighbors import KNeighborsClassifier

# 4데이터 정규화
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# KNN 모델 생성 및 학습
k = 3  # 최근접 이웃 개수
knn_model = KNeighborsClassifier(n_neighbors=k)
knn_model.fit(X_train_scaled, y_train)

# 예측 및 평가
y_pred_knn = knn_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred_knn)

print(f"KNN 결과: Accuracy = {accuracy:.4f}")
print("\n", classification_report(y_test, y_pred_knn))
print("Confusion Matrix:")
cm_knn = confusion_matrix(y_test, y_pred_knn)
print(cm_knn)

KNN 결과: Accuracy = 0.9364

               precision    recall  f1-score   support

           0       0.79      0.97      0.87        77
           1       1.00      0.64      0.78        14
           2       0.99      0.95      0.97       242
           3       1.00      0.77      0.87        13

    accuracy                           0.94       346
   macro avg       0.95      0.83      0.87       346
weighted avg       0.95      0.94      0.94       346

Confusion Matrix:
[[ 75   0   2   0]
 [  5   9   0   0]
 [ 12   0 230   0]
 [  3   0   0  10]]
