In [21]:
# 라이브러리 로딩
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report

In [23]:
# -----------------------------
# 1) 데이터 준비(화이트 와인 데이터)
# -----------------------------

df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/기계학습프로그래밍/4주/winequality-white.csv", sep=";").dropna()
print(df.head())

# quality만 땜
X = df.drop("quality", axis=1)
y_raw = df["quality"]

# 퀄리티가 3부터 시작이라 0부터 시작하는 걸로 맞춰주기 위해 라벨링)
le = LabelEncoder()
y = le.fit_transform(y_raw)

print("원본 quality 값:", sorted(y_raw.unique()))
print("인코딩 후 라벨:", np.unique(y))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# knn, 선형 회귀 성능 향상을 위한 스케일
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.0              0.27         0.36            20.7      0.045   
1            6.3              0.30         0.34             1.6      0.049   
2            8.1              0.28         0.40             6.9      0.050   
3            7.2              0.23         0.32             8.5      0.058   
4            7.2              0.23         0.32             8.5      0.058   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 45.0                 170.0   1.0010  3.00       0.45   
1                 14.0                 132.0   0.9940  3.30       0.49   
2                 30.0                  97.0   0.9951  3.26       0.44   
3                 47.0                 186.0   0.9956  3.19       0.40   
4                 47.0                 186.0   0.9956  3.19       0.40   

   alcohol  quality  
0      8.8        6  
1      9.5        6  
2     10.1        6 

In [24]:
# 1. 결정트리
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)

# 2. 랜덤 포레스트
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# 3. KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
knn_pred = knn.predict(X_test_scaled)

In [25]:
# 결과 평가
print("결정트리 정확도:", accuracy_score(y_test, dt_pred))
print("랜덤포레스트 정확도:", accuracy_score(y_test, rf_pred))
print("KNN 정확도:", accuracy_score(y_test, knn_pred))

결정트리 정확도: 0.6061224489795919
랜덤포레스트 정확도: 0.689795918367347
KNN 정확도: 0.5428571428571428


In [31]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 선형 회귀 추가
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# 예측
y_pred = lin_reg.predict(X_test)

# 평가
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("선형 회귀 평가")
print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R²:", r2)

# 가중치(a)와 절편(b)
print("가중치 (a):", lin_reg.coef_)
print("절편 (b):", lin_reg.intercept_)

선형 회귀 평가
MSE: 0.5690247717229262
RMSE: 0.754337306331144
MAE: 0.5862665383250466
R²: 0.2652750042179145
가중치 (a): [ 4.59072370e-02 -1.91488432e+00 -6.13034698e-02  7.12395507e-02
 -2.64751329e-02  5.11945372e-03 -2.42153962e-04 -1.24264125e+02
  6.00699854e-01  6.49072780e-01  2.29008684e-01]
절편 (b): 121.39391498452245
