In [1]:
# 필요한 라이브러리 임포트
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Load the raw dataset from the CSV file in the working directory.
machine_data = pd.read_csv('machine.data_update.csv')

# Remove the columns that are not used in the analysis below
# (the two name columns and ERP).
machine_data_processed = machine_data.drop(
    columns=['VendorName', 'ModelName', 'ERP'],
)

# Data exploration: pairwise correlations of the remaining columns, drawn as
# an annotated heatmap.
correlation_matrix = machine_data_processed.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
)
plt.title('Correlation Matrix of CPU Performance Data')
plt.show()

# Separate the predictors from the target by column name, so the result does
# not depend on 'PRP' happening to be the last column of the frame.
feature_frame = machine_data_processed.drop('PRP', axis=1)

# Decide train/test membership BEFORE fitting the scaler. Splitting the row
# positions with the same test_size and random_state picks the same rows as
# splitting the feature matrix directly would.
train_pos, test_pos = train_test_split(
    np.arange(len(feature_frame)), test_size=0.2, random_state=42
)

# Feature scaling: learn the mean/std from the training rows only, so that no
# statistics of the held-out test rows leak into the training features. The
# fitted scaler is then applied to every row.
scaler = StandardScaler()
scaler.fit(feature_frame.iloc[train_pos])
scaled_features = scaler.transform(feature_frame)

# Scaled data frame. It reuses the original column labels and row index so the
# target column assigned below aligns row-for-row with the features.
scaled_features_df = pd.DataFrame(
    scaled_features,
    columns=feature_frame.columns,
    index=feature_frame.index,
)
scaled_features_df['PRP'] = machine_data_processed['PRP']

# Train/test partitions (80/20, fixed seed for reproducibility).
X = scaled_features_df.drop('PRP', axis=1)
y = scaled_features_df['PRP']
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Assignment 3 baseline: plain multiple linear regression fitted on the
# training partition (fit() returns the estimator itself).
linear_model = LinearRegression().fit(X_train, y_train)

# Predict once per partition; every metric below reuses these predictions.
y_train_pred_linear = linear_model.predict(X_train)
y_test_pred_linear = linear_model.predict(X_test)

# Training-set metrics (assignment 3): MSE, MAE and R^2, in that order.
train_mse_linear, train_mae_linear, train_r2_linear = (
    metric(y_train, y_train_pred_linear)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Test-set metrics (assignment 3): same three metrics on the held-out rows.
test_mse_linear, test_mae_linear, test_r2_linear = (
    metric(y_test, y_test_pred_linear)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# 5-fold cross-validation on the training partition (assignment 3). No
# `scoring` is passed, so the estimator's default scorer is used.
cv_scores_linear = cross_val_score(linear_model, X_train, y_train, cv=5)

# Pipeline (assignment 4): polynomial feature expansion -> standardization ->
# regressor. The final step is a placeholder; the grid search below swaps in
# each candidate estimator under the same step name.
pipeline = Pipeline([
    ('poly_features', PolynomialFeatures()),
    ('scaler', StandardScaler()),
    ('linear_regression', LinearRegression()),
])

# Hyperparameter search space.
# One sub-grid per estimator family: `alpha` is a parameter of Ridge and Lasso
# only, so it must never be combined with LinearRegression (setting it there
# raises an invalid-parameter error). A conditional expression inside a single
# dict cannot express this, because it is evaluated once when the dict is
# built rather than once per candidate.
param_grid = [
    {
        'poly_features__degree': [1, 2, 3],
        'linear_regression': [LinearRegression()],
    },
    {
        'poly_features__degree': [1, 2, 3],
        'linear_regression': [Ridge(), Lasso()],
        'linear_regression__alpha': [0.1, 1.0, 10.0],
    },
]

# Exhaustive search over both sub-grids with 5-fold CV, ranked by R^2.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

# Best pipeline found by the grid search (assignment 4).
best_model = grid_search.best_estimator_

# Predict once per partition; every metric below reuses these predictions.
y_train_pred_best = best_model.predict(X_train)
y_test_pred_best = best_model.predict(X_test)

# Training-set metrics (assignment 4): MSE, MAE and R^2, in that order.
train_mse_best, train_mae_best, train_r2_best = (
    metric(y_train, y_train_pred_best)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Test-set metrics (assignment 4): same three metrics on the held-out rows.
test_mse_best, test_mae_best, test_r2_best = (
    metric(y_test, y_test_pred_best)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# 5-fold cross-validation of the selected pipeline on the training partition
# (assignment 4), using the estimator's default scorer.
cv_scores_best = cross_val_score(best_model, X_train, y_train, cv=5)

# Print the evaluation results, one line per metric.
# Report for the assignment-3 baseline model.
print(
    f'3차 과제 - Linear Regression:',
    f'Train MSE: {train_mse_linear:.4f}',
    f'Train MAE: {train_mae_linear:.4f}',
    f'Train R^2: {train_r2_linear:.4f}',
    f'Test MSE: {test_mse_linear:.4f}',
    f'Test MAE: {test_mae_linear:.4f}',
    f'Test R^2: {test_r2_linear:.4f}',
    f'Cross-validation Mean Score: {cv_scores_linear.mean():.4f}',
    sep='\n',
)

# Report for the assignment-4 pipeline, including the winning parameter set.
print(
    f'4차 과제 - Pipeline Best Model:',
    f'Best Model: {grid_search.best_params_}',
    f'Train MSE: {train_mse_best:.4f}',
    f'Train MAE: {train_mae_best:.4f}',
    f'Train R^2: {train_r2_best:.4f}',
    f'Test MSE: {test_mse_best:.4f}',
    f'Test MAE: {test_mae_best:.4f}',
    f'Test R^2: {test_r2_best:.4f}',
    f'Cross-validation Mean Score: {cv_scores_best.mean():.4f}',
    sep='\n',
)

# Data frame for comparing the two models: one row per model, one column per
# metric. Each pair is ordered (baseline linear regression, best pipeline).
metric_pairs = {
    'Train MSE': (train_mse_linear, train_mse_best),
    'Test MSE': (test_mse_linear, test_mse_best),
    'Train MAE': (train_mae_linear, train_mae_best),
    'Test MAE': (test_mae_linear, test_mae_best),
    'Train R^2': (train_r2_linear, train_r2_best),
    'Test R^2': (test_r2_linear, test_r2_best),
}
comparison_df = pd.DataFrame({
    'Model': ['Linear Regression (Basic)', 'Pipeline (Best)'],
    **{label: list(pair) for label, pair in metric_pairs.items()},
})

# Grouped bar chart of all metrics, with one group of bars per model.
ax = comparison_df.set_index('Model').plot.bar(figsize=(12, 8), rot=0)
ax.set_title('Model Performance Comparison')
ax.set_ylabel('Scores')
plt.show()


ModuleNotFoundError: No module named 'seaborn'