In [None]:
# ライブラリのインポート
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Database connection: read the URL from the DATABASE_URL environment variable,
# falling back to a literal default URL when the variable is unset.
# NOTE(review): the fallback embeds a user/password pair -- presumably a local
# development default; confirm it is never a real credential.
DATABASE_URL = os.getenv(
    'DATABASE_URL',
    'postgresql://user:password@postgres:5432/mlapp',
)
engine = create_engine(DATABASE_URL)

In [None]:
# Load every row and column of the iris_data table through the shared engine.
df = pd.read_sql(sql='SELECT * FROM iris_data', con=engine)

# Report the frame's shape, then display its first five rows as the cell output.
print(f"データセットの形状: {df.shape}")
print("\nデータの最初の5行:")
df.head(n=5)

In [None]:
# Cell 3: separate the feature columns from the target column.
features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X, y = df[features], df['target']

print("特徴量の形状:", X.shape)

# Row count per target value, ordered by the target value itself.
print("ターゲットの分布:")
print(y.value_counts().sort_index())

In [None]:
# Split the data: 20% of the rows are held out as the test set, stratified on
# the target, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"訓練データ: {X_train.shape}")
print(f"テストデータ: {X_test.shape}")

In [None]:
# Train the LightGBM model.
# LightGBM parameters: three-class objective monitored with multiclass log loss.
params = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'random_state': 42
}

# Carve the early-stopping validation subset out of the TRAINING rows.
# Previously the test split itself was used as the validation set, so the test
# data chose the number of boosting rounds and any accuracy measured on it
# afterwards was optimistically biased. X_test / y_test are no longer touched
# during training.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Build the LightGBM datasets. The validation set names the training set as
# its reference, which is the documented way to construct validation data.
train_data = lgb.Dataset(X_fit, label=y_fit)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

# Train for at most 100 rounds, stopping early once the validation metric has
# not improved for 10 consecutive rounds; log_evaluation(0) is passed to keep
# per-round evaluation logging quiet.
model = lgb.train(
    params,
    train_data,
    valid_sets=[valid_data],
    num_boost_round=100,
    callbacks=[lgb.early_stopping(10), lgb.log_evaluation(0)]
)

print("モデルの学習が完了しました")

In [None]:
# Evaluate the model on the test split.
# Display names for the classes.
# NOTE(review): assumes target values 0/1/2 correspond to these names in this
# order -- confirm against the iris_data table.
target_names = ['setosa', 'versicolor', 'virginica']

# Predict per-class scores at the booster's best iteration, then take the
# highest-scoring column of each row as the predicted class.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
y_pred_class = y_pred.argmax(axis=1)

# Overall accuracy.
accuracy = accuracy_score(y_test, y_pred_class)
print(f"テストデータの精度: {accuracy:.4f}")

# Per-class classification report.
print("\n分類レポート:")
print(classification_report(y_test, y_pred_class, target_names=target_names))

In [None]:
# Visualize the confusion matrix as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred_class)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # cell annotations are integer counts
    cmap='Blues',
    xticklabels=target_names,
    yticklabels=target_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
# Feature importance (gain-based), paired with the feature names and sorted
# from most to least important.
importance = model.feature_importance(importance_type='gain')
feature_importance_df = pd.DataFrame(
    {'feature': features, 'importance': importance}
).sort_values(by='importance', ascending=False)

# Horizontal bar chart of the importances.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance_df, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Importance')
plt.show()

print("特徴量の重要度:")
print(feature_importance_df)

In [None]:
# Save the model: make sure the output directory exists, then serialize the
# trained booster with joblib.
os.makedirs('/workspace/models', exist_ok=True)
model_path = '/workspace/models/iris_lgb_model.pkl'
joblib.dump(model, model_path)
print(f"モデルを保存しました: {model_path}")

# Also save the model metadata: the feature names, class names, the accuracy
# computed earlier, and where the model file was written.
model_info = dict(
    features=features,
    target_names=target_names,
    accuracy=accuracy,
    model_path=model_path,
)
joblib.dump(model_info, '/workspace/models/model_info.pkl')
print("モデル情報を保存しました")

In [None]:
# Test of the prediction function.
def predict_iris(sepal_length, sepal_width, petal_length, petal_width):
    """
    Predict the Iris class for a single set of measurements.

    Uses the module-level ``model`` (at its best iteration) and the
    module-level ``target_names`` list.

    Args:
        sepal_length: Value for the 'sepal_length' column.
        sepal_width: Value for the 'sepal_width' column.
        petal_length: Value for the 'petal_length' column.
        petal_width: Value for the 'petal_width' column.

    Returns:
        dict with keys 'prediction' (int class index), 'prediction_name'
        (the matching entry of ``target_names``), 'confidence' (float, the
        largest predicted score) and 'probabilities' (class name -> float).
    """
    # One-row frame whose columns carry the four measurement names.
    column_names = ('sepal_length', 'sepal_width', 'petal_length', 'petal_width')
    measurements = (sepal_length, sepal_width, petal_length, petal_width)
    input_data = pd.DataFrame(
        {column: [value] for column, value in zip(column_names, measurements)}
    )

    # Per-class scores for that single row.
    scores = model.predict(input_data, num_iteration=model.best_iteration)
    best_class = np.argmax(scores, axis=1)[0]

    # Map each score of the first (only) row to its class name.
    per_class = {}
    for class_idx, score in enumerate(scores[0]):
        per_class[target_names[class_idx]] = float(score)

    return {
        'prediction': int(best_class),
        'prediction_name': target_names[best_class],
        'confidence': float(np.max(scores)),
        'probabilities': per_class,
    }

# Smoke test: run one prediction for a single set of measurements and print
# the resulting dictionary.
test_result = predict_iris(
    sepal_length=5.1,
    sepal_width=3.5,
    petal_length=1.4,
    petal_width=0.2,
)
print("予測結果のテスト:")
print(test_result)