In [None]:
import pandas as pd
import numpy as np
import os
import random
!pip install optuna

from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
import optuna

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Seed configuration
def seed_everything(seed=42):
    """Seed the random sources used by this notebook.

    Seeds NumPy's global generator and Python's ``random`` module, and
    writes the seed to the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every source (default 42).
    """
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)


seed_everything(42)

In [None]:
# Load the training and test sets (train first, then test)
train, test = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/Colab Notebooks/python3/train.csv',
        '/content/drive/MyDrive/Colab Notebooks/python3/test.csv',
    ),
)

In [None]:
# Outlier handling: replace implausible pressure readings with the median.
# The same bounds are applied to train and test; previously the test set used
# an upper cutoff of 1200, so test rows in [1100, 1200) kept values that the
# training set never contains.
PRESSURE_LOWER_BOUND = 850   # readings at or below this are replaced
PRESSURE_UPPER_BOUND = 1100  # readings at or above this are replaced

# Medians are taken before any replacement, each from its own data frame.
median_pressure_train = train['pressure'].median()
median_pressure_test = test['pressure'].median()

for frame, median_pressure in ((train, median_pressure_train), (test, median_pressure_test)):
    is_outlier = (
        (frame['pressure'] <= PRESSURE_LOWER_BOUND)
        | (frame['pressure'] >= PRESSURE_UPPER_BOUND)
    )
    frame.loc[is_outlier, 'pressure'] = median_pressure

In [None]:
# Air density calculation
def air_density(pressure, temperature, humidity):
    """Return the density of moist air in kg/m^3.

    The dry-air and water-vapour densities are each obtained from the ideal
    gas law with their own specific gas constant and then summed:

        rho = p_dry / (R_dry * T) + p_vapour / (R_vapour * T)

    Args:
        pressure: Total pressure; it is multiplied by 100 to obtain Pa
            (i.e. expected in hPa).
        temperature: Air temperature in degrees Celsius.
        humidity: Relative humidity in percent.

    Returns:
        Moist-air density; scalars, NumPy arrays and pandas Series are
        all accepted and combined element-wise.
    """
    R_DRY = 287.05      # specific gas constant of dry air, J/(kg·K)
    R_VAPOUR = 461.495  # specific gas constant of water vapour, J/(kg·K)

    T = temperature + 273.15  # Celsius -> Kelvin
    # Vapour pressure: relative humidity times the saturation pressure
    e = (humidity / 100) * 6.105 * np.exp((17.27 * temperature) / (237.7 + temperature))
    pv = e * 100  # vapour partial pressure in Pa
    # Named p_dry (not `pd`) so the pandas alias is not shadowed.
    p_dry = pressure * 100 - pv  # dry-air partial pressure in Pa

    # The vapour term must be divided by R_VAPOUR * T; the previous code
    # divided by the product of the two gas constants instead.
    rho = p_dry / (R_DRY * T) + pv / (R_VAPOUR * T)
    return rho

# Absolute humidity calculation
def calculate_absolute_humidity(temperature, pressure, humidity):
    """Return the absolute humidity derived from temperature and relative humidity.

    Args:
        temperature: Air temperature in degrees Celsius.
        pressure: Not used in the computation; kept in the signature.
        humidity: Relative humidity in percent.

    Returns:
        ``216.7 * (e / T)``, where ``e`` is the vapour pressure and ``T`` the
        temperature in Kelvin.
    """
    kelvin = temperature + 273.15
    saturation_term = np.exp((17.27 * temperature) / (237.7 + temperature))
    vapour_pressure = (humidity / 100) * 6.105 * saturation_term
    return 216.7 * (vapour_pressure / kelvin)

# Partial pressure calculation
def calculate_air_pressure(pressure, temperature, humidity):
    """Return the water-vapour partial pressure in Pa.

    Args:
        pressure: Not used in the computation; kept in the signature.
        temperature: Air temperature in degrees Celsius.
        humidity: Relative humidity in percent.

    Returns:
        The vapour pressure multiplied by 100.
    """
    exponent = (17.27 * temperature) / (237.7 + temperature)
    vapour_pressure = (humidity / 100) * 6.105 * np.exp(exponent)
    return vapour_pressure * 100

# Add air density, absolute humidity and partial pressure to both data frames
for dataset in (train, test):
    dataset['air_density'] = air_density(
        dataset['pressure'], dataset['temperature'], dataset['humidity']
    )
    dataset['absolute_humidity'] = calculate_absolute_humidity(
        dataset['temperature'], dataset['pressure'], dataset['humidity']
    )
    dataset['air_pressure'] = calculate_air_pressure(
        dataset['pressure'], dataset['temperature'], dataset['humidity']
    )


# Separate the independent variables from the dependent variable
train_y = train['target']
train_x = train.drop(['id', 'target'], axis=1)
test_x = test.drop('id', axis=1)

# Label encoding
le = LabelEncoder()
train_x['snowing'] = le.fit_transform(train_x['snowing'])

# Labels found only in the test set are appended to the encoder's classes
# before transforming (presumably so transform() accepts them).
unseen_labels = [
    candidate
    for candidate in np.unique(test_x['snowing'])
    if candidate not in le.classes_
]
for label in unseen_labels:
    le.classes_ = np.append(le.classes_, label)
test_x['snowing'] = le.transform(test_x['snowing'])

# Scaling: fit on the training features only, then apply to both sets
scaler = MaxAbsScaler()
scaler.fit(train_x)
transformed_train_X = scaler.transform(train_x)
transformed_test_X = scaler.transform(test_x)


In [None]:
# Hyperparameter optimization with cross-validation
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated MAE of an ExtraTreesRegressor.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the per-fold mean absolute errors on the scaled training
        data (lower is better).
    """
    params = {
        # `step` is passed by keyword: Optuna has deprecated passing it positionally.
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=10),
        'max_depth': trial.suggest_int('max_depth', 20, 50),
        # Single-value ranges keep these settings fixed while still recording
        # them among the trial's parameters.
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 2),
        #'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
        #'max_features': trial.suggest_categorical('max_features', [None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [False])
    }

    model = ExtraTreesRegressor(**params, random_state=42)

    # Fixed shuffle seed so every trial is scored on the same folds.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mae_scores = []
    for train_idx, valid_idx in kf.split(transformed_train_X):
        X_train_fold, X_valid_fold = transformed_train_X[train_idx], transformed_train_X[valid_idx]
        y_train_fold, y_valid_fold = train_y.iloc[train_idx], train_y.iloc[valid_idx]

        model.fit(X_train_fold, y_train_fold)
        preds = model.predict(X_valid_fold)
        mae_scores.append(mean_absolute_error(y_valid_fold, preds))

    return np.mean(mae_scores)

# Seed the sampler so the hyperparameter search is repeatable: the sampler
# is seeded through its own `seed` argument. TPESampler is Optuna's default
# sampler, so the search algorithm itself is unchanged.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=150)  # changed to a reduced n_trials

# Best model: refit on the full training set with the best parameters, then predict
params = study.best_params
model = ExtraTreesRegressor(**params, random_state=42)
model.fit(transformed_train_X, train['target'])
preds = model.predict(transformed_test_X)

In [None]:
# Submission: fill the sample file's target column with the predictions and save it
submit = (
    pd.read_csv('/content/drive/MyDrive/Colab Notebooks/python3/sample_submission.csv')
    .assign(target=preds)
)
submit.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/python3/yangjinyeong.csv',
    index=False,
)

In [None]:
import pandas as pd
import numpy as np
import os
import random
!pip install optuna

from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
import optuna

# Mount Google Drive into the Colab runtime at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

# Seed configuration
def seed_everything(seed=42):
    """Seed the random sources used by this notebook.

    Seeds NumPy's global generator and Python's ``random`` module, and
    writes the seed to the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every source (default 42).
    """
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)


seed_everything(42)

# Load the training and test sets (train first, then test)
train, test = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/Colab Notebooks/python3/train.csv',
        '/content/drive/MyDrive/Colab Notebooks/python3/test.csv',
    ),
)

# Outlier handling: replace implausible pressure readings with the median.
# The same bounds are applied to train and test; previously the test set used
# an upper cutoff of 1200, so test rows in [1100, 1200) kept values that the
# training set never contains.
PRESSURE_LOWER_BOUND = 850   # readings at or below this are replaced
PRESSURE_UPPER_BOUND = 1100  # readings at or above this are replaced

# Medians are taken before any replacement, each from its own data frame.
median_pressure_train = train['pressure'].median()
median_pressure_test = test['pressure'].median()

for frame, median_pressure in ((train, median_pressure_train), (test, median_pressure_test)):
    is_outlier = (
        (frame['pressure'] <= PRESSURE_LOWER_BOUND)
        | (frame['pressure'] >= PRESSURE_UPPER_BOUND)
    )
    frame.loc[is_outlier, 'pressure'] = median_pressure


# Air density calculation
def air_density(pressure, temperature, humidity):
    """Return the density of moist air in kg/m^3.

    The dry-air and water-vapour densities are each obtained from the ideal
    gas law with their own specific gas constant and then summed:

        rho = p_dry / (R_dry * T) + p_vapour / (R_vapour * T)

    Args:
        pressure: Total pressure; it is multiplied by 100 to obtain Pa
            (i.e. expected in hPa).
        temperature: Air temperature in degrees Celsius.
        humidity: Relative humidity in percent.

    Returns:
        Moist-air density; scalars, NumPy arrays and pandas Series are
        all accepted and combined element-wise.
    """
    R_DRY = 287.05      # specific gas constant of dry air, J/(kg·K)
    R_VAPOUR = 461.495  # specific gas constant of water vapour, J/(kg·K)

    T = temperature + 273.15  # Celsius -> Kelvin
    # Vapour pressure: relative humidity times the saturation pressure
    e = (humidity / 100) * 6.105 * np.exp((17.27 * temperature) / (237.7 + temperature))
    pv = e * 100  # vapour partial pressure in Pa
    # Named p_dry (not `pd`) so the pandas alias is not shadowed.
    p_dry = pressure * 100 - pv  # dry-air partial pressure in Pa

    # The vapour term must be divided by R_VAPOUR * T; the previous code
    # divided by the product of the two gas constants instead.
    rho = p_dry / (R_DRY * T) + pv / (R_VAPOUR * T)
    return rho

# Absolute humidity calculation
def calculate_absolute_humidity(temperature, pressure, humidity):
    """Return the absolute humidity derived from temperature and relative humidity.

    Args:
        temperature: Air temperature in degrees Celsius.
        pressure: Not used in the computation; kept in the signature.
        humidity: Relative humidity in percent.

    Returns:
        ``216.7 * (e / T)``, where ``e`` is the vapour pressure and ``T`` the
        temperature in Kelvin.
    """
    kelvin = temperature + 273.15
    saturation_term = np.exp((17.27 * temperature) / (237.7 + temperature))
    vapour_pressure = (humidity / 100) * 6.105 * saturation_term
    return 216.7 * (vapour_pressure / kelvin)

# Partial pressure calculation
def calculate_air_pressure(pressure, temperature, humidity):
    """Return the water-vapour partial pressure in Pa.

    Args:
        pressure: Not used in the computation; kept in the signature.
        temperature: Air temperature in degrees Celsius.
        humidity: Relative humidity in percent.

    Returns:
        The vapour pressure multiplied by 100.
    """
    exponent = (17.27 * temperature) / (237.7 + temperature)
    vapour_pressure = (humidity / 100) * 6.105 * np.exp(exponent)
    return vapour_pressure * 100

# Add air density, absolute humidity and partial pressure to both data frames
for dataset in (train, test):
    dataset['air_density'] = air_density(
        dataset['pressure'], dataset['temperature'], dataset['humidity']
    )
    dataset['absolute_humidity'] = calculate_absolute_humidity(
        dataset['temperature'], dataset['pressure'], dataset['humidity']
    )
    dataset['air_pressure'] = calculate_air_pressure(
        dataset['pressure'], dataset['temperature'], dataset['humidity']
    )


# Separate the independent variables from the dependent variable
train_y = train['target']
train_x = train.drop(['id', 'target'], axis=1)
test_x = test.drop('id', axis=1)

# Label encoding
le = LabelEncoder()
train_x['snowing'] = le.fit_transform(train_x['snowing'])

# Labels found only in the test set are appended to the encoder's classes
# before transforming (presumably so transform() accepts them).
unseen_labels = [
    candidate
    for candidate in np.unique(test_x['snowing'])
    if candidate not in le.classes_
]
for label in unseen_labels:
    le.classes_ = np.append(le.classes_, label)
test_x['snowing'] = le.transform(test_x['snowing'])

# Scaling: fit on the training features only, then apply to both sets
scaler = MaxAbsScaler()
scaler.fit(train_x)
transformed_train_X = scaler.transform(train_x)
transformed_test_X = scaler.transform(test_x)

# Hyperparameter optimization with cross-validation
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated MAE of an ExtraTreesRegressor.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the per-fold mean absolute errors on the scaled training
        data (lower is better).
    """
    params = {
        # `step` is passed by keyword: Optuna has deprecated passing it positionally.
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=10),
        'max_depth': trial.suggest_int('max_depth', 20, 50),
        # Single-value ranges/choices keep these settings fixed while still
        # recording them among the trial's parameters.
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 2),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
        'max_features': trial.suggest_categorical('max_features', [None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [False])
    }

    model = ExtraTreesRegressor(**params, random_state=42)

    # Fixed shuffle seed so every trial is scored on the same folds.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mae_scores = []
    for train_idx, valid_idx in kf.split(transformed_train_X):
        X_train_fold, X_valid_fold = transformed_train_X[train_idx], transformed_train_X[valid_idx]
        y_train_fold, y_valid_fold = train_y.iloc[train_idx], train_y.iloc[valid_idx]

        model.fit(X_train_fold, y_train_fold)
        preds = model.predict(X_valid_fold)
        mae_scores.append(mean_absolute_error(y_valid_fold, preds))

    return np.mean(mae_scores)

# Seed the sampler: it is seeded through its own `seed` argument. TPESampler
# is Optuna's default sampler, so the search algorithm itself is unchanged.
# NOTE: with n_jobs=-1 trials run concurrently, so the order in which they
# finish (and therefore the suggestions) can still differ between runs.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=150, n_jobs=-1)  # changed to a reduced n_trials

# Best model: refit on the full training set with the best parameters, then predict
params = study.best_params
model = ExtraTreesRegressor(**params, random_state=42)
model.fit(transformed_train_X, train['target'])
preds = model.predict(transformed_test_X)

# Submission: fill the sample file's target column with the predictions and save it
submit = (
    pd.read_csv('/content/drive/MyDrive/Colab Notebooks/python3/sample_submission.csv')
    .assign(target=preds)
)
submit.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/python3/yangjinyeong.csv',
    index=False,
)




[I 2023-09-02 11:52:03,548] A new study created in memory with name: no-name-079eb4f2-3f4d-48c7-9ce8-b728ec421038


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


[I 2023-09-02 11:52:44,732] Trial 7 finished with value: 1.911923065213528 and parameters: {'n_estimators': 120, 'max_depth': 48, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'bootstrap': False}. Best is trial 7 with value: 1.911923065213528.
[I 2023-09-02 11:53:20,385] Trial 0 finished with value: 2.010636665227877 and parameters: {'n_estimators': 380, 'max_depth': 26, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': None, 'bootstrap': False}. Best is trial 7 with value: 1.911923065213528.
[I 2023-09-02 11:53:26,390] Trial 5 finished with value: 2.0518491244015538 and parameters: {'n_estimators': 450, 'max_depth': 33, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': None, 'bootstrap': False}. Best is trial 7 with value: 1.911923065213528.
[I 2023-09-02 11:53:36,400] Trial 4 finished with value: 1.9400258474398455 and parameters: {'n_estimators': 350, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None