# 이진 분류 후 Multiclassification

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pickle
import optuna
from optuna.samplers import TPESampler
# from optuna.integration import CatBoostPruningCallback

from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
from catboost import Pool, CatBoostClassifier
# from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split

# from sklearn.preprocessing import TargetEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer

import warnings
warnings.filterwarnings(action='ignore')

pd.set_option("display.max_columns", None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_local_path(train_dataset_name, val_dataset_name, test_dataset_name):

    # 작업 디렉토리 origin_dir에 선언
    origin_dir = os.getcwd()

    # ".." : 상위 디렉토리로 이동
    os.chdir("..")

    # 상위 디렉토리에 fog_data가 없다면 fog_data 폴더 생성
    # (이미 폴더 있어도 exist_ok=True면 넘어감)
    os.makedirs("fog_data", exist_ok=True)

    # train/test 데이터셋 경로 잡아준다
    train_path = os.path.join(os.getcwd(), "fog_data", train_dataset_name)
    val_path = os.path.join(os.getcwd(), "fog_data", val_dataset_name)
    test_path = os.path.join(os.getcwd(), "fog_data", test_dataset_name)

    # 운영체제가 윈도우일 경우, "\\"를 "/"로 바꿔줘야 한다
    if os.name == "nt":
        train_path = train_path.replace("\\", "/")
        val_path = val_path.replace("\\", "/")
        test_path = test_path.replace("\\", "/")

    # origin_dir로 경로 다시 변경 (초기화)
    os.chdir(origin_dir)

    return train_path, val_path, test_path

In [3]:
# 전처리 중간에 nan 제거해주고 이상치 처리한 데이터프레임
train_path, val_path, test_path = get_local_path(
    # "fog_train_merged_std.csv",
    # "fog_val_merged_std.csv",
    # "fog_test_merged_std.csv"
    "fog_train_merged_month_stn_std.csv",
    "fog_val_merged_month_stn_std.csv",
    "fog_test_merged_month_stn_std.csv"
    )

## 1. 이진분류

In [4]:
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)

In [5]:
# train, test 데이터셋에서 -99, -99.9 포함한 컬럼 동일하기 때문에 같이 변수로 선언
train_cols_na = train_df.columns[train_df.isna().any()].tolist()
val_cols_na = val_df.columns[val_df.isna().any()].tolist()
test_cols_na = test_df.columns[test_df.isna().any()].tolist()

## 1-1. 결측치 채우기

In [11]:
def fill_missing_values_with_monthly_mean(df, na_columns):

    df_filled = df.copy()

    # month, stn_id로 groupby 한 후 na_columns별로 각각 평균을 구한다
    grouped_means = df.groupby(['month', 'stn_id'])[na_columns].transform('mean')

    # na_columns별 결측치를 각 컬럼들의 결측치로 대체한다
    for column in na_columns:
        df_filled[column] = df_filled[column].fillna(grouped_means[column])

    return df_filled

In [12]:
train_df = fill_missing_values_with_monthly_mean(train_df, train_cols_na)
val_df = fill_missing_values_with_monthly_mean(val_df, val_cols_na)
test_df = fill_missing_values_with_monthly_mean(test_df, test_cols_na)

## 1-1. RandomForest 이진분류

### 1-1-1. train, val, test 데이터셋 분리

In [13]:
# target 컬럼
target_binary = "is_fog"
# # 모델 생성시 사용할 컬럼 지정
# used_cols = ["month", "time", "stn_id", "ta", "ts", "temp_diff", "hm",
#              "re", "ws10_ms", "sun10", "dew_reached", "ws10_deg"]

# 8방위 사용
used_cols = ["month", "time", "stn_id", "ta", "ts", "temp_diff", "hm",
             "re", "ws10_ms", "sun10", "dew_reached", "t_td", "ws10_deg"]

# dew_reached 제거, 풍향 제거
# used_cols = ["month", "time", "stn_id", "ta", "ts", "temp_diff", "hm",
#              "re", "ws10_ms", "sun10"]

# 풍향 제거
# used_cols = ["month", "time", "stn_id", "ta", "ts", "temp_diff", "hm",
#              "re", "ws10_ms", "sun10", "dew_reached"]

X_train_binary = train_df[used_cols]
y_train_binary = train_df[target_binary]

X_val_binary = val_df[used_cols]
y_val_binary = val_df[target_binary]

X_test_binary = test_df[used_cols]

In [14]:
# target 컬럼
target = "class"
# # 모델 생성시 사용할 컬럼 지정
# used_cols = ["month", "time", "stn_id", "ta", "ts", "temp_diff", "hm",
#              "re", "ws10_ms", "sun10", "dew_reached", "ws10_deg"]

# 8방위 사용
used_cols = ["month", "time", "stn_id", "ta", "ts", "temp_diff", "hm",
             "re", "ws10_ms", "sun10", "dew_reached",  "t_td", "ws10_dir"]

# dew_reached 제거, 풍향 제거
# used_cols = ["month", "time", "stn_id", "ta", "ts", "temp_diff", "hm",
#              "re", "ws10_ms", "sun10"]

# 풍향 제거
# used_cols = ["month", "time", "stn_id", "ta", "ts", "temp_diff", "hm",
#              "re", "ws10_ms", "sun10", "dew_reached"]

X_train = train_df[used_cols]
y_train = train_df[target]

X_val = val_df[used_cols]
y_val = val_df[target]

X_test = test_df[used_cols]

In [15]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape, X_test.shape

((2350456, 13), (783486, 13), (2350456,), (783486,), (262800, 13))

#### 1-2. CSI Index

In [16]:
# CSI 지수 함수 생성
def csi_index(y_true, y_pred):

    model_cm = confusion_matrix(y_true, y_pred)

    H = (model_cm[0][0] + model_cm[1][1] + model_cm[2][2])
    F = (model_cm[0][1] + model_cm[0][2] +
            model_cm[1][0] + model_cm[1][2] +
            model_cm[2][0] + model_cm[2][1] +
            model_cm[3][0] + model_cm[3][1] + model_cm[3][2])
    M = (model_cm[0][3] + model_cm[1][3] + model_cm[2][3])

    model_csi = H / (H + F + M)

    return model_csi

csi_scorer = make_scorer(csi_index, greater_is_better=True)

#### 1-1-3. Label Encoding

In [17]:
# stn_id를 각각 1 ~ 5로 정한 딕셔너리 생성
stn_id_map = {"A" : 1,
              "B" : 2,
              "C" : 3,
              "D" : 4,
              "E" : 5}

# stn_id_map을 바탕으로 stn_id 컬럼 mapping
X_train_binary["stn_id"] = X_train_binary["stn_id"].map(stn_id_map)
X_val_binary["stn_id"] = X_val_binary["stn_id"].map(stn_id_map)
X_test_binary["stn_id"] = X_test_binary["stn_id"].map(stn_id_map)

#### 1-3. 모델 학습

In [19]:
# stn_list=["A","B","C","D","E"]
stn_list = range(1, 6)

X_val_binary_sorted = X_val_binary.sort_values('stn_id')
y_val_binary_sorted = y_val_binary[X_val_binary.index]

y_pred = pd.DataFrame()

models = {i: RandomForestClassifier(random_state=42)  for i in stn_list}

for i in stn_list:

    now_idx_train = X_train_binary[X_train_binary['stn_id'] == i].index
    now_idx_val = X_val_binary_sorted[X_val_binary_sorted['stn_id'] == i].index

    now_idx_y_train = y_train_binary[X_train_binary['stn_id'] == i].index
    now_idx_y_val = y_val_binary_sorted[X_val_binary_sorted['stn_id'] == i].index

    models[i].fit(X_train_binary.loc[now_idx_train], y_train_binary.loc[now_idx_train])
    y_pred_now = models[i].predict(X_val_binary_sorted.loc[now_idx_val])

    # models[i].fit(now_idx_train, now_idx_y_train, cat_features=cat_features)
    # y_pred_now = models[i].predict(now_idx_val)

    y_pred = pd.concat([y_pred, pd.DataFrame(index=now_idx_val, data=y_pred_now)],
                    axis=0,
                    sort=False,
                    ignore_index=False)

    # csi = csi_index(y_val_binary_sorted.loc[now_idx_val], y_pred_now)
    # precision =  precision_score(y_val_binary_sorted.loc[now_idx_val], y_pred_now, average='binary', pos_label='1')
    cm = confusion_matrix(y_val_binary_sorted.loc[now_idx_val], y_pred_now)
    # print(f"\t{i} 학습 완료! 부분 CSI: {np.round(precision, 5)}")
    print(cm)

[[310839     91]
 [  1116    517]]
[[153040    289]
 [  1450   2025]]
[[78053     9]
 [  333   109]]
[[115782     80]
 [   899    855]]
[[117229     40]
 [   340    390]]


In [33]:
def binary_rf(X_train_binary, y_train_binary, X_val_binary, y_val_binary, X_test_binary):
    stn_list = range(1, 6)

    X_val_binary_sorted = X_val_binary.sort_values('stn_id')
    y_val_binary_sorted = y_val_binary.loc[X_val_binary_sorted.index]

    y_train_pred_df = pd.Series(index=X_train_binary.index, dtype=float)
    y_val_pred_df = pd.Series(index=X_val_binary.index, dtype=float)
    y_test_pred_df = pd.Series(index=X_test_binary.index, dtype=float)

    models = {i: RandomForestClassifier(random_state=42) for i in stn_list}

    for i in stn_list:
        now_idx_train = X_train_binary[X_train_binary['stn_id'] == i].index
        now_idx_val = X_val_binary_sorted[X_val_binary_sorted['stn_id'] == i].index
        now_idx_test = X_test_binary[X_test_binary['stn_id'] == i].index

        models[i].fit(X_train_binary.loc[now_idx_train], y_train_binary.loc[now_idx_train])

        y_train_pred_now = models[i].predict(X_train_binary.loc[now_idx_train])
        y_val_pred_now = models[i].predict(X_val_binary_sorted.loc[now_idx_val])
        y_test_pred_now = models[i].predict(X_test_binary.loc[now_idx_test])

        y_train_pred_df.loc[now_idx_train] = y_train_pred_now
        y_val_pred_df.loc[now_idx_val] = y_val_pred_now
        y_test_pred_df.loc[now_idx_test] = y_test_pred_now

    return y_train_pred_df, y_val_pred_df, y_test_pred_df

In [34]:
y_train_pred_rf, y_val_pred_rf, y_test_pred_rf = binary_rf(X_train_binary, y_train_binary, X_val_binary, y_val_binary, X_test_binary)

In [36]:
y_train_pred_rf.value_counts(normalize=True)

0.0    0.989758
1.0    0.010242
Name: proportion, dtype: float64

In [39]:
confusion_matrix(y_train_binary, y_train_pred_rf)

array([[2326345,      11],
       [     37,   24063]], dtype=int64)

In [37]:
y_val_pred_rf.value_counts(normalize=True)

0.0    0.994378
1.0    0.005622
Name: proportion, dtype: float64

In [40]:
confusion_matrix(y_val_binary, y_val_pred_rf)

array([[774943,    509],
       [  4138,   3896]], dtype=int64)

In [38]:
y_test_pred_rf.value_counts(normalize=True)

0.0    0.995944
1.0    0.004056
Name: proportion, dtype: float64

In [None]:
# save
with open('y_train_pred_rf.pickle', 'wb') as f:
    pickle.dump(y_train_pred_rf, f)

with open('y_val_pred_rf.pickle', 'wb') as f:
    pickle.dump(y_val_pred_rf, f)

with open('y_test_pred_rf.pickle', 'wb') as f:
    pickle.dump(y_test_pred_rf, f)

In [41]:
# save
with open('y_train_pred_rf.pickle', 'wb') as f:
    pickle.dump(y_train_pred_rf, f)

with open('y_val_pred_rf.pickle', 'wb') as f:
    pickle.dump(y_val_pred_rf, f)

with open('y_test_pred_rf.pickle', 'wb') as f:
    pickle.dump(y_test_pred_rf, f)

In [42]:
# load
with open('y_train_pred_rf.pickle', 'rb') as f:
    y_pred_t = pickle.load(f)

# load
with open('y_val_pred_rf.pickle', 'rb') as f:
    y_pred_v = pickle.load(f)

# load
with open('y_test_pred_rf.pickle', 'rb') as f:
    y_pred_te = pickle.load(f)

In [109]:
y_train_pred_rf.isnull().sum()

0

In [110]:
y_val_pred_rf.isnull().sum()

0

In [111]:
y_test_pred_rf.isnull().sum()

0

#### classification

In [54]:
y_train_pred_rf = y_train_pred_rf.astype("int")
y_val_pred_rf = y_val_pred_rf.astype("int")
y_test_pred_rf = y_test_pred_rf.astype("int")

In [57]:
rf_y_train_pred_idx = y_train_pred_rf[y_train_pred_rf==1].index
rf_y_val_pred_idx = y_val_pred_rf[y_val_pred_rf==1].index
rf_y_test_pred_idx = y_test_pred_rf[y_test_pred_rf==1].index

In [67]:
X_train_cl = X_train_binary.loc[rf_y_train_pred_idx]
X_val_cl = X_val_binary.loc[rf_y_val_pred_idx]
X_test_cl = X_test_binary.loc[rf_y_test_pred_idx]

In [118]:
X_train_cl[X_train_cl["stn_id"]==3]

Unnamed: 0,month,time,stn_id,ta,ts,temp_diff,hm,re,ws10_ms,sun10,dew_reached,t_td,ws10_deg
679,7,19,3,20.0,21.2,-1.2,90.6,0.0,1.5,0.000000,0,1.535988,317.6
826,2,6,3,8.2,4.0,4.2,56.7,0.0,2.2,0.000000,0,7.815944,238.8
1006,11,7,3,11.1,9.0,2.1,87.0,0.0,2.1,0.000000,1,2.013352,229.8
2250,8,16,3,24.2,26.7,-2.5,89.1,0.0,2.8,0.040000,0,1.853493,323.6
3811,4,1,3,9.6,7.6,2.0,75.7,0.0,3.4,0.000000,0,3.943753,236.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2245212,2,19,3,8.7,6.6,2.1,92.2,0.0,0.9,0.000000,1,1.155216,212.2
2245319,10,7,3,7.3,5.7,1.6,77.4,0.0,2.1,0.000000,0,3.566072,218.6
2286689,3,6,3,1.3,1.4,-0.1,93.4,1.0,0.0,0.102947,0,0.913775,0.0
2297166,2,15,3,2.8,6.0,-3.2,55.3,0.0,9.3,0.090583,0,7.795464,240.9


In [119]:
X_val_cl[X_val_cl["stn_id"]==3]

Unnamed: 0,month,time,stn_id,ta,ts,temp_diff,hm,re,ws10_ms,sun10,dew_reached,t_td,ws10_deg
7816,10,19,3,17.4,17.7,-0.3,85.4,0.0,0.9,0.00,0,2.397258,191.0
15485,12,5,3,-0.5,1.7,-2.2,95.2,1.0,1.2,0.00,0,0.649003,233.7
16504,12,1,3,0.5,1.7,-1.2,97.0,1.0,2.8,0.00,0,0.405744,318.3
16686,9,1,3,19.5,20.2,-0.7,89.0,0.0,0.7,0.00,0,1.804156,171.6
19470,2,16,3,9.1,8.4,0.7,87.7,1.0,0.5,0.03,0,1.867669,227.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...
717415,9,2,3,12.9,12.8,0.1,84.2,0.0,1.4,0.00,0,2.517887,211.5
720373,9,16,3,18.5,20.7,-2.2,92.8,1.0,1.5,0.02,0,1.150710,313.7
720833,3,20,3,0.6,0.2,0.4,99.6,1.0,1.1,0.00,1,0.053516,29.3
736852,3,12,3,9.2,11.5,-2.3,95.6,1.0,2.2,0.03,0,0.644091,314.5


In [121]:
X_test_cl

Unnamed: 0,month,time,stn_id,ta,ts,temp_diff,hm,re,ws10_ms,sun10,dew_reached,t_td,ws10_deg
42,1,7,1,-7.3,-3.6,-3.7,95.7,0.0,0.7,0.00,0,0.546824,279.3
48,1,8,1,-7.9,-3.8,-4.1,96.7,0.0,0.5,0.00,0,0.415527,278.0
51,1,8,1,-7.9,-3.9,-4.0,96.1,0.0,0.6,0.02,0,0.492431,54.3
52,1,8,1,-7.7,-3.9,-3.8,95.7,0.0,0.5,0.03,0,0.544899,112.2
53,1,8,1,-7.7,-3.8,-3.9,95.5,0.0,0.5,0.06,0,0.570770,56.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173388,4,2,4,14.1,14.7,-0.6,92.3,0.0,3.6,0.00,0,1.191006,221.4
180868,6,0,4,19.0,20.3,-1.3,90.5,0.0,1.9,0.00,0,1.540904,100.0
186951,7,6,4,24.0,24.4,-0.4,96.6,1.0,4.4,0.00,0,0.557479,158.2
195834,9,23,4,21.4,20.2,1.2,81.9,0.0,1.5,0.00,0,3.121637,259.8


In [120]:
X_test_cl[X_test_cl["stn_id"]==3]

Unnamed: 0,month,time,stn_id,ta,ts,temp_diff,hm,re,ws10_ms,sun10,dew_reached,t_td,ws10_deg


In [114]:
y_train_cl = y_train.loc[rf_y_train_pred_idx]
y_val_cl = y_val.loc[rf_y_val_pred_idx]
# y_test_cl = y_test_binary.loc[rf_y_test_pred_idx]

In [99]:
def class_rf(X_train_cl, y_train_cl, X_val_cl, y_val_cl, X_test_cl):
    stn_list = range(1, 6)

    X_val_cl_sorted = X_val_cl.sort_values('stn_id')
    y_val_cl_sorted = y_val_cl.loc[X_val_cl_sorted.index]

    y_train_pred_df = pd.Series(index=X_train_cl.index, dtype=int)
    y_val_pred_df = pd.Series(index=X_val_cl.index, dtype=int)
    y_test_pred_df = pd.Series(index=X_test_cl.index, dtype=int)

    models = {i: RandomForestClassifier(random_state=42) for i in stn_list}

    for i in stn_list:
        now_idx_train = X_train_cl[X_train_cl['stn_id'] == i].index
        now_idx_val = X_val_cl_sorted[X_val_cl_sorted['stn_id'] == i].index
        now_idx_test = X_test_cl[X_test_cl['stn_id'] == i].index

        # Check if there are samples for the current stn_id
        if len(now_idx_train) == 0 or len(now_idx_val) == 0 or len(now_idx_test) == 0:
            print(f"No samples found for stn_id {i} in one of the datasets, skipping...")
            continue

        models[i].fit(X_train_cl.loc[now_idx_train], y_train_cl.loc[now_idx_train])

        y_train_pred_now = models[i].predict(X_train_cl.loc[now_idx_train])
        y_val_pred_now = models[i].predict(X_val_cl_sorted.loc[now_idx_val])
        y_test_pred_now = models[i].predict(X_test_cl.loc[now_idx_test])

        y_train_pred_df.loc[now_idx_train] = y_train_pred_now
        y_val_pred_df.loc[now_idx_val] = y_val_pred_now
        y_test_pred_df.loc[now_idx_test] = y_test_pred_now

    return y_train_pred_df, y_val_pred_df, y_test_pred_df

In [100]:
y_train_pred_cl, y_val_pred_cl, y_test_pred_cl = class_rf(X_train_cl, y_train_cl, X_val_cl, y_val_cl, X_test_cl)

No samples found for stn_id 3 in one of the datasets, skipping...
No samples found for stn_id 5 in one of the datasets, skipping...


In [103]:
y_train_cl

178        1
200        2
282        3
342        1
495        2
          ..
2348412    3
2348565    1
2349062    3
2349193    3
2350022    2
Name: class, Length: 24074, dtype: int64

In [108]:
y_train_pred_cl

178        1.0
200        2.0
282        3.0
342        NaN
495        2.0
          ... 
2348412    3.0
2348565    NaN
2349062    3.0
2349193    3.0
2350022    2.0
Length: 24074, dtype: float64

In [106]:
y_test_pred_cl

42        1.0
48        2.0
51        1.0
52        2.0
53        2.0
         ... 
173388    1.0
180868    3.0
186951    3.0
195834    2.0
195838    2.0
Length: 1066, dtype: float64

In [94]:
def class_rf(X_train_cl, y_train_cl, X_val_cl, y_val_cl, X_test_cl):
    stn_list = range(1, 6)

    X_val_cl_sorted = X_val_cl.sort_values('stn_id')
    y_val_cl_sorted = y_val_cl.loc[X_val_cl_sorted.index]

    y_train_pred_df = pd.Series(index=X_train_cl.index, dtype=int)
    y_val_pred_df = pd.Series(index=X_val_cl.index, dtype=int)
    y_test_pred_df = pd.Series(index=X_test_cl.index, dtype=int)

    models = {i: RandomForestClassifier(random_state=42) for i in stn_list}

    for i in stn_list:
        now_idx_train = X_train_cl[X_train_cl['stn_id'] == i].index
        now_idx_val = X_val_cl_sorted[X_val_cl_sorted['stn_id'] == i].index
        now_idx_test = X_test_cl[X_test_cl['stn_id'] == i].index

        # Check if there are samples for the current stn_id
        if len(now_idx_train) == 0 or len(now_idx_val) == 0 or len(now_idx_test) == 0:
            print(f"No samples found for stn_id {i} in one of the datasets, skipping...")
            continue

        models[i].fit(X_train_cl.loc[now_idx_train], y_train_cl.loc[now_idx_train])

        y_train_pred_now = models[i].predict(X_train_cl.loc[now_idx_train])
        y_val_pred_now = models[i].predict(X_val_cl_sorted.loc[now_idx_val])
        y_test_pred_now = models[i].predict(X_test_cl.loc[now_idx_test])

        y_train_pred_df.loc[now_idx_train] = y_train_pred_now
        y_val_pred_df.loc[now_idx_val] = y_val_pred_now
        y_test_pred_df.loc[now_idx_test] = y_test_pred_now

    return y_train_pred_df, y_val_pred_df, y_test_pred_df

In [95]:
y_train_pred_cl, y_val_pred_cl, y_test_pred_cl = class_rf(X_train_cl, y_train_cl, X_val_cl, y_val_cl, X_test_cl)

No samples found for stn_id 3 in one of the datasets, skipping...
No samples found for stn_id 5 in one of the datasets, skipping...


In [116]:
X_train_cl

Unnamed: 0,month,time,stn_id,ta,ts,temp_diff,hm,re,ws10_ms,sun10,dew_reached,t_td,ws10_deg
178,2,7,1,5.000000,1.200000,3.8,98.3,0.000000,0.6,0.000000,1,0.237423,268.5
200,12,6,2,-1.800000,-0.100000,-1.7,92.5,0.000000,0.0,0.000000,0,1.015562,0.0
282,10,4,1,19.100000,19.200000,-0.1,98.8,0.000000,0.0,0.000000,0,0.187512,0.0
342,7,4,5,23.400000,23.800000,-0.4,99.9,0.000000,2.0,0.000000,0,0.016083,40.6
495,8,3,2,16.200000,19.100000,-2.9,98.3,0.000000,0.8,0.000000,0,0.260173,317.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2348412,1,20,2,2.200000,0.600000,1.6,95.5,0.034447,0.3,0.000000,1,0.621706,135.6
2348565,6,3,5,22.500000,22.900000,-0.4,99.9,0.000000,3.4,0.136851,0,0.015971,216.8
2349062,12,8,2,-18.200000,0.783418,81.7,77.5,0.000000,0.0,0.010000,1,2.842773,0.0
2349193,11,20,4,10.300000,10.400000,-0.1,97.7,0.000000,0.0,0.064884,0,0.336513,0.0


In [96]:
y_train_pred_cl

178        1.0
200        2.0
282        3.0
342        NaN
495        2.0
          ... 
2348412    3.0
2348565    NaN
2349062    3.0
2349193    3.0
2350022    2.0
Length: 24074, dtype: float64

In [122]:
# stn_id별로 모델 X

def class_rf(X_train_cl, y_train_cl, X_val_cl, y_val_cl, X_test_cl):
    # Initialize RandomForestClassifier
    model = RandomForestClassifier(random_state=42)

    # Fit the model on the entire training data
    model.fit(X_train_cl, y_train_cl)

    # Predict on the training data
    y_train_pred = model.predict(X_train_cl)
    
    # Predict on the validation data
    y_val_pred = model.predict(X_val_cl)
    
    # Predict on the test data
    y_test_pred = model.predict(X_test_cl)

    # Convert predictions to Series with the original index
    y_train_pred_df = pd.Series(y_train_pred, index=X_train_cl.index)
    y_val_pred_df = pd.Series(y_val_pred, index=X_val_cl.index)
    y_test_pred_df = pd.Series(y_test_pred, index=X_test_cl.index)

    return y_train_pred_df, y_val_pred_df, y_test_pred_df

In [123]:
y_train_pred_cl, y_val_pred_cl, y_test_pred_cl = class_rf(X_train_cl, y_train_cl, X_val_cl, y_val_cl, X_test_cl)

In [130]:
y_val_pred_cl.isnull().sum()

0

In [131]:
y_test_pred_cl.isnull().sum()

0

In [132]:
rf_y_val_pred_idx_zero = y_val_pred_rf[y_val_pred_rf==0].index
rf_y_test_pred_idx_zero = y_test_pred_rf[y_test_pred_rf==0].index

In [139]:
y_val_pred_cl

50        2
287       1
324       3
345       2
639       1
         ..
779928    3
780608    1
780901    2
781445    2
781555    3
Length: 4405, dtype: int64

In [138]:
rf_y_val_pred_idx_zero

Index([     0,      1,      2,      3,      4,      5,      6,      7,      8,
            9,
       ...
       783476, 783477, 783478, 783479, 783480, 783481, 783482, 783483, 783484,
       783485],
      dtype='int64', length=779081)

In [144]:
y_val_cl_zero = y_val.loc[rf_y_val_pred_idx_zero]

In [145]:
y_val_cl_zero

0         4
1         4
2         4
3         4
4         4
         ..
783481    4
783482    4
783483    4
783484    4
783485    4
Name: class, Length: 779081, dtype: int64

In [148]:
y_val_final = pd.concat([y_val_cl_zero, y_val_pred_cl], axis=0).sort_index()

In [149]:
y_val_final

0         4
1         4
2         4
3         4
4         4
         ..
783481    4
783482    4
783483    4
783484    4
783485    4
Length: 783486, dtype: int64

In [150]:
y_val

0         4
1         4
2         4
3         4
4         4
         ..
783481    4
783482    4
783483    4
783484    4
783485    4
Name: class, Length: 783486, dtype: int64

In [151]:
csi_index(y_val,y_val_final)

0.8024113309141988

In [153]:
confusion_matrix(y_val, y_val_final)

array([[  1607,    330,     30,      0],
       [   248,   2542,    232,      0],
       [    33,    306,   2706,      0],
       [    59,    193,    257, 774943]], dtype=int64)