# Proposed idea 2
1. Preprocess `X`
2. Propagate labels to the unlabeled samples (`y == 9999999`) by clustering them with `OPTICS`

# Import packages

In [1]:
from analysis_tools.common import *

# Load IPython's autoreload extension and enable mode 2, which reloads imported
# modules before each cell executes so edits to local packages are picked up.
%load_ext autoreload
%autoreload 2

# Seed NumPy's global (legacy) random generator.
# NOTE(review): RANDOM_STATE is not defined in this cell — presumably it comes
# from the star import above; confirm.
np.random.seed(RANDOM_STATE)

# 2017년 데이터

# 1. Load dataset

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Load the 2017 train/test CSVs; the first CSV column is used as the row index.
train_full_data = pd.read_csv(join(PATH.TRAIN, 'KNOW_2017.csv'), index_col=0)
X_test          = pd.read_csv(join(PATH.TEST, 'KNOW_2017_test.csv'), index_col=0)
target          = 'knowcode'

# Keep an untouched copy of the raw training frame.
train_full_data_ = copy(train_full_data)
X_train_full = train_full_data.drop(columns=target)
y_train_full = train_full_data[target]

# Pass the seed explicitly so the split is identical every time this cell runs,
# not only on the first run after the global NumPy seed was set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, stratify=y_train_full, random_state=RANDOM_STATE
)

# One-hot encode the target. The encoder is fitted on the training labels only.
# Convert to NumPy before adding the column axis: multi-dimensional indexing on
# a pandas Series (`series[:, None]`) is deprecated and rejected by pandas 2.x.
# NOTE(review): `sparse=` was renamed to `sparse_output=` in scikit-learn 1.2;
# switch the keyword once the environment is pinned to >= 1.2.
oh_enc     = OneHotEncoder(sparse=False)
y_train_oh = oh_enc.fit_transform(y_train.to_numpy()[:, None])
y_val_oh   = oh_enc.transform(y_val.to_numpy()[:, None])

X_train.shape, y_train.shape, y_train_oh.shape, X_val.shape, y_val.shape, y_val_oh.shape, X_test.shape

((7114, 154),
 (7114,),
 (7114, 538),
 (2372, 154),
 (2372,),
 (2372, 538),
 (9486, 154))

# 2. Preprocessing

In [3]:
from analysis_tools.preprocessing import *

In [4]:
# Baseline variant: fit the baseline preprocessor on the training features only,
# then apply the fitted transformer to the validation and test features.
preprocessor_baseline = get_preprocessor_baseline()

data_baseline = {}
data_baseline['X_train'] = preprocessor_baseline.fit_transform(X_train)
data_baseline['y_train'] = y_train
data_baseline['X_val']   = preprocessor_baseline.transform(X_val)
data_baseline['y_val']   = y_val
data_baseline['X_test']  = preprocessor_baseline.transform(X_test)

# Sanity check: one line per split with its shape.
for split_name in data_baseline:
    print(split_name, data_baseline[split_name].shape)

X_train (7114, 154)
y_train (7114,)
X_val (2372, 154)
y_val (2372,)
X_test (9486, 154)


In [5]:
# Variant "proposed1": same procedure as the baseline but with `preprocessor1`
# (fit on the training features, transform-only for validation and test).
preprocessor1 = get_preprocessor1()

data1 = {
    'X_train': preprocessor1.fit_transform(X_train),
    'y_train': y_train,
    'X_val': preprocessor1.transform(X_val),
    'y_val': y_val,
    'X_test': preprocessor1.transform(X_test),
}

# Sanity check: one line per split with its shape.
for split_name, split_values in data1.items():
    print(split_name, split_values.shape)

X_train (7114, 261)
y_train (7114,)
X_val (2372, 261)
y_val (2372,)
X_test (9486, 261)


## 2.1 Label propagation

In [None]:
def preprocess2_y(X, y):
    """Split the catch-all "unknown" label (9999999) into pseudo-labels via OPTICS.

    Samples whose label equals 9999999 are clustered on their feature rows.
    Each sample assigned to cluster ``c`` (``c >= 0``) is relabelled
    ``int("9999999" + str(c))``. Samples that OPTICS marks as noise (``-1``)
    each get their own label ``int("-9999999" + str(i))``, with ``i`` counting
    from 1 in row order. All other labels are left untouched.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows, selected with ``X.loc`` using the index labels of ``y``.
        Assumes ``X`` and ``y`` share a unique index.
    y : pandas.Series
        Labels; entries equal to 9999999 are treated as unknown.

    Returns
    -------
    pandas.Series
        A relabelled copy of ``y`` with ``category`` dtype. ``y`` itself is not
        modified.
    """
    unknown_code = 9999999
    idxs_unknown_label = y[y == unknown_code].index
    y_return = y.copy()

    # Nothing to propagate: OPTICS cannot be fitted on zero samples.
    if len(idxs_unknown_label) == 0:
        return y_return.astype('category')

    X_unknown = X.loc[idxs_unknown_label]

    # A single fit is enough; fit_predict returns the fitted cluster labels.
    cluster_ids = np.asarray(OPTICS(n_jobs=-1).fit_predict(X_unknown))

    # Start from the unknown code so any unexpected cluster id keeps its label.
    new_labels = np.full(len(cluster_ids), unknown_code, dtype=np.int64)

    # Allocate one pseudo-label per cluster
    for cluster_id in np.unique(cluster_ids[cluster_ids > -1]):
        new_labels[cluster_ids == cluster_id] = int(f"9999999{cluster_id}")

    # Every anomaly (noise point) becomes its own singleton class
    is_anomaly = cluster_ids == -1
    n_anomalies = int(is_anomaly.sum())
    new_labels[is_anomaly] = np.array(
        [int(f"-9999999{idx}") for idx in range(1, n_anomalies + 1)],
        dtype=np.int64,
    )

    # Write the pseudo-labels back in the same row order they were selected
    y_return.loc[idxs_unknown_label] = new_labels
    return y_return.astype('category')

def postprocess2_y(y):
    """Collapse every pseudo-label produced by `preprocess2_y` back to "9999999".

    The labels are converted to strings, and any value containing the substring
    ``9999999`` is replaced by exactly ``"9999999"``.

    Returns
    -------
    pandas.DataFrame
        String labels with ``category`` dtype.
    """
    labels_as_text = pd.DataFrame(y, dtype=str)
    collapsed = labels_as_text.replace("^.*9999999.*$", "9999999", regex=True)
    return collapsed.astype('category')

# Preview: run the label propagation on the `preprocessor1` training features.
# The bare `y` on the last line displays the relabelled Series as the cell output.
y = preprocess2_y(data1['X_train'], y_train)
y

In [7]:
# Variant "proposed2": features come from `preprocessor1` (refitted here on the
# training features), and the training labels are replaced by the OPTICS
# pseudo-labels. Validation labels are kept as-is.
data2 = {}
data2['X_train'] = preprocessor1.fit_transform(X_train)
data2['y_train'] = preprocess2_y(data2['X_train'], y_train)
data2['X_val']   = preprocessor1.transform(X_val)
data2['y_val']   = y_val
data2['X_test']  = preprocessor1.transform(X_test)

# Report the shapes of data2 itself (the loop previously iterated over data1,
# so this cell never actually checked the dictionary it had just built).
for k, v in data2.items():
    print(k, v.shape)

X_train (7114, 261)
y_train (7114,)
X_val (2372, 261)
y_val (2372,)
X_test (9486, 261)


# 3. Training & evaluation

In [25]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Fit the same random-forest configuration on each data variant and report the
# macro-averaged F1 score on the training and validation splits.
model = RandomForestClassifier(n_estimators=700, n_jobs=-1, random_state=RANDOM_STATE)
experiments = {'baseline': data_baseline, 'proposed1': data1, 'proposed2': data2}

for name, data in experiments.items():
    X_t, X_v = data['X_train'], data['X_val']
    y_t, y_v = data['y_train'], data['y_val']

    model.fit(X_t, y_t)
    p_t, p_v = model.predict(X_t), model.predict(X_v)

    if name == 'proposed2':
        # Map pseudo-labels back to the single unknown class before scoring,
        # for both the targets and the predictions.
        y_t, y_v, p_t, p_v = (postprocess2_y(labels) for labels in (y_t, y_v, p_t, p_v))

    f1_train = f1_score(y_t, p_t, average='macro')
    f1_val = f1_score(y_v, p_v, average='macro')
    print(f"- {name} | Train: {f1_train:.2f} | Val: {f1_val}")

- baseline | Train: 1.00 | Val: 0.41727582904140587
- proposed1 | Train: 1.00 | Val: 0.553692466942369
- proposed2 | Train: 1.00 | Val: 0.5660168113449305
CPU times: user 6min, sys: 58.4 s, total: 6min 58s
Wall time: 39.8 s
