# Proposed idea3
1. Preprocess `X`
2. Propagate labels to the unlabeled samples (`y = 9999999`) using `OPTICS`
3. NLP

# Import packages

In [1]:
from analysis_tools.common import *

%load_ext autoreload
%autoreload 2

# Seed NumPy's global RNG so that later calls which draw from it without an
# explicit random_state are repeatable on a fresh top-to-bottom run.
# NOTE(review): `np` and `RANDOM_STATE` are presumably provided by the star
# import from analysis_tools.common — confirm.
np.random.seed(RANDOM_STATE)

# 2017년 데이터

# 1. Load dataset

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Load the 2017 training table (features + target) and the unlabeled test table.
train_full_data = pd.read_csv(join(PATH.TRAIN, 'KNOW_2017.csv'), index_col=0)
X_test          = pd.read_csv(join(PATH.TEST, 'KNOW_2017_test.csv'), index_col=0)
target          = 'knowcode'
# Free-text survey columns; they get NLP-specific handling further down.
nlp_cols        = ['bq4_1a', 'bq4_1b', 'bq4_1c', 'bq5_2', 'bq19_1', 'bq30', 'bq31', 'bq32', 'bq33', 'bq34', 'bq38_1']

# train_full_ratio   = 0.3
# _, train_full_data = train_test_split(train_full_data, stratify=train_full_data[target], test_size=train_full_ratio)

# Keep an untouched copy of the full training table before splitting it.
train_full_data_ = copy(train_full_data)
X_train_full = train_full_data.drop(columns=target)
y_train_full = train_full_data[target]

# Stratified hold-out split (scikit-learn's default test_size). No random_state is
# passed, so the split depends on the state of NumPy's global RNG.
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, stratify=y_train_full)

# One-hot encode the target. The encoder needs a 2-D input; reshape through NumPy
# because multi-dimensional indexing on a Series (`y_train[:, None]`) was removed
# in pandas 2.0.
# NOTE(review): newer scikit-learn renamed `sparse` to `sparse_output` — adjust
# when upgrading.
oh_enc     = OneHotEncoder(sparse=False)
y_train_oh = oh_enc.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_val_oh   = oh_enc.transform(y_val.to_numpy().reshape(-1, 1))

print("- Train:", X_train.shape, y_train.shape, y_train_oh.shape)
print("- Val:", X_val.shape, y_val.shape, y_val_oh.shape)
print("- Test:", X_test.shape)

- Train: (7114, 154) (7114,) (7114, 538)
- Val: (2372, 154) (2372,) (2372, 538)
- Test: (9486, 154)


# 2. Preprocessing

In [3]:
from analysis_tools.preprocessing import *

In [4]:
# Baseline feature set: the preprocessor is fitted on the training split only
# and then applied unchanged to the validation and test splits.
preprocessor_baseline = get_preprocessor_baseline()
data_baseline = {
    'X_train': preprocessor_baseline.fit_transform(X_train),
    'y_train': y_train,
    'X_val': preprocessor_baseline.transform(X_val),
    'y_val': y_val,
    'X_test': preprocessor_baseline.transform(X_test),
}
# Report the shape of every split as a quick sanity check.
for k in data_baseline:
    v = data_baseline[k]
    print(k, v.shape)

X_train (7114, 154)
y_train (7114,)
X_val (2372, 154)
y_val (2372,)
X_test (9486, 154)


In [5]:
# Proposed preprocessing #1: fitted on the training split, then reused as-is for
# the validation and test splits. Targets are passed through untouched.
preprocessor1 = get_preprocessor1()
data1 = {
    'X_train': preprocessor1.fit_transform(X_train),
    'y_train': y_train,
    'X_val': preprocessor1.transform(X_val),
    'y_val': y_val,
    'X_test': preprocessor1.transform(X_test),
}
# Report the shape of every split as a quick sanity check.
for k in data1:
    v = data1[k]
    print(k, v.shape)

X_train (7114, 261)
y_train (7114,)
X_val (2372, 261)
y_val (2372,)
X_test (9486, 261)


In [6]:
# Proposed preprocessing #2: same features as `data1`, but the training target is
# rewritten by `preprocess2_y`, which receives the transformed training features.
# NOTE(review): presumably this is the label-propagation step for unlabeled
# samples — confirm against analysis_tools.preprocessing.
data2 = {}
data2['X_train'] = preprocessor1.fit_transform(X_train)
data2['y_train'] = preprocess2_y(data2['X_train'], y_train)
data2['X_val']   = preprocessor1.transform(X_val)
data2['y_val']   = y_val
data2['X_test']  = preprocessor1.transform(X_test)
# Report the shapes of the dataset built in this cell (the loop previously
# iterated over `data1` by mistake).
for k, v in data2.items():
    print(k, v.shape)

X_train (7114, 261)
y_train (7114,)
X_val (2372, 261)
y_val (2372,)
X_test (9486, 261)


## 2.3 NLP

## 2.3.1 Translating
## 2.3.2 Embedding
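**Outputs of `BertModel`**
- last_hidden_state: [batch_size, n_tokens, embedding_dim]
- pooler_output: [batch_size, embedding_dim]
- hidden_states: tuple of (n_layers + 1) tensors (the embedding output followed by the output of each layer), each of shape [batch_size, n_tokens, embedding_dim]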

In [17]:
from googletrans import Translator
import torch
from transformers import BertTokenizer, BertModel
from sklearn.decomposition import PCA


class Preprocessor3:
    """Turn free-text columns into dense features with BERT embeddings and per-column PCA.

    For every column in ``selected_nlp_cols`` each distinct text is embedded once
    (mean of the token vectors taken from BERT's second-to-last hidden layer),
    the embeddings are reduced by a PCA fitted on that column's training texts,
    and the reduced vector is written to every row holding that text.

    Parameters
    ----------
    nlp_cols : list of str
        All free-text columns of the dataset. Stored on the instance; the
        transformers currently return only the embedded columns.
    selected_nlp_cols : list of str
        Columns that are actually embedded.
    n_components : int or float
        Forwarded to ``sklearn.decomposition.PCA``.
    translate : bool, default False
        If True, texts are translated to English with ``googletrans`` and embedded
        with ``bert-base-uncased``; otherwise the raw text is embedded with
        ``bert-base-multilingual-cased``.

    Notes
    -----
    ``get_imputer``, ``delayed``, ``compute``, ``ProgressBar``, ``tqdm``, ``np``,
    ``pd`` and ``RANDOM_STATE`` are expected to be in scope (presumably via the
    notebook's star imports from ``analysis_tools`` — confirm).
    """

    def __init__(self, nlp_cols, selected_nlp_cols, n_components, translate=False):
        self.nlp_cols          = nlp_cols
        self.selected_nlp_cols = selected_nlp_cols
        self.n_components      = n_components
        self.translate         = translate
        self.imputer           = get_imputer()
        # English-only checkpoint when texts are translated first, multilingual otherwise.
        checkpoint = 'bert-base-uncased' if self.translate else 'bert-base-multilingual-cased'
        self.tokenizer = BertTokenizer.from_pretrained(checkpoint)
        self.model     = BertModel.from_pretrained(checkpoint, output_hidden_states=True)
        self.pcas = {}  # column name -> PCA fitted on that column's text embeddings
        self.dics = {}  # column name -> {text: reduced embedding}; filled by fit_transform

    def fit_transform(self, X, y=None):
        """Fit the imputer and the per-column PCAs on ``X`` and return its embedding features.

        ``y`` is accepted for API compatibility and ignored. The result holds only
        the embedding columns (``<col>_<i>``), indexed like ``X``.
        """
        # assumes the imputer returns a DataFrame that keeps the column labels — TODO confirm
        X_nlp     = self.imputer.fit_transform(X)[self.selected_nlp_cols]
        self.dics = self._get_vector_dics(X_nlp)
        return self._allocate_vector(X_nlp, self.dics)

    def transform(self, X, y=None):
        """Return the embedding features of ``X`` using the already fitted imputer and PCAs.

        Texts that were not seen during ``fit_transform`` are embedded on the fly,
        projected with the fitted PCA, and cached in ``self.dics``.
        """
        X_nlp = self.imputer.transform(X)[self.selected_nlp_cols]
        return self._allocate_vector(X_nlp, self.dics)

    def _embed_texts(self, texts):
        """Embed ``texts`` through dask and return one row per text as a 2-D array."""
        tasks = [delayed(self._text2vector)(text, self.tokenizer, self.model, self.translate) for text in texts]
        with ProgressBar():
            vecs = compute(*tasks)
        return np.vstack(vecs)

    def _get_vector_dics(self, X):
        """Fit one PCA per column of ``X`` and return ``{column: {text: reduced vector}}``."""
        dics = {}
        # Iterate the column labels explicitly so the progress bar total is the
        # number of columns rather than the number of rows.
        for col in tqdm(X.columns):
            texts = X[col].unique()
            vecs  = self._embed_texts(texts)
            self.pcas[col] = PCA(n_components=self.n_components, random_state=RANDOM_STATE)
            dics[col] = dict(zip(texts, self.pcas[col].fit_transform(vecs)))
        return dics

    @staticmethod
    def _text2vector(text, tokenizer, model, translate):
        """Return a 1-D NumPy embedding for ``text``.

        The embedding is the mean over tokens of the second-to-last hidden layer.
        When ``translate`` is true the text is first translated to English.
        """
        if translate:
            # googletrans names the target-language parameter `dest`.
            text = Translator().translate(text, dest='en').text
        tokens    = tokenizer.tokenize(f"[CLS] {text} [SEP]")
        input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
        # The all-ones tensor is what the model received as its second positional
        # argument before, i.e. the attention mask (every token attended).
        attention_mask = torch.ones_like(input_ids)
        # NOTE(review): inputs are not truncated; texts longer than the model's
        # maximum sequence length are not handled here.
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        token_vecs = outputs.hidden_states[-2][0]  # second-to-last layer, single batch item
        return token_vecs.mean(dim=0).numpy()

    def _allocate_vector(self, X, dics):
        """Build the embedding feature frame for ``X``.

        For each column in ``dics`` the reduced vector of every row's text is
        looked up (embedding and caching unseen texts first) and laid out as
        columns ``<col>_0 .. <col>_{k-1}``. ``dics`` is updated in place with any
        newly embedded texts.
        """
        blocks, columns = [], []
        for col, dic in dics.items():
            values   = X[col]
            emb_cols = [f"{col}_{i}" for i in range(self.pcas[col].n_components_)]

            unknown_texts = [text for text in values.unique() if text not in dic]
            if unknown_texts:
                unknown_vecs_pca = self.pcas[col].transform(self._embed_texts(unknown_texts))
                dic.update(zip(unknown_texts, unknown_vecs_pca))

            # One lookup per row, assembled positionally; the explicit reshape keeps
            # the (n_rows, k) shape even when X has no rows.
            # assumes the imputer left no missing values in the text columns — TODO confirm
            rows = np.asarray([dic[text] for text in values], dtype=float)
            blocks.append(rows.reshape(len(values), len(emb_cols)))
            columns.extend(emb_cols)

        data = np.hstack(blocks) if blocks else np.empty((len(X), 0))
        return pd.DataFrame(data, index=X.index, columns=columns)

In [18]:
def get_data3(selected_nlp_cols, n_components=0.8):
    """Build the NLP-embedding dataset for the given text columns.

    Reads the notebook-level ``nlp_cols``, ``X_train``, ``X_val``, ``X_test``,
    ``y_train`` and ``y_val``. A fresh ``Preprocessor3`` is fitted on ``X_train``
    and applied to the validation and test splits; the training target is passed
    through ``preprocess2_y`` together with the embedded training features.

    Parameters
    ----------
    selected_nlp_cols : list of str
        Text columns to embed.
    n_components : int or float, default 0.8
        Forwarded to ``Preprocessor3`` (and from there to PCA).

    Returns
    -------
    dict
        Keys ``X_train``, ``y_train``, ``X_val``, ``y_val``, ``X_test``.
    """
    preprocessor3 = Preprocessor3(nlp_cols, selected_nlp_cols, n_components=n_components)
    X_train_proc3 = preprocessor3.fit_transform(X_train)
    X_val_proc3   = preprocessor3.transform(X_val)
    X_test_proc3  = preprocessor3.transform(X_test)

    data3 = dict(
        X_train=X_train_proc3,
        y_train=preprocess2_y(X_train_proc3, y_train),
        X_val=X_val_proc3,
        y_val=y_val,
        X_test=X_test_proc3
    )
    # Report the shape of every split as a quick sanity check.
    for k, v in data3.items():
        print(k, v.shape)
    return data3

In [None]:
# Build two NLP-embedding datasets: one from the single text column `bq30`,
# and one from the four text columns `bq30`..`bq33`.
data3_1 = get_data3(['bq30'])
data3_4 = get_data3(['bq30', 'bq31', 'bq32', 'bq33'])

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/7114 [00:00<?, ?it/s]

[########################################] | 100% Completed | 36.8s


  0%|          | 1/7114 [00:37<73:16:35, 37.09s/it]
100%|██████████| 979/979 [00:00<00:00, 1155.28it/s]


[########################################] | 100% Completed |  9.8s


100%|██████████| 405/405 [00:00<00:00, 1132.58it/s]


[########################################] | 100% Completed | 35.8s


100%|██████████| 1177/1177 [00:01<00:00, 1118.09it/s]


X_train (7114, 70)
y_train (7114,)
X_val (2372, 70)
y_val (2372,)
X_test (9486, 70)


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/7114 [00:00<?, ?it/s]

[########################################] | 100% Completed | 37.1s


  0%|          | 1/7114 [00:37<73:55:08, 37.41s/it]

[########################################] | 100% Completed | 11min  8.9s


  0%|          | 2/7114 [11:47<809:09:17, 409.58s/it]

[########################################] | 100% Completed | 39.9s


  0%|          | 3/7114 [12:27<475:57:15, 240.96s/it]

[########################################] | 100% Completed |  1min  0.4s


  0%|          | 4/7114 [13:28<399:13:03, 202.14s/it]
100%|██████████| 979/979 [00:00<00:00, 1103.04it/s]
 87%|████████▋ | 4527/5180 [00:57<00:07, 82.14it/s]

# 3. Training & evaluation

In [14]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Fit the same random-forest estimator on each dataset in turn and report the
# macro-averaged F1 score on the training and validation splits.
# NOTE(review): `data3` is not assigned in any visible cell (only `data3_1` and
# `data3_4` are) — confirm it exists before running this cell.
model = RandomForestClassifier(n_estimators=700, n_jobs=-1, random_state=RANDOM_STATE)
for name, data in [
    ('baseline', data_baseline),
    ('proposed1', data1),
    ('proposed2', data2),
    ('proposed3', data3),
]:
    X_t, y_t = data['X_train'], data['y_train']
    X_v, y_v = data['X_val'], data['y_val']

    model.fit(X_t, y_t)
    p_t, p_v = model.predict(X_t), model.predict(X_v)

    # Every dataset except 'proposed1' has its targets and predictions passed
    # through postprocess2_y before scoring.
    if not name == 'proposed1':
        y_t, y_v = postprocess2_y(y_t), postprocess2_y(y_v)
        p_t, p_v = postprocess2_y(p_t), postprocess2_y(p_v)

    print(f"- {name} | Train: {f1_score(y_t, p_t, average='macro'):.2f} | Val: {f1_score(y_v, p_v, average='macro')}")

- baseline | Train: 1.00 | Val: 0.41727582904140587
- proposed1 | Train: 1.00 | Val: 0.5602074643542937
- proposed2 | Train: 1.00 | Val: 0.5677837322466011
- proposed3 | Train: 1.00 | Val: 0.5536757924257704
CPU times: user 25min 44s, sys: 2min 7s, total: 27min 52s
Wall time: 2min 14s


In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Fit the same random-forest estimator on each NLP-embedding dataset and report
# the macro-averaged F1 score on the training and validation splits.
model = RandomForestClassifier(n_estimators=700, n_jobs=-1, random_state=RANDOM_STATE)
for name, data in zip(['proposed3_1', 'proposed3_4'], [data3_1, data3_4]):
    X_t, y_t = data['X_train'], data['y_train']
    X_v, y_v = data['X_val'], data['y_val']

    model.fit(X_t, y_t)
    p_t = model.predict(X_t)
    p_v = model.predict(X_v)

    # Targets and predictions always go through postprocess2_y here: the
    # `name != 'proposed1'` guard copied from the previous cell could never be
    # false for the names iterated in this loop, so it has been removed.
    y_t, y_v = postprocess2_y(y_t), postprocess2_y(y_v)
    p_t, p_v = postprocess2_y(p_t), postprocess2_y(p_v)

    print(f"- {name} | Train: {f1_score(y_t, p_t, average='macro'):.2f} | Val: {f1_score(y_v, p_v, average='macro')}")