In [None]:
import io
import pickle
from pathlib import Path
from copy import deepcopy
import numpy as np
import pandas as pd

from tqdm.auto import tqdm

from scipy.special import softmax
import scipy.sparse

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.under_sampling import RandomUnderSampler



In [None]:
def save_pkl(dir, name, obj):
    """Pickle ``obj`` to the file ``name`` inside ``dir``.

    Args:
        dir: Target directory, given as ``str`` or ``pathlib.Path``. It is
            created (including missing parent directories) when absent.
        name: File name to write inside ``dir``.
        obj: Any picklable object.
    """
    # `dir` shadows the builtin; the parameter name is kept so existing
    # keyword callers keep working.
    out_dir = Path(dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / name, 'wb') as f:
        pickle.dump(obj, f)

def load_pkl(dir, name):
    """Load and return the object pickled in the file ``name`` inside ``dir``.

    Args:
        dir: Directory containing the file, given as ``str`` or
            ``pathlib.Path``.
        name: File name inside ``dir``.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    with open(Path(dir) / name, 'rb') as f:
        return pickle.load(f)

In [None]:
# Read the raw train / test splits.
train_df = pd.read_csv("../input/Train.csv")
test_df = pd.read_csv("../input/Test.csv")

# Integer ids are assigned in order of first appearance in the training file.
ID2LABEL = dict(enumerate(train_df['label'].unique()))
LABEL2ID = {label: idx for idx, label in ID2LABEL.items()}

# Numeric target column used by the models below.
train_df['label_ids'] = train_df['label'].map(LABEL2ID)

In [None]:
# Row counts of the two splits.
train_df.shape[0], test_df.shape[0]

(70000, 30000)

In [None]:
# Train rows followed by test rows (row-wise concat; original indices are kept).
all_df = pd.concat([train_df, test_df])

In [None]:
# Text column of both splits, train first, then test.
all_texts = pd.concat([frame['text'] for frame in (train_df, test_df)])

In [None]:
# Word uni- and bi-gram TF-IDF; terms seen in fewer than 3 documents or in
# more than 90% of documents are dropped.
# NOTE: the vocabulary and IDF weights are fitted on train AND test text.
encoder = TfidfVectorizer(
    min_df=3,
    max_df=0.9,
    ngram_range=(1, 2),
)
encoder.fit(all_texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=3, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [None]:
def trainer(train_feat, targets, model, pseudo_set=None, n_splits=5, seed=None):
    """Cross-validate ``model`` with stratified K-fold and collect out-of-fold probabilities.

    A fresh ``deepcopy`` of ``model`` is fitted for every fold, so the
    estimator passed in is never fitted itself.

    Args:
        train_feat: Feature matrix that supports row selection with an index
            array (e.g. a CSR sparse matrix).
        targets: Class label per row of ``train_feat``. Predictions are the
            ``argmax`` column of ``predict_proba`` and are compared to the
            targets directly, so labels are expected to be the integer ids
            ``0 .. n_classes - 1``.
        model: Unfitted estimator exposing ``fit`` and ``predict_proba``.
        pseudo_set: Optional ``(features, labels)`` pair that is appended to
            the training part of every fold (never to the validation part).
            Features are stacked with ``scipy.sparse.vstack``.
        n_splits: Number of folds. Defaults to 5.
        seed: When given, folds are shuffled using this random state. ``None``
            (default) keeps the deterministic, unshuffled split.

    Returns:
        Tuple ``(val_probas, models)``: a float32 array of shape
        ``(n_rows, n_classes)`` with the out-of-fold probabilities, and the
        list of fitted per-fold models.
    """
    # Positional indexing below must not be confused with label-based
    # indexing if a pandas Series is passed in.
    targets = np.asarray(targets)
    n_classes = len(np.unique(targets))
    val_probas = np.zeros((train_feat.shape[0], n_classes), dtype=np.float32)

    if seed is None:
        splitter = StratifiedKFold(n_splits=n_splits)
    else:
        # random_state is only valid together with shuffle=True.
        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # The pseudo-labelled data is identical for every fold; unpack it once.
    if pseudo_set is not None:
        pseudo_feats, pseudo_targets = pseudo_set[0], pseudo_set[1]

    models = []
    for fold, (trn_idx, val_idx) in enumerate(splitter.split(train_feat, targets)):
        _model = deepcopy(model)
        print(f'fold: {fold}')

        trn_xs, val_xs = train_feat[trn_idx], train_feat[val_idx]
        trn_ys, val_ys = targets[trn_idx], targets[val_idx]

        if pseudo_set is not None:
            trn_xs = scipy.sparse.vstack([trn_xs, pseudo_feats]).tocsr()
            trn_ys = np.concatenate([trn_ys, pseudo_targets])

        _model.fit(trn_xs, trn_ys)

        val_proba = _model.predict_proba(val_xs)
        val_preds = np.argmax(val_proba, axis=-1)

        score = accuracy_score(val_ys, val_preds)
        print(f'acc: {score:.5f}')

        # Every row is in exactly one validation fold, so this fills the
        # out-of-fold matrix without overlap.
        val_probas[val_idx] = val_proba
        models.append(_model)

    # Overall accuracy over the assembled out-of-fold predictions.
    val_preds = np.argmax(val_probas, axis=-1)
    all_score = accuracy_score(targets, val_preds)
    print(f'val acc: {all_score:.5f}')

    return val_probas, models

In [None]:
def predict(test_feat, models):
    """Return the mean of ``predict_proba(test_feat)`` over all ``models``.

    ``models`` is either a flat list of fitted models or a list of lists of
    models; the nested form is detected from the first element and flattened
    before predicting.
    """
    if isinstance(models[0], list):
        flattened = []
        for group in models:
            flattened = flattened + group
        models = flattened

    per_model = np.array([fitted.predict_proba(test_feat) for fitted in models])
    return per_model.mean(axis=0)

In [None]:
# Vectorise both splits with the fitted TF-IDF encoder.
train_feats, test_feats = (
    encoder.transform(frame['text']) for frame in (train_df, test_df)
)

In [None]:
# Integer class ids as a plain NumPy array (positional indexing in the CV loop).
train_targets = np.asarray(train_df['label_ids'])

In [None]:
# Seed-averaged 5-fold run of the Naive Bayes baseline.
#
# trainer is called with its default, unshuffled folds and nothing else
# consumes `seed`, so without extra work every iteration would produce the
# same folds and the same predictions. The seed is therefore applied here by
# permuting the training rows before each run, which changes which rows land
# in which stratified fold.
oof_runs = []
test_runs = []

for seed in tqdm([42, 1011, 2020]):
    order = np.random.RandomState(seed).permutation(train_feats.shape[0])

    shuffled_oof, models = trainer(train_feats[order], train_targets[order], MultinomialNB())
    test_proba = predict(test_feats, models)

    # Undo the permutation so OOF rows line up with train_df again.
    oof = np.empty_like(shuffled_oof)
    oof[order] = shuffled_oof

    oof_runs.append(oof)
    test_runs.append(test_proba)

# Average over seeds; separate list names keep this cell re-runnable.
oofs = np.array(oof_runs).mean(axis=0)
test_probas = np.array(test_runs).mean(axis=0)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

fold: 0
acc: 0.79579
fold: 1
acc: 0.79293
fold: 2
acc: 0.79036
fold: 3
acc: 0.79779
fold: 4
acc: 0.79871
val acc: 0.79511
fold: 0
acc: 0.79579
fold: 1
acc: 0.79293
fold: 2
acc: 0.79036
fold: 3
acc: 0.79779
fold: 4
acc: 0.79871
val acc: 0.79511
fold: 0
acc: 0.79579
fold: 1
acc: 0.79293
fold: 2
acc: 0.79036
fold: 3
acc: 0.79779
fold: 4
acc: 0.79871
val acc: 0.79511



In [None]:
# Persist the averaged test and out-of-fold probabilities.
output_dir = Path("../output")
for file_name, probas in (
    ("95-tfidf_test-pred.pkl", test_probas),
    ("95-tfidf_val-pred.pkl", oofs),
):
    save_pkl(output_dir, file_name, probas)