In [None]:
import numpy as np
import pandas as pd

from scipy import sparse
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import CategoricalNB
# from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from sklearn.metrics import f1_score

import os
# Print the full path of every file found beneath /kaggle/input.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

In [None]:
# Locations of the sparse feature matrix and of the CSV holding the labels.
TRAIN_NPZ = "/kaggle/input/sampled-dataset/train_sampled.npz"
TRAIN_CSV = "/kaggle/input/sampled-dataset/target.csv"

# Features are loaded as a scipy sparse matrix; labels are the 'Target' column of the CSV.
complete_csr = sparse.load_npz(file=TRAIN_NPZ)
targets = pd.read_csv(filepath_or_buffer=TRAIN_CSV)['Target']

In [None]:
# Hold out 33% of the rows for evaluation, stratified on the labels, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    complete_csr,
    targets,
    stratify=targets,
    random_state=42,
    test_size=0.33,
)

In [None]:
def f1_macro_score(x_test, y_test, model):
    """Return the macro-averaged F1 score of ``model`` on ``(x_test, y_test)``.

    Parameters
    ----------
    x_test : features passed to ``model.predict``.
    y_test : true labels compared against the predictions.
    model : any object exposing ``predict(X)``.
    """
    return f1_score(
        y_test,
        model.predict(x_test),
        average='macro',
        labels=None,
        pos_label=1,
        sample_weight=None,
        zero_division='warn',
    )

In [None]:
def partial_fit_steps(model, x_train, y_train, n_iter, *, batch_indices=(0,),
                      batch_size=None, x_val=None, y_val=None, score_fn=None):
    """Train ``model`` with ``partial_fit`` on selected mini-batches and score it after each.

    For every index ``i`` in ``batch_indices`` the rows
    ``[i * batch_size, (i + 1) * batch_size)`` of ``x_train`` / ``y_train`` are fed to
    ``model.partial_fit`` ``n_iter`` times, then the model is scored on the
    validation data.

    Parameters
    ----------
    model : estimator exposing ``partial_fit(X, y, classes=...)``; trained in place.
    x_train, y_train : training features and labels (sliceable by row range).
    n_iter : number of ``partial_fit`` passes over each selected batch.
    batch_indices : iterable of batch numbers to train on; defaults to the first batch only.
    batch_size : rows per batch; defaults to the notebook-level ``BATCH_SIZE``.
    x_val, y_val : validation data; default to the notebook-level ``x_test`` / ``y_test``.
    score_fn : ``score_fn(x_val, y_val, model) -> number``; defaults to ``f1_macro_score``.

    Returns
    -------
    (scores, best_model, models)
        ``scores`` holds one score per batch, ``models`` one frozen copy of the
        estimator per batch, and ``best_model`` is the copy with the highest
        score (or ``model`` itself when no score exceeded 0).
    """
    from copy import deepcopy  # local import keeps this cell runnable on its own

    # Fall back to the notebook globals the function historically relied on.
    if batch_size is None:
        batch_size = BATCH_SIZE
    if x_val is None:
        x_val = x_test
    if y_val is None:
        y_val = y_test
    if score_fn is None:
        score_fn = f1_macro_score

    classes = np.unique(y_train)  # loop-invariant, computed once
    scores = []
    models = []  # created once so it accumulates and is always defined at return
    max_score = 0
    best_model = model
    for i in batch_indices:
        start, stop = i * batch_size, (i + 1) * batch_size
        x_batch, y_batch = x_train[start:stop], y_train[start:stop]
        for _ in range(n_iter):
            model.partial_fit(x_batch, y_batch, classes=classes)
        score = score_fn(x_val, y_val, model)
        # Freeze the current state: storing `model` itself would make every entry
        # (and best_model) change whenever the estimator is trained further.
        snapshot = deepcopy(model)
        models.append(snapshot)
        if score > max_score:
            best_model = snapshot
            max_score = score
        scores.append(score)
        print(score)
    return scores, best_model, models

In [None]:
BATCH_SIZE = 70000
from scipy.sparse import csr_matrix
# Import restored: LGBMClassifier is used below, so leaving this commented out
# makes the cell fail with a NameError on a fresh kernel.
from lightgbm import LGBMClassifier

# Fit LightGBM with default settings on the first 8 * BATCH_SIZE training rows.
lgb = LGBMClassifier()
lgb.fit(csr_matrix.asfptype(x_train[: BATCH_SIZE*8]), csr_matrix.asfptype(y_train[: BATCH_SIZE*8]))

In [None]:
# Macro F1 of the LightGBM model on the hold-out split (inputs converted the same way as for fitting).
f1_macro_score(
    x_test=csr_matrix.asfptype(x_test),
    y_test=csr_matrix.asfptype(y_test),
    model=lgb,
)

In [None]:
# Predict labels for the matrix stored at TEST_NPZ with `lgb` and write them to solution_lgb.csv.
TEST_NPZ = "/kaggle/input/traintestcsr-codeproject/test.npz"
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
soln_df = pd.read_pickle('/kaggle/input/traintestpkls-codeproject/soln_pickle.pkl')
test_csr = sparse.load_npz(TEST_NPZ)

soln_df['Target'] = lgb.predict(csr_matrix.asfptype(test_csr))
soln_df.to_csv(path_or_buf='solution_lgb.csv', index=False)

In [None]:
# NOTE(review): this cell originally called `clf.predict(X, y)`. `clf` is only
# created in a later cell, and predict() takes the features alone, so the cell
# could not run top-to-bottom. It now predicts with `lgb`, the model whose
# inputs are converted with asfptype in the neighbouring cells — confirm that
# this was the intent.
lgb.predict(csr_matrix.asfptype(x_test))

In [None]:
import xgboost as xgb

# DART-booster XGBoost classifier with hand-set hyper-parameters,
# fitted on the first BATCH_SIZE training rows and scored on the hold-out split.
xgb_model_extra_param = xgb.XGBClassifier(
    use_label_encoder=False,
    objective='multi:softmax',
    num_class=5,
    booster='dart',
    one_drop=1,
    rate_drop=0,
    tree_method='approx',
    max_depth=4,
    eta=0.01,
    gamma=0.2,
    scale_pos_weight=2,
)
xgb_model_extra_param.fit(x_train[:BATCH_SIZE], y_train[:BATCH_SIZE])
f1_macro_score(x_test, y_test, xgb_model_extra_param)

In [None]:
import xgboost as xgb

# Second DART configuration (deeper trees, library-default learning rate),
# fitted on the first 2 * BATCH_SIZE training rows.
xgb_model_extra_param_1 = xgb.XGBClassifier(use_label_encoder = False, booster = 'dart', one_drop = 1, 
                                        rate_drop = 0, max_depth = 6, tree_method = 'approx', 
                                        objective = 'multi:softmax', num_class = 5)
xgb_model_extra_param_1.fit(x_train[:BATCH_SIZE*2], y_train[:BATCH_SIZE*2])
# Score the model fitted in this cell; the previous version re-scored
# `xgb_model_extra_param` from the preceding cell by mistake.
f1_macro_score(x_test, y_test, xgb_model_extra_param_1)

In [None]:
# XGBoost with library defaults apart from the 'merror' evaluation metric,
# fitted on the first 4 * BATCH_SIZE training rows and scored on the hold-out split.
xgb_model_non_default = xgb.XGBClassifier(
    eval_metric='merror',
    use_label_encoder=False,
)
xgb_model_non_default.fit(
    x_train[: 4 * BATCH_SIZE],
    y_train[: 4 * BATCH_SIZE],
)
f1_macro_score(x_test, y_test, xgb_model_non_default)

In [None]:
# Predict labels for the matrix stored at TEST_NPZ with `xgb_model_non_default`
# and write them to solution_xgb_2.csv.
TEST_NPZ = "/kaggle/input/traintestcsr-codeproject/test.npz"
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
soln_df = pd.read_pickle('/kaggle/input/traintestpkls-codeproject/soln_pickle.pkl')
test_csr = sparse.load_npz(TEST_NPZ)

soln_df['Target'] = xgb_model_non_default.predict(test_csr)
soln_df.to_csv(path_or_buf='solution_xgb_2.csv', index=False)

In [None]:
from sklearn.linear_model import SGDClassifier

BATCH_SIZE = 70000

# Perceptron-loss SGD classifier, trained with 200 partial_fit passes via partial_fit_steps.
sgd_models = []
sgd = SGDClassifier(warm_start=True, loss='perceptron')
per_acc, per_best, per_models = partial_fit_steps(
    model=sgd,
    x_train=x_train,
    y_train=y_train,
    n_iter=200,
)

In [None]:
from sklearn.linear_model import SGDClassifier

BATCH_SIZE = 70000

# Logistic-loss SGD classifier, trained with 200 partial_fit passes via partial_fit_steps.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' — confirm the installed version.
sgd_models = []
sgd = SGDClassifier(warm_start=True, loss='log')
sgd_acc, sgd_best, sgd_models = partial_fit_steps(
    model=sgd,
    x_train=x_train,
    y_train=y_train,
    n_iter=200,
)

In [None]:
# Predict labels for the matrix stored at TEST_NPZ with `per_best` and write them to solution_per.csv.
TEST_NPZ = "/kaggle/input/traintestcsr-codeproject/test.npz"
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
soln_df = pd.read_pickle('/kaggle/input/traintestpkls-codeproject/soln_pickle.pkl')
test_csr = sparse.load_npz(TEST_NPZ)

soln_df['Target'] = per_best.predict(test_csr)
soln_df.to_csv(path_or_buf='solution_per.csv', index=False)

In [None]:
from sklearn.neural_network import MLPClassifier

# MLP with a single hidden layer of two units, using the Adam solver.
clf = MLPClassifier(
    hidden_layer_sizes=(2,),
    solver='adam',
    max_iter=100,
    warm_start=True,
)

In [None]:
BATCH_SIZE = 70000

# Train the MLP with 25 partial_fit passes via partial_fit_steps.
clf_acc, clf_best, clf_models = partial_fit_steps(
    model=clf,
    x_train=x_train,
    y_train=y_train,
    n_iter=25,
)

In [None]:
# Predict labels for the matrix stored at TEST_NPZ with `clf_best` and write them to solution_clf_25iter.csv.
TEST_NPZ = "/kaggle/input/traintestcsr-codeproject/test.npz"
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
soln_df = pd.read_pickle('/kaggle/input/traintestpkls-codeproject/soln_pickle.pkl')
test_csr = sparse.load_npz(TEST_NPZ)

soln_df['Target'] = clf_best.predict(test_csr)
soln_df.to_csv(path_or_buf='solution_clf_25iter.csv', index=False)

In [None]:
# Display the estimator returned as `clf_best` by the partial_fit_steps call on `clf`.
clf_best

In [None]:
def partial_fit_steps_2(model, x_train, y_train, n_iter, *, batch_indices=(1,),
                        batch_size=None, x_val=None, y_val=None, score_fn=None):
    """Train ``model`` with ``partial_fit`` on selected mini-batches and score it after each.

    Works like ``partial_fit_steps`` but trains on the second batch by default
    (rows ``[batch_size, 2 * batch_size)``).

    Parameters
    ----------
    model : estimator exposing ``partial_fit(X, y, classes=...)``; trained in place.
    x_train, y_train : training features and labels (sliceable by row range).
    n_iter : number of ``partial_fit`` passes over each selected batch.
    batch_indices : iterable of batch numbers to train on; defaults to batch 1 only.
    batch_size : rows per batch; defaults to the notebook-level ``BATCH_SIZE``.
    x_val, y_val : validation data; default to the notebook-level ``x_test`` / ``y_test``.
    score_fn : ``score_fn(x_val, y_val, model) -> number``; defaults to ``f1_macro_score``.

    Returns
    -------
    (scores, best_model, models)
        ``scores`` holds one score per batch, ``models`` one frozen copy of the
        estimator per batch, and ``best_model`` is the copy with the highest
        score (or ``model`` itself when no score exceeded 0).
    """
    from copy import deepcopy  # local import keeps this cell runnable on its own

    # Fall back to the notebook globals the function historically relied on.
    if batch_size is None:
        batch_size = BATCH_SIZE
    if x_val is None:
        x_val = x_test
    if y_val is None:
        y_val = y_test
    if score_fn is None:
        score_fn = f1_macro_score

    classes = np.unique(y_train)  # loop-invariant, computed once
    scores = []
    models = []  # created once so it accumulates and is always defined at return
    max_score = 0
    best_model = model
    for i in batch_indices:
        start, stop = i * batch_size, (i + 1) * batch_size
        x_batch, y_batch = x_train[start:stop], y_train[start:stop]
        for _ in range(n_iter):
            model.partial_fit(x_batch, y_batch, classes=classes)
        score = score_fn(x_val, y_val, model)
        # Freeze the current state: storing `model` itself would make every entry
        # (and best_model) change whenever the estimator is trained further.
        snapshot = deepcopy(model)
        models.append(snapshot)
        if score > max_score:
            best_model = snapshot
            max_score = score
        scores.append(score)
        print(score)
    return scores, best_model, models

In [None]:
BATCH_SIZE = 70000

# Continue training the same `clf` with 40 partial_fit passes via partial_fit_steps_2.
clf_models_2 = []
clf_acc_2, clf_best_2, clf_models_2 = partial_fit_steps_2(
    model=clf,
    x_train=x_train,
    y_train=y_train,
    n_iter=40,
)

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Passive-aggressive classifier trained with 50 partial_fit passes via partial_fit_steps.
# Note: `acc_scores` receives the whole (scores, best_model, models) tuple, not just the scores.
pac = PassiveAggressiveClassifier()
acc_scores = partial_fit_steps(
    model=pac,
    x_train=x_train,
    y_train=y_train,
    n_iter=50,
)

In [None]:
# MultinomialNB needs non-negative features, so the stored values are shifted by
# |min| before fitting and scoring. The shift is applied to copies: the previous
# `x_train.data += ...` / `x_test.data += ...` rewrote the shared matrices in
# place and silently changed the inputs of every cell that runs afterwards.
# NOTE(review): the test shift is derived from x_test itself (as before), not
# from the training data — confirm this is intended.
nb = MultinomialNB()
x_train_nb = x_train.copy()
x_train_nb.data += abs(x_train_nb.min())
nb.fit(abs(x_train_nb), y_train)
x_test_nb = x_test.copy()
x_test_nb.data += abs(x_test_nb.min())
f1_macro_score(abs(x_test_nb), y_test, nb)

In [None]:
# ComplementNB needs non-negative features, so the stored values are shifted by
# |min| before fitting and scoring. The shift is applied to copies: the previous
# `x_train.data += ...` / `x_test.data += ...` rewrote the shared matrices in
# place and silently changed the inputs of every cell that runs afterwards.
# NOTE(review): the test shift is derived from x_test itself (as before), not
# from the training data — confirm this is intended.
cnb = ComplementNB()
x_train_cnb = x_train.copy()
x_train_cnb.data += abs(x_train_cnb.min())
cnb.fit(abs(x_train_cnb), y_train)
x_test_cnb = x_test.copy()
x_test_cnb.data += abs(x_test_cnb.min())
f1_macro_score(abs(x_test_cnb), y_test, cnb)

In [None]:
# One-vs-rest logistic regression (liblinear, balanced class weights, 150 iterations max),
# fitted on the full training split and scored on the hold-out split.
lr1 = LogisticRegression(
    solver='liblinear',
    multi_class='ovr',
    class_weight='balanced',
    max_iter=150,
)
lr1.fit(x_train, y_train)
f1_macro_score(x_test=x_test, y_test=y_test, model=lr1)

In [None]:
# Same logistic-regression setup as lr1 but allowing up to 200 iterations.
lr2 = LogisticRegression(
    solver='liblinear',
    multi_class='ovr',
    class_weight='balanced',
    max_iter=200,
)
lr2.fit(x_train, y_train)
f1_macro_score(x_test=x_test, y_test=y_test, model=lr2)

In [None]:
# Predict labels for the matrix stored at TEST_NPZ with `clf_best` and write them to solution_mlp1.csv.
TEST_NPZ = "/kaggle/input/traintestcsr-codeproject/test.npz"
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
soln_df = pd.read_pickle('/kaggle/input/traintestpkls-codeproject/soln_pickle.pkl')
test_csr = sparse.load_npz(TEST_NPZ)

soln_df['Target'] = clf_best.predict(test_csr)
soln_df.to_csv(path_or_buf='solution_mlp1.csv', index=False)

In [None]:
def partial_fit_steps_all(model, x_train, y_train, n_iter, *, batch_indices=None,
                          batch_size=None, x_val=None, y_val=None, score_fn=None):
    """Train ``model`` with ``partial_fit`` over every full mini-batch, scoring after each.

    For every batch index ``i`` the rows ``[i * batch_size, (i + 1) * batch_size)``
    of ``x_train`` / ``y_train`` are fed to ``model.partial_fit`` ``n_iter``
    times, then the model is scored on the validation data. By default all
    ``x_train.shape[0] // batch_size`` full batches are used; trailing rows that
    do not fill a batch are not trained on.

    Parameters
    ----------
    model : estimator exposing ``partial_fit(X, y, classes=...)``; trained in place.
    x_train, y_train : training features and labels (sliceable by row range).
    n_iter : number of ``partial_fit`` passes over each batch.
    batch_indices : iterable of batch numbers to train on; defaults to every full batch.
    batch_size : rows per batch; defaults to the notebook-level ``BATCH_SIZE``.
    x_val, y_val : validation data; default to the notebook-level ``x_test`` / ``y_test``.
    score_fn : ``score_fn(x_val, y_val, model) -> number``; defaults to ``f1_macro_score``.

    Returns
    -------
    (scores, best_model, models)
        ``scores`` holds one score per batch, ``models`` one frozen copy of the
        estimator per batch, and ``best_model`` is the copy with the highest
        score (or ``model`` itself when no score exceeded 0 or no batch ran).
    """
    from copy import deepcopy  # local import keeps this cell runnable on its own

    # Fall back to the notebook globals the function historically relied on.
    if batch_size is None:
        batch_size = BATCH_SIZE
    if x_val is None:
        x_val = x_test
    if y_val is None:
        y_val = y_test
    if score_fn is None:
        score_fn = f1_macro_score
    if batch_indices is None:
        batch_indices = range(x_train.shape[0] // batch_size)

    classes = np.unique(y_train)  # loop-invariant, computed once
    scores = []
    # Created once, outside the loop: previously the list was reset on every
    # batch and was undefined at `return` when there was no full batch.
    models = []
    max_score = 0
    best_model = model
    for i in batch_indices:
        start, stop = i * batch_size, (i + 1) * batch_size
        x_batch, y_batch = x_train[start:stop], y_train[start:stop]
        for _ in range(n_iter):
            model.partial_fit(x_batch, y_batch, classes=classes)
        score = score_fn(x_val, y_val, model)
        # Freeze the current state: `model` keeps training on later batches, so
        # keeping a reference to it would always yield the final model.
        snapshot = deepcopy(model)
        models.append(snapshot)
        if score > max_score:
            best_model = snapshot
            max_score = score
        scores.append(score)
        print(score)
    return scores, best_model, models

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import BaggingClassifier

# Bag five logistic-loss SGD classifiers and fit them on the first BATCH_SIZE training rows.
sgd_cls = SGDClassifier(warm_start=True, loss='log')
bcl = BaggingClassifier(n_estimators=5, base_estimator=sgd_cls)
bcl.fit(
    x_train[:BATCH_SIZE],
    y_train[:BATCH_SIZE],
)

In [None]:
# Macro F1 of the bagging ensemble on the hold-out split.
f1_macro_score(x_test=x_test, y_test=y_test, model=bcl)

In [None]:
# Predict labels for the matrix stored at TEST_NPZ with `bcl` and write them to solution_bcl2.csv.
TEST_NPZ = "/kaggle/input/traintestcsr-codeproject/test.npz"
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
soln_df = pd.read_pickle('/kaggle/input/traintestpkls-codeproject/soln_pickle.pkl')
test_csr = sparse.load_npz(TEST_NPZ)

soln_df['Target'] = bcl.predict(test_csr)
soln_df.to_csv(path_or_buf='solution_bcl2.csv', index=False)

In [None]:
# Preview the first rows of the most recently written submission table.
soln_df.head()