In [None]:
import numpy as np
import pandas as pd

from scipy import sparse
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import CategoricalNB
# from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import xgboost as xgb

from sklearn.metrics import f1_score

import os
# Print the full path of every file found under /kaggle/input.
# NOTE(review): the loop variables (including `_`) are bound at cell level, so
# they stay in the notebook namespace; binding `_` shadows IPython's
# last-output shortcut for the rest of the session.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the feature matrix from a scipy sparse .npz archive.
# NOTE(review): the variable name suggests CSR format, but load_npz returns
# whatever format the file was saved in — confirm.
TRAIN_NPZ = "/kaggle/input/sampled-dataset/train_sampled.npz"
complete_csr = sparse.load_npz(TRAIN_NPZ)

# Labels are the 'Target' column of target.csv.
# NOTE(review): assumes rows are aligned one-to-one with complete_csr — confirm.
TRAIN_CSV = "/kaggle/input/sampled-dataset/target.csv"
targets = pd.read_csv(TRAIN_CSV)['Target']

In [None]:
# Hold out 33% of the rows for evaluation. The split is stratified on the
# labels and seeded (random_state = 42) so repeated runs give the same split.
x_train, x_test, y_train, y_test = train_test_split(complete_csr, targets, test_size = 0.33, random_state = 42, stratify = targets)

In [None]:
def f1_macro_score(x_test, y_test, model):
    """Return the macro-averaged F1 score of ``model`` on the given data.

    Parameters
    ----------
    x_test : feature matrix passed straight to ``model.predict``.
    y_test : true labels to compare the predictions against.
    model : any fitted estimator exposing ``predict``.

    Returns
    -------
    The value returned by ``f1_score`` with ``average='macro'``.
    """
    # Every other f1_score option is left at its library default.
    return f1_score(y_test, model.predict(x_test), average='macro')

In [None]:
def partial_fit_steps(model, x_train, y_train, n_iter):
    """Train ``model`` incrementally, batch by batch, tracking the best snapshot.

    The training data is cut into consecutive batches of ``BATCH_SIZE`` rows
    (a notebook-level global). Each batch is passed to ``model.partial_fit``
    ``n_iter`` times, after which the model is scored with macro F1 on the
    notebook-level hold-out split (``x_test`` / ``y_test``). The score is
    printed and recorded.

    Parameters
    ----------
    model : estimator exposing ``partial_fit`` and ``predict``; it is updated
        in place.
    x_train : training feature matrix (row-sliceable).
    y_train : training labels (sliceable with the same positions).
    n_iter : number of ``partial_fit`` passes over each batch.

    Returns
    -------
    (scores, best_model)
        ``scores`` holds one hold-out score per batch. ``best_model`` is a
        deep copy of the model taken right after the best-scoring batch. If no
        batch scores above 0, or there are no full batches, the live ``model``
        object itself is returned.

    Notes
    -----
    Rows beyond the last full batch (``x_train.shape[0] % BATCH_SIZE``) are
    not used for training.
    """
    import copy  # local import so this cell does not depend on the import cell

    scores = []
    max_score = 0
    best_model = model
    # The label set does not change between batches; compute it once.
    classes = np.unique(y_train)
    for i in range(x_train.shape[0] // BATCH_SIZE):
        start, stop = i * BATCH_SIZE, (i + 1) * BATCH_SIZE
        # Slice once per batch instead of once per inner iteration.
        x_batch = x_train[start:stop]
        y_batch = y_train[start:stop]
        for _ in range(n_iter):
            model.partial_fit(x_batch, y_batch, classes = classes)
        score = f1_macro_score(x_test, y_test, model)
        if score > max_score:
            # Snapshot the estimator: `model` keeps being updated by later
            # batches, so storing a plain reference would always end up
            # pointing at the final state rather than the best one.
            best_model = copy.deepcopy(model)
            max_score = score
        scores.append(score)
        print(score)
    return scores, best_model

In [None]:
# Number of rows per training batch. Read as a global by partial_fit_steps and
# used directly by the XGBoost cells to pick how many rows to fit on.
# NOTE: a later cell reassigns this to 70000 * 3, which changes the value seen
# by every cell executed after it.
BATCH_SIZE = 70000

from sklearn.linear_model import SGDClassifier

def generateSGDClassifier(loss_func, n_iter, penalty_mark):
    """Build an SGDClassifier and train it batch-wise on the training split.

    Parameters
    ----------
    loss_func : value forwarded as the classifier's ``loss``.
    n_iter : number of ``partial_fit`` passes per batch, forwarded to
        ``partial_fit_steps``.
    penalty_mark : value forwarded as the classifier's ``penalty``.

    Returns
    -------
    Whatever ``partial_fit_steps`` returns for the notebook-level
    ``x_train`` / ``y_train``: the per-batch scores and the selected model.
    """
    classifier = SGDClassifier(
        warm_start = True,
        penalty = penalty_mark,
        loss = loss_func,
    )
    training_result = partial_fit_steps(classifier, x_train, y_train, n_iter)
    return training_result

In [None]:
# Baseline: XGBoost classifier with library-default hyper-parameters, fitted on
# only the first BATCH_SIZE rows of the training split.
xgb_model = xgb.XGBClassifier(use_label_encoder=False)
xgb_model.fit(x_train[: BATCH_SIZE], y_train[: BATCH_SIZE])
# Cell output: macro F1 on the hold-out split.
f1_macro_score(x_test, y_test, xgb_model)

In [None]:
# Variant using the 'dart' booster with skip_drop = 0.7 and the 'merror'
# evaluation metric, fitted on the first 3.5 * BATCH_SIZE rows (the product is
# truncated to int so it can be used as a slice bound).
xgb_model_non_default = xgb.XGBClassifier(use_label_encoder=False, booster = 'dart', skip_drop = 0.7, eval_metric = 'merror')
xgb_model_non_default.fit(x_train[: int(3.5 * BATCH_SIZE)], y_train[: int(3.5 * BATCH_SIZE)])
# Cell output: macro F1 on the hold-out split.
f1_macro_score(x_test, y_test, xgb_model_non_default)

In [None]:
# NOTE: this rebinds the notebook-wide BATCH_SIZE to 210000. Every cell run
# after this one (including partial_fit_steps via generateSGDClassifier) sees
# the new value, so results depend on execution order.
BATCH_SIZE = 70000 * 3
# 'dart' booster with explicit dropout settings, depth-4 trees, alpha = 0.1,
# the 'approx' tree method and a 5-class softmax objective.
# NOTE(review): num_class = 5 assumes the target has exactly five classes — confirm.
xgb_model_extra_param = xgb.XGBClassifier(use_label_encoder=False, booster = 'dart', one_drop = 1, rate_drop = 0, 
                                        max_depth = 4, 
                                        alpha = 0.1,
                                        tree_method = 'approx', 
                                        objective = 'multi:softmax', num_class = 5)
# Fit on the first BATCH_SIZE (now 210000) rows of the training split.
xgb_model_extra_param.fit(x_train[: BATCH_SIZE], y_train[: BATCH_SIZE])
# Cell output: macro F1 on the hold-out split.
f1_macro_score(x_test, y_test, xgb_model_extra_param)

In [None]:
# Train an SGD classifier with 'modified_huber' loss and 'elasticnet' penalty,
# running 200 partial_fit passes over each batch. Returns the per-batch
# hold-out scores and the model selected by partial_fit_steps.
# NOTE: the batch size is whatever BATCH_SIZE holds when this cell runs; with
# top-to-bottom execution that is 70000 * 3 from the preceding cell.
mod_huber_scores, mod_huber_model = generateSGDClassifier('modified_huber', 200, 'elasticnet')

In [None]:
# Gradient-boosted variant with a small learning rate (eta = 0.03), depth-5
# trees, uniform sampling and the 'approx' tree method.
# NOTE: this rebinds xgb_model_non_default, replacing the 'dart' model defined
# in an earlier cell; the submission cell below uses whichever was fitted last.
xgb_model_non_default = xgb.XGBClassifier(use_label_encoder=False, eval_metric ='merror', verbosity = 0, eta = 0.03, 
                                          max_depth = 5, sampling_method = 'uniform',
                                         tree_method = 'approx', sketch_eps = 0.01)
# Slice bounds must be integers: 3.5 * BATCH_SIZE is a float and raises a
# TypeError when used directly, so truncate it as the earlier 'dart' cell does.
xgb_model_non_default.fit(x_train[: int(3.5 * BATCH_SIZE)], y_train[: int(3.5 * BATCH_SIZE)])
# Cell output: macro F1 on the hold-out split.
f1_macro_score(x_test, y_test, xgb_model_non_default)

In [None]:
# Build a submission file from the XGBoost model's predictions.
# Load the competition test features (scipy sparse .npz archive).
TEST_NPZ = "/kaggle/input/traintestcsr-codeproject/test.npz"
test_csr = sparse.load_npz(TEST_NPZ)

# soln_df is a pickled DataFrame; its 'Target' column is (over)written with
# the predictions.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
# NOTE(review): assumes soln_df has one row per row of test_csr, in the same
# order — confirm.
soln_df = pd.read_pickle('/kaggle/input/traintestpkls-codeproject/soln_pickle.pkl')
# Uses whichever model is currently bound to xgb_model_non_default.
soln_df['Target'] = xgb_model_non_default.predict(test_csr)
soln_df.to_csv('solution_xgb_gb1.csv', index = False)

In [None]:
# Multinomial Naive Bayes on the element-wise absolute value of the features.
# NOTE(review): abs() is presumably here because MultinomialNB rejects negative
# inputs; it also maps values of opposite sign to the same feature value —
# confirm that is acceptable for this data.
nb = MultinomialNB()
nb.fit(abs(x_train), y_train)
# Cell output: macro F1 on the hold-out split (same abs() transform applied).
f1_macro_score(abs(x_test), y_test, nb)

In [None]:
# Complement Naive Bayes on non-negative features. The stored (explicit)
# entries of each sparse matrix are shifted up by the magnitude of the training
# minimum; implicit zeros are left untouched, as before.
cnb = ComplementNB()
# Take the offset from the training split only and apply it to both splits, so
# train and hold-out features go through the same transformation.
nonneg_shift = abs(x_train.min())
# Work on copies: shifting x_train.data / x_test.data in place would leak the
# transformed features into every later cell that reuses these splits (e.g. the
# logistic-regression cells, whose predictions are made on unshifted test data).
# Using `data = data + shift` rather than `+=` also avoids an in-place casting
# error when the stored values have an integer dtype.
x_train_nonneg = x_train.copy()
x_train_nonneg.data = x_train_nonneg.data + nonneg_shift
x_test_nonneg = x_test.copy()
x_test_nonneg.data = x_test_nonneg.data + nonneg_shift
# abs() is kept as a guard for hold-out values below the training minimum.
cnb.fit(abs(x_train_nonneg), y_train)
# Cell output: macro F1 on the hold-out split.
f1_macro_score(abs(x_test_nonneg), y_test, cnb)

In [None]:
# One-vs-rest logistic regression (liblinear solver) with balanced class
# weights and at most 150 iterations, fitted on the full training split.
lr1 = LogisticRegression(multi_class = 'ovr', solver = 'liblinear', class_weight = 'balanced', max_iter = 150)
lr1.fit(x_train, y_train)
# Cell output: macro F1 on the hold-out split.
f1_macro_score(x_test, y_test, lr1)

In [None]:
# Same configuration as lr1 except for the iteration cap (200 instead of 150).
lr2 = LogisticRegression(multi_class = 'ovr', solver = 'liblinear', class_weight = 'balanced', max_iter = 200)
lr2.fit(x_train, y_train)
# Cell output: macro F1 on the hold-out split.
f1_macro_score(x_test, y_test, lr2)

In [None]:
# Build a submission file from the logistic-regression predictions.
# This repeats the test-set loading done in the XGBoost submission cell.
TEST_NPZ = "/kaggle/input/traintestcsr-codeproject/test.npz"
test_csr = sparse.load_npz(TEST_NPZ)

# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
soln_df = pd.read_pickle('/kaggle/input/traintestpkls-codeproject/soln_pickle.pkl')
# Predictions come from lr1 (max_iter = 150), not lr2.
soln_df['Target'] = lr1.predict(test_csr)
soln_df.to_csv('solution_lr.csv', index = False)

In [None]:
# Preview the first rows of the most recently built submission frame.
soln_df.head()