In [1]:
import numpy as np
import pandas as pd

from scipy import sparse
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import CategoricalNB
# from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from sklearn.metrics import f1_score

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/traintestcsr-codeproject/train.npz
/kaggle/input/traintestcsr-codeproject/test.npz
/kaggle/input/traintestpkls-codeproject/train_pickle.pkl
/kaggle/input/traintestpkls-codeproject/soln_pickle.pkl
/kaggle/input/traintestpkls-codeproject/test_pickle.pkl
/kaggle/input/data-for-codeproject-contest/our_contest_train_2.csv
/kaggle/input/data-for-codeproject-contest/our_contest_sample_solution_2.csv
/kaggle/input/data-for-codeproject-contest/our_contest_test.csv


In [2]:
TRAIN_NPZ = "/kaggle/input/traintestcsr-codeproject/train.npz"
complete_csr = sparse.load_npz(TRAIN_NPZ)

TRAIN_CSV = "/kaggle/input/data-for-codeproject-contest/our_contest_train_2.csv"
targets = pd.read_csv(TRAIN_CSV)['Target']

In [3]:
x_train, x_test, y_train, y_test = train_test_split(complete_csr, targets, test_size = 0.33, random_state = 42, stratify = targets)

In [4]:
def f1_macro_score(x_test, y_test, model):
    y_pred = model.predict(x_test)
    return f1_score(y_test, y_pred, labels=None, pos_label=1, average='macro', sample_weight=None, zero_division='warn')

In [5]:
def partial_fit_steps(model, x_train, y_train, n_iter):
    scores = []
    max_score = 0
    best_model = model
    for i in range(x_train.shape[0] // BATCH_SIZE):
        for j in range(n_iter):
            model.partial_fit(x_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE], y_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE], classes = np.unique(y_train))
        score = f1_macro_score(x_test, y_test, model)
        if score > max_score:
            best_model = model
            max_score = score
        scores.append(score)
        print(score)
    return scores, best_model

In [6]:
BATCH_SIZE = 70000

from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(loss = 'log')
sgd_acc, sgd_best = partial_fit_steps(sgd, x_train, y_train, 200)

0.19788686390360508
0.23146710747660187
0.22604996510863726
0.2568111160552469
0.2061677611250023
0.23078309398347815
0.23780151256158635
0.2200652237442824
0.24558913393034
0.22040140822442794
0.1986620352940521
0.24807427066019008
0.21878369148293442
0.2313736394435269
0.22833042493427577
0.2503783140826362


In [7]:
TEST_NPZ = "/kaggle/input/traintestcsr-codeproject/test.npz"
test_csr = sparse.load_npz(TEST_NPZ)

soln_df = pd.read_pickle('/kaggle/input/traintestpkls-codeproject/soln_pickle.pkl')
soln_df['Target'] = sgd_best.predict(test_csr)
soln_df.to_csv('solution_lr.csv', index = False)

In [8]:
from sklearn.linear_model import PassiveAggressiveClassifier

pac = PassiveAggressiveClassifier()
acc_scores = partial_fit_steps(pac, x_train, y_train, 50)

0.20711940585158634
0.21193015483988548
0.2046574372895305
0.21215078727414877
0.21131991698292624
0.20899110983801802
0.21463292712509058
0.22237307271019552
0.24247068409366457
0.2170444424618013
0.22187739408341273
0.25420789445825165
0.23359404921355026
0.21081554101343847
0.22826019833930475
0.2148072789033233


In [9]:
# nb = MultinomialNB()
# x_train.data += abs(x_train.min())
# nb.fit(abs(x_train), y_train)
# x_test.data += abs(x_test.min())
# f1_macro_score(abs(x_test), y_test, nb)

In [10]:
# cnb = ComplementNB()
# x_train.data += abs(x_train.min())
# cnb.fit(abs(x_train), y_train)
# x_test.data += abs(x_test.min())
# f1_macro_score(abs(x_test), y_test, cnb)

In [11]:
# lr1 = LogisticRegression(multi_class = 'ovr', solver = 'liblinear', class_weight = 'balanced', max_iter = 150)
# lr1.fit(x_train, y_train)
# f1_macro_score(x_test, y_test, lr1)

In [12]:
# lr2 = LogisticRegression(multi_class = 'ovr', solver = 'liblinear', class_weight = 'balanced', max_iter = 200)
# lr2.fit(x_train, y_train)
# f1_macro_score(x_test, y_test, lr2)

In [13]:
# TEST_NPZ = "/kaggle/input/traintestcsr-codeproject/test.npz"
# test_csr = sparse.load_npz(TEST_NPZ)

# soln_df = pd.read_pickle('/kaggle/input/traintestpkls-codeproject/soln_pickle.pkl')
# soln_df['Target'] = lr1.predict(test_csr)
# soln_df.to_csv('solution_lr.csv', index = False)

In [14]:
# soln_df.head()