In [73]:
import numpy as np
from intfeat import StrumLiouvilleTransformer
import optuna
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import SplineTransformer, KBinsDiscretizer, StandardScaler, FunctionTransformer

In [74]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Generate dataset

In [75]:
X_train, y_train = fetch_20newsgroups(return_X_y=True, subset='train')
X_test, y_test = fetch_20newsgroups(return_X_y=True, subset='test')
num_tokens = 20
num_funcs = 20

In [76]:
y_train = (y_train == 10)
y_test = (y_test == 10)

In [77]:
logreg_params = dict(solver='newton-cholesky', Cs=20)

In [78]:
def make_tfidf_pipeline():
    return make_pipeline(
        TfidfVectorizer(),
        SelectKBest(k=num_tokens),
        LogisticRegressionCV(**logreg_params)
    )

In [79]:
def make_count_pipeline():
    return make_pipeline(
        CountVectorizer(),
        SelectKBest(k=num_tokens),
        FunctionTransformer(lambda X: X.toarray(), accept_sparse=True),
        StandardScaler(),
        LogisticRegressionCV(**logreg_params)
    )

In [80]:
def make_count_bins_pipeline(strategy='uniform'):
    return make_pipeline(
        CountVectorizer(),
        SelectKBest(k=num_tokens),
        FunctionTransformer(lambda X: X.toarray(), accept_sparse=True),
        KBinsDiscretizer(n_bins=num_funcs, strategy=strategy, quantile_method='averaged_inverted_cdf'),
        LogisticRegressionCV(**logreg_params, fit_intercept=False)
    )

In [81]:
def make_spline_pipeline(knots='uniform', degree=3):
    n_knots = num_funcs - (degree - 1)
    return make_pipeline(
        CountVectorizer(),
        SelectKBest(k=num_tokens),
        FunctionTransformer(lambda X: X.toarray(), accept_sparse=True),
        SplineTransformer(n_knots=n_knots, knots=knots, degree=degree, include_bias=False),
        LogisticRegressionCV(**logreg_params)
    )

In [82]:
def make_count_sl_pipeline(curvature_gamma=0.8):
    return make_pipeline(
        CountVectorizer(),
        SelectKBest(k=num_tokens),
        StrumLiouvilleTransformer(num_funcs=num_funcs, curvature_gamma=curvature_gamma),
        LogisticRegressionCV(**logreg_params)
    )

In [83]:
pipeline = make_tfidf_pipeline().fit(X_train, y_train)
roc_auc_score(y_test, pipeline.predict(X_test))

0.8502377140102465

In [84]:
pipeline = make_count_pipeline().fit(X_train, y_train)
roc_auc_score(y_test, pipeline.predict(X_test))

0.8099886615459158

In [85]:
pipeline = make_count_bins_pipeline().fit(X_train, y_train)
roc_auc_score(y_test, pipeline.predict(X_test))

0.8128454108775374

In [86]:
pipeline = make_count_bins_pipeline(strategy='quantile').fit(X_train, y_train)
roc_auc_score(y_test, pipeline.predict(X_test))

0.5

In [87]:
pipeline = make_spline_pipeline().fit(X_train, y_train)
roc_auc_score(y_test, pipeline.predict(X_test))

0.8168150995742546

In [88]:
pipeline = make_spline_pipeline(knots='quantile').fit(X_train, y_train)
roc_auc_score(y_test, pipeline.predict(X_test))

0.8187605913704772

In [89]:
pipeline = make_count_sl_pipeline().fit(X_train, y_train)
roc_auc_score(y_test, pipeline.predict(X_test))

0.8153516765416977