In [1]:
import pandas as pd
import numpy as np 
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.metrics import mean_absolute_error, accuracy_score, log_loss, make_scorer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import FeatureAgglomeration
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from numpy.testing import assert_almost_equal
from functools import reduce
import matplotlib.pyplot as plt

from scipy.special import comb as choose, factorial
comb = lambda x,y: choose(x,y, exact=True)
def the_count(k, n): 
    # for polynomial regression feature counting
    return comb(n+k, k)
def the_count_recursive(k,n):
    if k==0: return 1
    else: return comb(n+k-1, k) + the_count_recursive(k-1, n)


In [25]:
## KAGGLE bioresponse https://www.kaggle.com/c/bioresponse#Evaluation

train_url = 'data/train.csv'
test_url = 'data/test.csv' ## iggnoring-- it doesn't have 'Activity' 

df_ = pd.read_csv(train_url)
#df_test = pd.read_csv(test_url) # doesn't have 'Activity'
assert all([x==0 for x in df_.isna().sum().values])
assert all([pd.api.types.is_numeric_dtype(df_[feat]) for feat in df_.columns])
dependent='Activity'

X_train, X_test, y_train, y_test = train_test_split(df_.drop(dependent, axis=1), 
                                                    df_[dependent], 
                                                    train_size=0.8, test_size=0.2)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# df_.dtypes.value_counts()

# #df_.describe()

# df_.mean().array.mean()

print(np.divide(df_.shape[0], df_.shape[1]))

(3000, 1776) (751, 1776) (3000,) (751,)
2.1108610016882388


In [22]:
%%time 

M = df_.shape[1]
print(the_count_recursive(M, 4))
# pipeline1 = make_pipeline(
#     #ce.OneHotEncoder(use_cat_names=True), # no categoricals. 
#     #StandardScaler(), # means are very much near zero anyway, we don't really need this. 
#     LogisticRegression()
# )


# pipeline1.fit(X_train, y_train)
# y_pred = pipeline1.predict(X_test)
# #accuracy_score(y_test, y_pred)
# log_loss(y_test, y_pred)

# scores = cross_val_score(pipeline1, X_train, y_train, cv=10, scoring='neg_log_loss') 

# scores#.mean() # don't run this a lot because it is EXPENSIVE

# # when test_size=0.15, scores.mean is better than accuracy_score... but when test_size=0.2, scores.mena is WORSE. 

417811495465
CPU times: user 2.06 ms, sys: 0 ns, total: 2.06 ms
Wall time: 2.03 ms


###  The above is the "dumbest possible model" we start with. Now we're going to play more. 

In [7]:
# from https://scikit-learn.org/stable/modules/compose.html#pipeline


# # We create the preprocessing pipelines for both numeric and categorical data.
# numeric_features = ['age', 'fare']
# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler())])

# categorical_features = ['embarked', 'sex', 'pclass']
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, numeric_features),
#         ('cat', categorical_transformer, categorical_features)])

# # Append classifier to preprocessing pipeline.
# # Now we have a full prediction pipeline.
# clf = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('classifier', LogisticRegression(solver='lbfgs'))])



lets dimension-reduction

we'll split the data into two different things, ints and floats. 

a commenter [here](https://stats.stackexchange.com/questions/159705/would-pca-work-for-boolean-binary-data-types) said that a "cosine"ish version of PCA is more appropriate for bool 0,1. 

### also, We're gonna use the heuristic that, for N=#observations and M=#features, N > 5M

In [149]:
%%time

int_feats = df_.drop(dependent, axis=1).select_dtypes(include='int').columns
float_feats = df_.drop(dependent, axis=1).select_dtypes(include='float').columns

ints_weight = np.divide(len(int_feats), df_.shape[1]-1)
floats_weight = np.divide(len(float_feats), df_.shape[1]-1)
assert_almost_equal(ints_weight+floats_weight, 1, 3)

B = 25
n = int(np.divide(df_.shape[0]-1, B))
int_compon = int(ints_weight * n)
float_compon = int(floats_weight * n)
assert_almost_equal(float_compon+int_compon+1, n, 3)

discrete_transformer = Pipeline(steps=[
     ('exp', FunctionTransformer(np.exp, validate=True)),
     ('FA', FeatureAgglomeration(n_clusters=int_compon, linkage='average', affinity='cosine', 
                                 memory='cache/'))])

continuous_transformer = Pipeline(steps=[
    ('exp', FunctionTransformer(np.exp, validate=True)),
    ('pca', PCA(n_components=float_compon))
       ])

preprocessor = ColumnTransformer(
    transformers=[
        ('discrete', discrete_transformer, int_feats),
        ('continuous', continuous_transformer, float_feats)])

# # Append classifier to preprocessing pipeline.
### Now we have a full prediction pipeline.
clf = Pipeline(steps=[#('logarithm', FunctionTransformer(np.log1p)),
                      ('preprocessor', preprocessor),
                      #('logarithm', FunctionTransformer(np.log1p)),
                      ('scaler', StandardScaler(with_std=False)),
                      ('classifier', SGDClassifier(loss='log', verbose=10,
                                                   tol=np.exp(-B * 0.1), max_iter=1234))])





CPU times: user 801 ms, sys: 60.1 ms, total: 861 ms
Wall time: 859 ms


In [154]:
%%time

grid_parameters = {'classifier__penalty': ['l1', 'l2']}

scr = GridSearchCV(clf, df_.drop(dependent, axis=1), df_[dependent], #scoring='neg_log_loss', 
                   cv=3, n_jobs=3)#, param_grid=grid_parameters)

scr.fit(X_train, y_train)
scr.best_estimator_
#print(f'we came to a score mean of {-scr.mean()}, which is pretty good.')
# am i crazy or is this an extremely good result? 

ValueError: Parameter values for parameter (D1) need to be a sequence(but not a string) or np.ndarray.

In [146]:
clf.fit(X_train, y_train)
predictions = [t[0] for t in clf.predict_proba(X_test)]

print('Our observations outnumber our model features by a ' + \
      f'factor of {df_.shape[1] / (clf.steps[-1][1].coef_).shape[1]}')

feat_agglom = clf.steps[0][1].transformers[0][1].steps[1][1]
p_c_a = clf.steps[0][1].transformers[1][1].steps[1][1]

feat_agglom, p_c_a

clf.steps[-1][1]

Our observations outnumber our model features by a factor of 11.926174496644295


SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1234,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True,
       tol=0.0820849986238988, validation_fraction=0.1, verbose=0,
       warm_start=False)

In [91]:
-scr.mean() ## this was the best i got! 

1.1492562919266491

In [81]:
help(StandardScaler)

Help on class StandardScaler in module sklearn.preprocessing.data:

class StandardScaler(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin)
 |  Standardize features by removing the mean and scaling to unit variance
 |  
 |  The standard score of a sample `x` is calculated as:
 |  
 |      z = (x - u) / s
 |  
 |  where `u` is the mean of the training samples or zero if `with_mean=False`,
 |  and `s` is the standard deviation of the training samples or one if
 |  `with_std=False`.
 |  
 |  Centering and scaling happen independently on each feature by computing
 |  the relevant statistics on the samples in the training set. Mean and
 |  standard deviation are then stored to be used on later data using the
 |  `transform` method.
 |  
 |  Standardization of a dataset is a common requirement for many
 |  machine learning estimators: they might behave badly if the
 |  individual features do not more or less look like standard normally
 |  distributed data (e.g. Gaussian with 0 mean 

In [68]:

np.exp([k for k in [0.1, 0.2, 0.3, 0.4]])


array([1.10517092, 1.22140276, 1.34985881, 1.4918247 ])

In [131]:
sklearn.__version__

'0.20.2'

In [156]:
# a = (1,2,3)

len(a)

3