In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score 
from imblearn.over_sampling import *
from tensorflow import keras
import tensorflow as tf
import scipy.stats as st
import statsmodels.api as sm
import matplotlib
import matplotlib.pyplot as plt
import warnings
from fastai_timeseries import *
from numba import njit
import xgboost as xgb
np.random.seed(813306)
 

df = pd.read_excel(r'classification homework.xlsx', sheet_name='Sheet1')
X = df.iloc[:-2,2:]
X = np.array(X.iloc[:,np.where(X.apply(lambda x: np.max(x)- np.min(x)) != 0)[0]])
y = np.array(pd.Categorical(df.iloc[:-2,0]).codes) 

In [33]:
# Baseline
loo = LeaveOneOut()
loo.get_n_splits(X)
ytests = []; ypreds = []
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model =  make_pipeline(StandardScaler(),SGDClassifier(max_iter=1000, tol=1e-15))
    model.fit(X=X_train, y=y_train)
    ypreds.append(model.predict(X_test))
    ytests.append(y_test)

accuracy_score(ypreds,ytests)

0.8333333333333334

In [172]:
# Create models from data
def best_fit_distribution(data, bins=200, ax=None):
    """Model data by finding best fit distribution to data"""
    # Get histogram of original data
    y, x = np.histogram(data, bins=bins, density=True)
    x = (x + np.roll(x, -1))[:-1] / 2.0

    # Distributions to check
    DISTRIBUTIONS = [        
        st.alpha,st.anglit,st.arcsine,st.beta,st.betaprime,st.bradford,st.burr,st.cauchy,st.chi,st.chi2,st.cosine,
        st.dgamma,st.dweibull,st.erlang,st.expon,st.exponnorm,st.exponweib,st.exponpow,st.f,st.fatiguelife,st.fisk,
        st.foldcauchy,st.foldnorm,st.frechet_r,st.frechet_l,st.genlogistic,st.genpareto,st.gennorm,st.genexpon,
        st.genextreme,st.gausshyper,st.gamma,st.gengamma,st.genhalflogistic,st.gilbrat,st.gompertz,st.gumbel_r,
        st.gumbel_l,st.halfcauchy,st.halflogistic,st.halfnorm,st.halfgennorm,st.hypsecant,st.invgamma,st.invgauss,
        st.invweibull,st.johnsonsb,st.johnsonsu,st.ksone,st.kstwobign,st.laplace,st.levy,st.levy_l,st.levy_stable,
        st.logistic,st.loggamma,st.loglaplace,st.lognorm,st.lomax,st.maxwell,st.mielke,st.nakagami,st.ncx2,st.ncf,
        st.nct,st.norm,st.pareto,st.pearson3,st.powerlaw,st.powerlognorm,st.powernorm,st.rdist,st.reciprocal,
        st.rayleigh,st.rice,st.recipinvgauss,st.semicircular,st.t,st.triang,st.truncexpon,st.truncnorm,st.tukeylambda,
        st.uniform,st.vonmises,st.vonmises_line,st.wald,st.weibull_min,st.weibull_max,st.wrapcauchy
    ]

    # Best holders
    best_distribution = st.norm
    best_params = (0.0, 1.0)
    best_sse = np.inf

    # Estimate distribution parameters from data
    for distribution in DISTRIBUTIONS:

        # Try to fit the distribution
        try:
            # Ignore warnings from data that can't be fit
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore')

                # fit dist to data
                params = distribution.fit(data)

                # Separate parts of parameters
                arg = params[:-2]
                loc = params[-2]
                scale = params[-1]

                # Calculate fitted PDF and error with fit in distribution
                pdf = distribution.pdf(x, loc=loc, scale=scale, *arg)
                sse = np.sum(np.power(y - pdf, 2.0))

                # if axis pass in add to plot
                try:
                    if ax:
                        pd.Series(pdf, x).plot(ax=ax)
                    end
                except Exception:
                    pass

                # identify if this distribution is better
                if best_sse > sse > 0:
                    best_distribution = distribution
                    best_params = params
                    best_sse = sse

        except Exception:
            pass

    return (best_distribution.name, best_params)

def make_pdf(dist, params, size=10000):
    """Generate distributions's Probability Distribution Function """

    # Separate parts of parameters
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]

    # Get sane start and end points of distribution
    start = dist.ppf(0.01, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.01, loc=loc, scale=scale)
    end = dist.ppf(0.99, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.99, loc=loc, scale=scale)

    # Build PDF and turn into pandas Series
    x = np.linspace(start, end, size)
    y = dist.pdf(x, loc=loc, scale=scale, *arg)
    pdf = pd.Series(y, x)

    return pdf

def gen_noise(data, dists, params):
    """Generate distributions's Probability Distribution Function """
    for i in range(data.shape[1]):
        dist = dists[i]
        para = params[i]
        arg = para[:-2]
        loc = para[-2]
        scale = para[-1]
        # Creating noise given distribution
        noise = 0.1 * dist.rvs(loc=loc, scale=scale, *arg,size=len(data))
        data[:,i] += noise 
    return data


In [42]:
# Fit best distribution
best_fit_names = []; best_fit_params = []
for i in range(X.shape[1]):
    best_fit_name, best_fit_param = best_fit_distribution(X.iloc[:,i].values, 50)
    print(best_fit_name,best_fit_param)
    best_fit_names.append(best_fit_name) 
    best_fit_params.append(best_fit_param)

nakagami (0.23805730054933533, -6.757214229576884e-30, 0.022350070287256746)


In [62]:
### Read parameters
para = pd.read_csv('para.csv',header=None,names=['distri','para'])
best_dists = []; paras =[]
for row in para.iterrows():
    row = row[1]
    best_dist = getattr(st, row['distri'].strip())
    para_one = [float(i) for i in row['para'].split(',')]
    best_dists.append(best_dist)
    paras.append(para_one)

In [241]:
# First approach: row-wise Augmentation and basic classifier
AUGMENTATION = 10
from sklearn.model_selection import KFold
loo = LeaveOneOut()
loo.get_n_splits(X)
ytests = []; ypreds = []
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    gen_data = pd.DataFrame(X_train)
    for i in range(AUGMENTATION):
        tmp_data = gen_noise(X_train,best_dists,paras)
        gen_data = gen_data.append(pd.DataFrame(tmp_data),ignore_index=True)
    X_train = np.array(gen_data)
    y_train = np.tile(y_train,AUGMENTATION + 1)
    model =  make_pipeline(StandardScaler(),SGDClassifier(max_iter=1000, tol=1e-15))
    model.fit(X=X_train, y=y_train)
    ypreds.append(model.predict(X_test))
    ytests.append(y_test)

accuracy_score(ypreds,ytests) 

0.6060606060606061

In [34]:
# Second approach: Column wise augmentation
# https://arxiv.org/abs/1910.13051
loo = LeaveOneOut()
loo.get_n_splits(X)
ytests = []; ypreds = []
counter = 0
for train_index, test_index in loo.split(X):
    classifier = xgb.XGBClassifier(max_depth=3,
                              learning_rate=0.1,
                               n_estimators=100,
                               verbosity=0,
                               objective='multi:softprob',
                               booster='gbtree',
                               tree_method='auto',
                               n_jobs=7,
                               gpu_id=0,
                               gamma=0,
                               min_child_weight=1,
                               max_delta_step=0, #4
                               subsample=.5,
                               colsample_bytree=1,
                               colsample_bylevel=1,
                               colsample_bynode=1,
                               reg_alpha=0,
                               reg_lambda=1,
                               scale_pos_weight=1,
                               base_score=0.5,
                               random_state=0,
                               missing=None)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    kernels = generate_kernels(X.shape[1], 1000)
    X_training_transform = apply_kernels(X_train, kernels)
    gc.collect()
    X_test_transform = apply_kernels(X_test, kernels)
    gc.collect()
    X_training_transform = pd.DataFrame(X_training_transform)
    X_test_transform = pd.DataFrame(X_test_transform)
    X_training_transform = X_training_transform.replace([np.inf, -np.inf], np.nan)
    to_drop = X_training_transform.columns[X_training_transform.isna().any()]
    X_test_transform = X_test_transform.replace([np.inf, -np.inf], np.nan)
    to_drop2 = X_test_transform.columns[X_test_transform.isna().any()]
    to_drop = np.concatenate((to_drop,to_drop2))
    X_training_transform = X_training_transform.drop(to_drop,axis=1)
    X_test_transform = X_test_transform.drop(to_drop,axis=1)
    model =  make_pipeline(StandardScaler(),classifier)
    model.fit(X=X_training_transform, y=y_train)
    ypreds.append(model.predict(X_test_transform))
    ytests.append(y_test)

#accuracy_score(np.concatenate(np.array(ypreds)),np.concatenate(np.array(ytests))) 
accuracy_score(ypreds,ytests) 

0.9545454545454546