In [1]:
import numpy as np
import pandas as pd
import Perceptron.perceptron as pn
from Perceptron.data_gen import Universe, separable_regression, data_distribution
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, zero_one_loss
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import datasets
import matplotlib.pyplot as plt
from collections import defaultdict
import pickle
import math
from tqdm import tqdm
from argparse import ArgumentParser, RawTextHelpFormatter 
import random
from scipy.io import arff
import matplotlib.pyplot as plt


## Data Corruption Experiment
from typing import List, Tuple
import random


In [2]:
# Utility Functions
def pickle_data(
    root_dir, 
    results,
    args):
    
    # Make sure it is a directory!
    if root_dir[-1] != '/':
        root_dir += '/'
    
    # Create pickle structure
    pkl = {
        'results': dict(results),
        'args':    args,
    }
    
    
    # Create file name

    file_name = f"{args.label}_test_size_{args.test_size}.pkl"
    
    with open(f"{root_dir}{file_name}", 'wb') as pkl_file:
        pickle.dump(pkl, pkl_file)


## Theoretical Machine Learning Functions

The functions bellow provide wrappers for better interpreting theory from the literature.

In [3]:
# Experiment required functions
def sample_data(
    lows:      List[float],
    highs:     List[float],
    n_samples: int,
    seed:      int=None
) -> List[List[int]]:
    """Sample uniform distribution bounded by lows and highs
    
        Using a uniform distribution, perform sampling over the 
    distribution such that the space the distribution is sampling will 
    be bounded by the given bounds from the lows and highs. Lows and 
    highs will be arrays that contain the minimum and maximum values 
    per dimension on the data to be samples. For example, if we have 4 
    values in both lows and highs, then, at the time of sampling n_samples
    samples we will have n_samples of 4 attributes each: (n_samples, 4).
    """
    
    assert len(lows) == len(highs), f"Non-matching lows and highs: {len(lows) != {len(highs)}}"
    
    rng = np.random.default_rng(seed)
    data_shape = (n_samples, len(lows)) # See assertion #1
    data = rng.uniform(lows, highs, data_shape)
    return data

# splitting the dataset into bins can be done with: np.split(data, n_buckets)
# Recommend shuffling beforehand tho.

class Concept:
    """Label given data
    Using a model as truth, label given data.
    """
    def __init__(self, model):
        self.model = model
        
    def __call__(self, X):
        return self.model.solve(X)
    
    
class NPolynomial:
    
    def __init__(self, 
                 n:    int, 
                 low:  float=0, 
                 high: float=1, 
                 seed: int=42
                ):
        self.n     = n
        self.seed  = seed
        self.low   = low
        self.high  = high
        rng        = np.random.default_rng(seed)
        self.coeff = rng.uniform(low, high, (n, 1))
        self.exps  = [exp for exp in range(n)[::-1]]
        
    def solve(self, vals):
        var = np.power(vals, self.exps)
        activation = np.sign(var @ self.coeff)                                        
        activation[activation == 0] = -1
        return activation
    


## Data Corruption Functions and Experiment

These functions are used for carrying out data corruption and experiments.

In [4]:
'''Data Corruption Code for Experiments'''

def perceptron_data_corruption(
        train_data,
        train_labels,
        test_data,
        test_labels,
        model_params,
        verbose,
        history,
        seed,
        ):
        '''Corrupt given bucketized data

        Parameters
        ----------

        train_data
            Training data that is already bucketized
        train_labels
            Training labels that is already bucketized
        test_data
            Testing dataset
        test_labels
            Testing dataset labels
        model_params
            Perceptron model hyperparameters to train on
        verbose
            specify verbosity of messages
        history
            Shall be a dictionary that has initialized key-value pairs.
            The keys shall contain all of the buckets to be used. The
            values shall be lists that may or may not already contain
            scores from previous runs.
        seed
            Random seed used when choosing indices
        '''

        # Calculate number of buckets user is passing.
        n_buckets = len(train_data) # could add min/max params to specify buckets!
        
        rng = np.random.default_rng(seed)
        L_values = []
        
        # Begin with high corruption and then add more buckets
        for buckets in range(1, n_buckets):

            if verbose > 1:
                print(f"\tBuckets used: {buckets}")
            # Choose buckets to be used.
            indices = rng.choice(range(1, len(train_data)), size=buckets, replace=False)
            X       = np.concatenate(train_data[indices])
            Y       = np.concatenate(train_labels[indices])
            if verbose > 2:
                print(f"\tData points used: {len(X)}")

            # Train model
            model = pn.PocketPerceptron(**model_params)
            model.train(X, Y)
            pred = model.solve(test_data)

            # Measure zero-one & store
            score = accuracy_score(pred, test_labels)
            #score_list.append(score)

            if verbose > 3:
                print(f"\t\tScore: {score}")
            history[buckets].append(score)
            # Used by Gallant's learning bound.
            L_values.append(np.linalg.norm(model.W))
            
        history['L'].append(L_values)
        
        return history
    

def perceptron_corruption_experiment(
    X,
    y,
    test_size,
    n_buckets,
    model_params,
    n_runs,
    seed,
    verbose,
    ):
    '''Conduct corruption experiment and report results
    
    parameters
    ----------
    X
        Dataset to train/test on.
    y
        Labels of dataset to train/test on.
    test_size
        Size of testing dataset to be split into. See StratifiedShuffleSplit f-
        rom sklearn.
    n_buckets
        Number of buckets to split training data into.
    model_parameters
        Dictionary containing Pocket Perceptron algorithm's constructor parame-
        ters. See Perceptron.perceptron.PocketPerceptron for list.
    n_runs
        Number of experiment iterations where during each iteration data is co-
        rrupted progressively.
    seed
        Random seed for generators used in concept and model initialization.
    verbose
        Specify verbosity of output.
    '''
    
    # Will have n_runs scores per bucket size used for training.
    history   = {buckets: [] for buckets in range(1, n_buckets)}
    # Magnitude of learned vector. Will contain lists of magnitudes.
    history['L'] = []
    

    
    for run in range(n_runs):
        if verbose > 0:
            print(f"Run #{run}")
            
        '''Creation of training/testing datasets (bucketized)'''
        sss = StratifiedShuffleSplit(
            n_splits=1, 
            test_size=test_size, 
            random_state=seed + run # This way data is shuffled differently every run!
        )
        for train_i, test_i in sss.split(X, y):
            train_data, train_labels = X[train_i], y[train_i]
            test_data, test_labels = X[test_i], y[test_i]
        # We just need to bucketize the training data now (Testing data used as is)
        train_data   = np.array_split(train_data, n_buckets) # split rises exception if not even!
        train_data   = np.array(train_data) # Helps in keeping bucket structure
        train_labels = np.array_split(train_labels, n_buckets)
        train_labels = np.array(train_labels)


        ''' Conduct corruption and obtain scores '''
        history = perceptron_data_corruption(
            train_data,
            train_labels,
            test_data,
            test_labels,
            model_params,
            verbose,
            history,
            seed=run,
        )
        
    return history

# Data Corruption Experiment
## Synthetic Dataset -- Linearly Separable

In [None]:
# Lower and upper bounds for data distribution PER dimension
lows      = [-10, -10, -10, -10] + [1] # Bias added by use of [1]
highs     = [10, 10, 10, 10] + [1]
ins       = 5 # 4 attributes, 1 bias
data_size = 1_200

# Perceptron hyper-parameters
model_params = {
    'input'      : ins,
    'eta'        : 0.5,
    'max_iter'   : 2_000,
    'rand_seed'  : None,
    'ignore_flag': False,
}

'''Choose some concept to learn'''
rng      = np.random.default_rng(42) # For reproducibility
W        = np.concatenate([ rng.uniform(-100, 100, (ins, 1)) ])
truth    = pn.PocketPerceptron()
truth.pi = truth.W = W
concept  = Concept(truth) # Concept is just a wrapper. Do truth.predict for same result

'''Sample training and testing data'''
# We sample separately the data from the uniform distribution. Then, we label according
# to the concept (perceptron with weights W)
data   = sample_data(lows, highs, n_samples=data_size, seed=42)
data   = np.array(data) # Helps in keeping bucket structure
labels = concept(data)


history_syn_lin = perceptron_corruption_experiment(
    X               = data,
    y               = labels,
    test_size       = 0.2,
    n_buckets       = 100,
    model_params    = model_params,
    n_runs          = 100,
    seed            = 42,
    verbose         = 2
)
'''
# Testing Purposes
history_syn_lin = perceptron_corruption_experiment(
    X               = data,
    y               = labels,
    test_size       = 0.2,
    n_buckets       = 20,
    model_params    = model_params,
    n_runs          = 10,
    seed            = 42,
    verbose         = 2
)
'''

In [None]:
pd.DataFrame(history_syn_lin).mean().plot(figsize=(10, 5))
plt.grid(True)
plt.title("Buckets vs Zero-One Loss; Synthetic Separable")
plt.xlabel("Number of Buckets")
plt.ylabel("Zero-One Loss")
plt.xticks(range(0, 51, 5))
plt.yticks(np.linspace(0.82, 0.95, 14))

In [None]:
pd.DataFrame(history_syn_lin).boxplot(figsize=(15, 7))

## Synthetic Dataset -- Non-Linearly Separable

In [None]:
# Lower and upper bounds for data distribution PER dimension
lows      = [-10, -10, -10, -10] + [1] # Bias added by use of [1]
highs     = [10, 10, 10, 10] + [1]
ins       = 5 # 4 attributes, 1 bias
data_size = 1_200

# Perceptron hyper-parameters
model_params = {
    'input'      : ins,
    'eta'        : 0.5,
    'max_iter'   : 2_000,
    'rand_seed'  : None,
    'ignore_flag': True,
}

'''Choose some concept to learn'''
truth    = NPolynomial(ins, -10, 10, 42)
concept  = Concept(truth) # Concept is just a wrapper. Do truth.predict for same result

'''Sample training and testing data'''
# We sample separately the data from the uniform distribution. Then, we label according
# to the concept (perceptron with weights W)
data   = sample_data(lows, highs, n_samples=data_size, seed=42)
data   = np.array(data) # Helps in keeping bucket structure
labels = concept(data)

'''
history_non_syn_lin = perceptron_corruption_experiment(
    X               = data,
    y               = labels,
    test_size       = 0.2,
    n_buckets       = 100,
    model_params    = model_params,
    n_runs          = 100,
    seed            = 42,
    verbose         = 2
)
'''
# Testing Purposes
history_non_syn_lin = perceptron_corruption_experiment(
    X               = data,
    y               = labels,
    test_size       = 0.2,
    n_buckets       = 20,
    model_params    = model_params,
    n_runs          = 10,
    seed            = 42,
    verbose         = 2
)


In [None]:
pd.DataFrame(history_non_syn_lin).mean().plot(figsize=(10, 5))
plt.grid(True)
plt.title("Buckets vs Zero-One Loss; synthetic Non-Separable")
plt.xlabel("Number of Buckets")
plt.ylabel("Zero-One Loss")
plt.xticks(range(0, 51, 5))
plt.yticks(np.linspace(0.82, 0.95, 14))

In [None]:
pd.DataFrame(history_non_syn_lin).boxplot(figsize=(15, 7))

## Real World Data -- Skin/No Skin Non-Separable

In [None]:
# Import and prepare data
data = arff.loadarff('./datasets/skinNoSkin.arff')
skin = pd.DataFrame(data[0])
skin['bias'] = 1
# Data cleaning
skin.replace(b'1', -1, inplace=True)
skin.replace(b'2', 1, inplace=True)
data = skin.drop('Class', axis=1).assign(bias=1)
data = data.to_numpy()
labels = skin.Class



# Perceptron hyper-parameters
ins = data.shape[-1]
model_params = {
    'input'      : ins,
    'eta'        : 0.5,
    'max_iter'   : 2_000,
    'rand_seed'  : None,
    'ignore_flag': True,
}



'''
skin_history        = perceptron_corruption_experiment(
    X               = data,
    y               = labels,
    test_size       = 0.2,
    n_buckets       = 100,
    model_params    = model_params,
    n_runs          = 100,
    seed            = 42,
    verbose         = 2
)
'''
# Testing Purposes
skin_history        = perceptron_corruption_experiment(
    X               = data[:200],
    y               = labels[:200],
    test_size       = 0.2,
    n_buckets       = 20,
    model_params    = model_params,
    n_runs          = 10,
    seed            = 42,
    verbose         = 2
)


In [None]:
pd.DataFrame(skin_history).mean().plot(figsize=(10, 5))
plt.grid(True)
plt.title("Buckets vs Zero-One Loss; Skin/No Skin")
plt.xlabel("Number of Buckets")
plt.ylabel("Zero-One Loss")
plt.xticks(range(0, 51, 5))
plt.yticks(np.linspace(0.82, 0.95, 14))

In [None]:
pd.DataFrame(skin_history).boxplot(figsize=(15, 7))

### Save results as pickle

In [None]:

skin_pickle = {
    'history':    skin_history,
    'n_data':     500, # We know from data-set description. 
    'test_split': 100000,
    'n_runs':     n_runs,
    'n_buckets':  n_buckets,
    'max_iter':   max_iter,
    'n_attribs':  n_attribs,
}
with open('skin_results.pkl', 'wb') as pkl:
    pickle.dump(skin_pickle, pkl)
    
#with open('skin_pickle.pkl', 'rb') as pkl:
#    some_dict = pickle.load(pkl)

## Real World Data -- Iris Separable

In [None]:
iris = datasets.load_iris()

data = pd.DataFrame(iris.data)
data['bias'] = 1
targets = pd.DataFrame(iris.target)

targets.replace(0, -1, inplace=True)
targets.replace(1, 1, inplace=True)
targets.replace(2, 1, inplace=True)

data = data.to_numpy()
labels = targets.to_numpy()

# Perceptron hyper-parameters
ins          = data.shape[-1]
model_params = {
    'input'      : ins,
    'eta'        : 0.5,
    'max_iter'   : 2_000,
    'rand_seed'  : None,
    'ignore_flag': True,
}



'''
iris_history = perceptron_corruption_experiment(
    X               = data,
    y               = labels,
    test_size       = 0.2,
    n_buckets       = 100,
    model_params    = model_params,
    n_runs          = 100,
    seed            = 42,
    verbose         = 2
)
'''
# Testing Purposes
iris_history = perceptron_corruption_experiment(
    X               = data,
    y               = labels,
    test_size       = 0.2,
    n_buckets       = 20,
    model_params    = model_params,
    n_runs          = 10,
    seed            = 42,
    verbose         = 2
)


In [None]:
pd.DataFrame(iris_history).mean().plot(figsize=(10, 5))
plt.grid(True)
plt.title("Buckets vs Zero-One Loss; Iris Separable")
plt.xlabel("Number of Buckets")
plt.ylabel("Zero-One Loss")
plt.xticks(range(0, 101, 5))
plt.yticks(np.linspace(0.5, 1, 14))

In [None]:
pd.DataFrame(iris_history).boxplot(figsize=(15, 7))

### Save results as pickle

In [None]:
import pickle
iris_pickle = {
    'history':    iris_history,
    'n_data':     150, # We know from data-set description. 
    'test_split': 0.2,
    'n_runs':     n_runs,
    'n_buckets':  n_buckets,
    'max_iter':   max_iter,
    'n_attribs':  4+1,
}
with open('iris_results.pkl', 'wb') as pkl:
    pickle.dump(iris_pickle, pkl)
    
#with open('iris_results.pkl', 'rb') as pkl:
#    some_dict = pickle.load(pkl)

In [None]:
def create_parser():
    parser = ArgumentParser(description='CoLT Experiment', formatter_class=RawTextHelpFormatter)
    dataset_help = '''
    Experiment to conduct. There are 4 designed and implemented.
    
    syn-lin: Synthetic Linearly-Separable
    syn-non: Synthetic Non Linearly-Separable
    iris: Iris (Linearly-Separable)
    skin: Skin/No Skin (Non Linearly-Separable)
    '''
    parser.add_argument('-e', '--experiment', type=str, help=dataset_help, required=True)
    lower_bound_help = '''
    Lower bounds per dimension to use when using a synthetic dataset. Shall be a list of float values.
    '''
    parser.add_argument('-l', '--lower_bounds', nargs='+', type=float, default=[-10, -10, -10, -10], help=lower_bound_help,)
    upper_bound_help = '''
    Upper bounds per dimension to use when using a synthetic dataset. Shall be a list of float values.
    '''
    parser.add_argument('-u', '--upper_bounds', nargs='+', type=float, default=[10, 10, 10, 10], help=upper_bound_help, )
    parser.add_argument('--bias', action='store_true', help='Flag for using bias.')
    parser.add_argument('--dataset_size', type=int, help='Number of datapoint to sample for dataset.', required=True)
    #
    parser.add_argument('-t', '--test_fraction', type=float, help='Fraction of whole dataset to use as testing.', default=0.2)
    parser.add_argument('-b', '--n_buckets', type=int, help='Number of buckets to split data into.', default=20)
    parser.add_argument('-r', '--n_runs', type=int, help='Number of times to repeat experiment.', default=10)
    #
    parser.add_argument('--eta', type=float, default=1, help='Learning rate of perceptron.' )
    parser.add_argument('--max_iter', type=int, default=1000, help='Maximum number of Perceptron iterations before convergance is assumed.')
    parser.add_argument('--w_init', nargs='+', type=float, default=[0.5, 0.5], help='Initial weight distribution [lower, upper] bounds.')
    # 
    parser.add_argument('-v', '--verbose',action='store_true', help='Verbosity of messages.' )
    parser.add_argument('-i', '--index', type=int, default=0, help='Inex of experiments. Helpful when running multiple repetitions of same experiment.')
    parser.add_argument('--result_root', type=str, default='.', help='Directory to store results.')
    return parser
    
def obtain_data(args):
    '''Return Data and Labels based on experiment to run'''
    
    experiment = args.experiment
    # Initial distribution of perceptron weights
    w_init_lows, w_init_highs = args.w_init
    
    if experiment == 'syn-lin' or experiment == 'syn-non': # Synthetic data experiment
        '''Generate Data'''
        # Lower and upper bounds for data distribution PER dimension
        # Bias only added if specified!
        lows      = args.lower_bounds + ([1] if args.bias else [])
        highs     = args.upper_bounds + ([1] if args.bias else [])
        assert len(lows) == len(highs), f"upper and lower bounds do not match: {lows} vs {highs}"
        ins       = len(lows) # 4 attributes, 1 bias
        data_size = args.dataset_size
        data   = sample_data(lows, highs, n_samples=data_size, seed=42)
        data   = np.array(data) # Helps in keeping bucket structure
        
        '''Select appropirate concept (linear or non-linear)'''
        if experiment == 'syn-lin': # Use a linearly-separable concept (a perceptron)
            rng      = np.random.default_rng(42) # For reproducibility
            W        = np.concatenate([ rng.uniform(w_init_lows, w_init_highs, (ins, 1)) ])
            truth    = pn.PocketPerceptron()
            truth.pi = truth.W = W
            # Concept is just a wrapper. Do truth.predict for same result
            concept  = Concept(truth) 
        
        elif experiment == 'syn-non': # Use a non-linearly-separable concept (an 'ins'-degree polynomial)
            truth    = NPolynomial(ins, w_init_lows, w_init_highs, 42)
            # Concept is just a wrapper. Do truth.predict for same result
            concept  = Concept(truth) 
        else:
            assert False, f"Invalid experiment selected: {experiment}"
        
        # Assign labels to sampled data
        labels = concept(data)

    elif experiment == 'iris':
        # sklearn's
        iris = datasets.load_iris()
        data = pd.DataFrame(iris.data)
        if args.bias:
            data['bias'] = 1
        targets = pd.DataFrame(iris.target)

        # Separate separable and non-separable flowers
        targets.replace(0, -1, inplace=True)
        targets.replace(1, 1, inplace=True)
        targets.replace(2, 1, inplace=True)

        data = data.to_numpy()
        labels = targets.to_numpy()
    
    elif experiment == 'skin':
        # dataset location manually selected (change if needed)
        data = arff.loadarff('./datasets/skinNoSkin.arff')
        skin = pd.DataFrame(data[0])
        if args.bias:
            skin['bias'] = 1

        # Data cleaning
        skin.replace(b'1', -1, inplace=True)
        skin.replace(b'2', 1, inplace=True)
        data = skin.drop('Class', axis=1).assign(bias=1)
        
        data = data.to_numpy()
        labels = skin.Class
    else:
        assert False, f"Invalid experiment selected: {experiment}"
    
    return data, labels

In [None]:
if __name__ == '__main__' and '__file__' in globals():
    # Do stuff ONLY if this is a script. Not Jupyter notebook.
    parser = create_parser()
    args = parser.parse_args()
    
    # Check if output file already exists  
    fname_out = f'{args.result_root}/{args.experiment}_{args.index}_results.pkl'
    if os.path.exists(fname_out):                                               
            # Results file does exist: exit                                     
            print("File %s already exists"%fname_out)                           
            exit() 
    
    # Select dataset to use.
    data, labels = obtain_data(args)
    
    # Perceptron learning hyper-parameters
    model_params = {
        'input'      : ins,
        'eta'        : args.eta,
        'max_iter'   : args.max_iter,
        'rand_seed'  : None,
        'ignore_flag': False,
    }
    
    # Experiment Execution
    history = perceptron_corruption_experiment(
        X               = data,
        y               = labels,
        test_size       = args.test_fraction,
        n_buckets       = args.n_buckets,
        model_params    = model_params,
        n_runs          = args.n_runs,
        seed            = 42,
        verbose         = args.verbose
    )
    
    # Save experiment
    pickle = {
        'history':    history,
        'n_data':     data.shape[0], # We know from data-set description. 
        'test_split': args.test_fraction,
        'n_runs':     args.n_runs,
        'n_buckets':  args.n_buckets,
        'max_iter':   args.max_iter,
        'n_attribs':  data.shape[1],
    }
    with open(fname_out, 'wb') as pkl:
        pickle.dump(pickle, pkl)
        
        



# Dataset Generation

In [153]:
# Experiment required functions
def sample_data(
    lows:      List[float],
    highs:     List[float],
    n_samples: int,
    seed:      int=None
) -> List[List[int]]:
    """Sample uniform distribution bounded by lows and highs
    
        Using a uniform distribution, perform sampling over the 
    distribution such that the space the distribution is sampling will 
    be bounded by the given bounds from the lows and highs. Lows and 
    highs will be arrays that contain the minimum and maximum values 
    per dimension on the data to be samples. For example, if we have 4 
    values in both lows and highs, then, at the time of sampling n_samples
    samples we will have n_samples of 4 attributes each: (n_samples, 4).
    """

    assert len(lows) == len(highs), f"Non-matching lows and highs: {len(lows) != {len(highs)}}"

    rng = np.random.default_rng(seed)
    data_shape = (n_samples, len(lows)) # See assertion #1
    data = rng.uniform(lows, highs, data_shape)
    return data

# splitting the dataset into bins can be done with: np.split(data, n_buckets)
# Recommend shuffling beforehand tho.

# splitting the dataset into bins can be done with: np.split(data, n_buckets)
# Recommend shuffling beforehand tho.

class Concept:
    """Label given data
    Using a model as truth, label given data.
    """
    def __init__(self, model):
        self.model = model

    def __call__(self, X):
        return self.model.solve(X)


class NPolynomial:

    def __init__(self,
                 n:    int,
                 low:  float=0,
                 high: float=1,
                 seed: int=42
                ):
        self.n     = n
        self.seed  = seed
        self.low   = low
        self.high  = high
        rng        = np.random.default_rng(seed)
        self.coeff = rng.uniform(low, high, (n, 1))
        self.exps  = [exp for exp in range(1, n+1)[::-1]]

    def solve(self, vals):
        var = np.power(vals, self.exps)
        activation = np.sign(var @ self.coeff)
        activation[activation == 0] = -1
        return activation



## Synthetic Linearly Separable 

In [231]:

'''Generate Data'''
# Lower and upper bounds for data distribution PER dimension
# Bias only added if specified!
lows      = [-10, -10, -10, -10, 1]
highs     = [10, 10, 10, 10, 1]

assert len(lows) == len(highs), f"upper and lower bounds do not match: {lows} vs {highs}"

ins          = len(lows) # 4 attributes, 1 bias
data_size    = 2_500
data         = sample_data(lows, highs, n_samples=data_size, seed=42)
data         = np.array(data) # Helps in keeping bucket structure
w_init_lows  = -10
w_init_highs = 10

rng      = np.random.default_rng(42) # For reproducibility
W        = np.concatenate([ rng.uniform(w_init_lows, w_init_highs, (ins, 1)) ])
truth    = pn.PocketPerceptron()
truth.pi = truth.W = W
# Concept is just a wrapper. Do truth.predict for same result
concept  = Concept(truth)


labels = concept(data)

In [232]:
syn_lin_data = {
    'X': data,
    'y': labels
}

with open('datasets/syn_lin_data.pkl', 'wb') as jar:
    pickle.dump(syn_lin_data, jar)

In [233]:
pd.DataFrame(syn_lin_data['y']).value_counts()

-1.0    1417
 1.0    1083
dtype: int64

In [238]:
with open('datasets/syn_lin_data.pkl', 'rb') as jar:
    pikl = pickle.load(jar)

In [241]:
pikl['X']

array([[ 5.47912097, -1.2224312 ,  7.1719584 ,  3.94736058,  1.        ],
       [ 9.51244703,  5.22279404,  5.72128611, -7.43772735,  1.        ],
       [-2.58403952,  8.53529978,  2.8773024 ,  6.45523227,  1.        ],
       ...,
       [ 3.07051319, -6.39545129, -1.25019812,  7.61164949,  1.        ],
       [-9.99750187, -4.84982576, -0.24915102,  6.42556087,  1.        ],
       [ 8.38126938, -0.88871137, -2.18369554, -5.37163567,  1.        ]])

## Synthetic Non-Linearly Separable

In [234]:
'''Generate Data'''
# Lower and upper bounds for data distribution PER dimension
# Bias only added if specified!
lows      = [-10, -10, -10, -10, 1]
highs     = [10, 10, 10, 10, 1]

assert len(lows) == len(highs), f"upper and lower bounds do not match: {lows} vs {highs}"

ins       = len(lows) # 4 attributes, 1 bias
data_size = 2_500
data   = sample_data(lows, highs, n_samples=data_size, seed=42)
data   = np.array(data) # Helps in keeping bucket structure
w_init_lows  = -10
w_init_highs = 10

truth    = NPolynomial(ins, w_init_lows, w_init_highs, 42)
# Concept is just a wrapper. Do truth.predict for same result
concept  = Concept(truth)

labels = concept(data)

In [235]:
syn_non_data = {
    'X': data,
    'y': labels
}

with open('datasets/syn_non_data.pkl', 'wb') as jar:
    pickle.dump(syn_non_data, jar)

In [236]:
pd.DataFrame(syn_non_data['y']).value_counts()

-1.0    1491
 1.0    1009
dtype: int64

## Skin Segmentation Dataset

In [3]:
skin = pd.read_csv('datasets/skinNoSkin.csv')

In [4]:
skin.replace(1, -1, inplace=True)
skin.replace(2, 1, inplace=True)
skin.Class.value_counts()

 1    194198
-1     50859
Name: Class, dtype: int64

In [222]:
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(
            n_splits=1,
            train_size=2500,
            random_state=42 # This way data is shuffled differently every run!
        )

X = skin.drop('Class', axis=1)
y = skin.Class

for train_i, test_i in sss.split(X, y):
            skin_data, skin_labels = X.iloc[train_i], y.iloc[train_i]
            test_data, test_labels = X.iloc[test_i], y.iloc[test_i]



In [224]:
skin_data['Class'] = skin_labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  skin_data['Class'] = skin_labels


In [228]:
skin_data.reset_index(drop=True)

Unnamed: 0,V1,V2,V3,Class
0,169.0,185.0,238.0,-1
1,180.0,177.0,132.0,1
2,63.0,68.0,23.0,1
3,170.0,166.0,118.0,1
4,127.0,167.0,220.0,-1
...,...,...,...,...
2495,86.0,131.0,212.0,-1
2496,141.0,140.0,89.0,1
2497,161.0,165.0,130.0,1
2498,179.0,176.0,131.0,1


In [229]:
skin_data.to_csv('datasets/skin_2500.csv')

In [5]:
skin

Unnamed: 0,V1,V2,V3,Class
0,74.0,85.0,123.0,-1
1,73.0,84.0,122.0,-1
2,72.0,83.0,121.0,-1
3,70.0,81.0,119.0,-1
4,70.0,81.0,119.0,-1
...,...,...,...,...
245052,163.0,162.0,112.0,1
245053,163.0,162.0,112.0,1
245054,163.0,162.0,112.0,1
245055,163.0,162.0,112.0,1


In [6]:
skin.to_csv('datasets/skinClean.csv')

In [230]:
!ls datasets/

data_banknote_authentication.csv  skin_2500.csv     skinNoSkin.arff
dkinNoSkin.csv			  Skin_NonSkin.txt  skinNoSkin.csv


# Data Generation