In [1]:
def _sniff_tsv_compression(path):
    """Return the ``compression`` argument to use for a bare ``.tsv`` path.

    Earlier versions of ``loadData`` always read ``.tsv`` files as gzip. To keep
    those files loading while also accepting plain-text TSVs, the first two
    bytes are compared against the gzip magic number.

    :param path: path whose extension is ``.tsv``
    :return: ``'gzip'`` when the file starts with the gzip magic bytes or cannot
        be opened as a local file (historic behaviour is kept in that case),
        otherwise ``None`` (no decompression)
    """
    try:
        with open(path, 'rb') as handle:
            magic = handle.read(2)
    except OSError:
        # Not a readable local file: fall back to the historic assumption
        # and let pandas report any real problem.
        return 'gzip'
    return 'gzip' if magic == b'\x1f\x8b' else None


def loadData(dataset):
    """
    Generic function to read various file formats. Currently supports XLS(X), CSV and TSV. CSV and TSV files
    may additionally carry a compression suffix (``.gz``, ``.bz2``, ``.zip``, ``.xz``), in which case pandas
    infers the codec from that suffix. A bare ``.tsv`` file is read as gzip when it starts with the gzip magic
    bytes and as plain text otherwise. Extension matching is case-insensitive.

    The rows are returned in a shuffled order that is fixed by ``random_state=7388``; the original index
    labels are kept.

    :param dataset: path of the file to load
    :return: the dataset as pandas DataFrame
    :raises NotImplementedError: if the file extension is not one of the supported formats
    """
    # TODO pass in parameters to allow for the fractional dataset.

    stem, extension = os.path.splitext(dataset)
    extension = extension.lower()

    # "data.csv.gz" -> the format is decided by the inner ".csv" suffix.
    wrapped = extension in ('.gz', '.bz2', '.zip', '.xz')
    if wrapped:
        extension = os.path.splitext(stem)[1].lower()

    if extension in ('.xlsx', '.xls') and not wrapped:
        data = pd.read_excel(dataset)
    elif extension == '.csv':
        # encoding / low_memory are reader options; they must not be passed to
        # DataFrame.sample(), which rejects them with a TypeError.
        data = pd.read_csv(dataset, encoding='ISO-8859-1', low_memory=False)
    elif extension == '.tsv':
        compression = 'infer' if wrapped else _sniff_tsv_compression(dataset)
        data = pd.read_csv(dataset, sep='\t', compression=compression)
    else:
        raise NotImplementedError(
            'Unsupported file extension: {!r}'.format(os.path.splitext(dataset)[1]))

    # frac=1.0 without replacement == deterministic shuffle of all rows.
    return data.sample(frac=1.0, replace=False, random_state=7388)

In [2]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Compute the multi-class logarithmic loss.

    Parameters
    ----------
    :param actual: Either a 1-D array of class indices (each value is used as a
        column index into ``predicted``) or an already one-hot encoded 2-D array
    :param predicted: 2-D array of predicted probabilities, one column per class
    :param eps: Probabilities are clipped to ``[eps, 1 - eps]`` before taking
        the logarithm, so that ``log(0)`` is never evaluated

    Returns
    -------
    fitness: float
        Negative sum of ``actual * log(clipped predicted)`` divided by the
        number of rows in ``actual``
    """
    # One-hot encode 1-D labels; 2-D input is used as given.
    if len(actual.shape) == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        for row, label in zip(one_hot, actual):
            # ``row`` is a view into ``one_hot``, so this writes in place.
            row[label] = 1
        actual = one_hot

    n_rows = actual.shape[0]
    safe_probs = np.clip(predicted, eps, 1 - eps)
    log_likelihood = np.sum(actual * np.log(safe_probs))
    return -1.0 / n_rows * log_likelihood

In [3]:
def balanced_accuracy_score(y_true, y_pred):
    """Default scoring function: balanced accuracy
    Balanced accuracy computes each class' accuracy on a per-class basis using a
    one-vs-rest encoding, then computes an unweighted average of the class accuracies.
    The per-class accuracy is the mean of that class' sensitivity and specificity;
    a rate whose denominator is zero (the class never occurs in `y_true`, or is the
    only class in `y_true`) counts as 0.
    Parameters
    ----------
    y_true: array-like {n_samples}
        Array containing the y_true target classes
    y_pred: array-like {n_samples}
        Array containing the predicted class label of each sample
    Returns
    -------
    fitness: float
        Returns a float value indicating the `individual`'s balanced accuracy
        0.5 is as good as chance, and 1.0 is perfect predictive accuracy.
        Empty input yields NaN (mean over zero classes).
    Raises
    ------
    ValueError
        If `y_true` and `y_pred` have shapes that cannot be compared elementwise
    """
    # Coerce to arrays so that `== this_class` is elementwise for lists too;
    # on plain lists it is a single bool and every rate silently became 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    all_class_accuracies = []
    for this_class in np.unique(np.append(y_true, y_pred)):
        truth_is_class = y_true == this_class
        pred_is_class = y_pred == this_class

        n_positive = np.count_nonzero(truth_is_class)
        n_negative = y_true.size - n_positive

        # Explicit zero-denominator checks replace the former bare `except:`.
        if n_positive:
            this_class_sensitivity = \
                np.count_nonzero(pred_is_class & truth_is_class) / float(n_positive)
        else:
            this_class_sensitivity = 0.
        if n_negative:
            this_class_specificity = \
                np.count_nonzero(~pred_is_class & ~truth_is_class) / float(n_negative)
        else:
            this_class_specificity = 0.

        all_class_accuracies.append(
            (this_class_sensitivity + this_class_specificity) / 2.)

    return np.mean(all_class_accuracies)

In [None]:
# Split features and labels (both cast to str) into train / validation parts.
# test_size=0.1 with shuffling; the fixed random_state keeps the split
# reproducible. NOTE(review): presumably scikit-learn's train_test_split,
# i.e. a 10% validation hold-out -- confirm against the notebook's imports.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    X.astype(str),
    y.astype(str),
    test_size=0.1,
    shuffle=True,
    random_state=7388,
)