Skip to content

Commit

Permalink
IGNITE-12168: [ML] Flaky ML example tests (#6866)
Browse files Browse the repository at this point in the history
  • Loading branch information
zaleslaw committed Sep 13, 2019
1 parent ee995b4 commit 4b8b7ff
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 34 deletions.
Expand Up @@ -22,50 +22,50 @@
*/
public enum MLSandboxDatasets {
/** Movielens dataset with ratings. */
MOVIELENS("ratings.csv", true, ","),
MOVIELENS("modules/ml/src/main/resources/datasets/ratings.csv", true, ","),

/** The full Iris dataset from Machine Learning Repository. */
IRIS("iris.txt", false, "\t"),
IRIS("modules/ml/src/main/resources/datasets/iris.txt", false, "\t"),

/** The Titanic dataset from Kaggle competition. */
TITANIC("titanic.csv", true, ";"),
TITANIC("modules/ml/src/main/resources/datasets/titanic.csv", true, ";"),

/** The 1st and 2nd classes from the Iris dataset. */
TWO_CLASSED_IRIS("two_classed_iris.csv", false, "\t"),
TWO_CLASSED_IRIS("modules/ml/src/main/resources/datasets/two_classed_iris.csv", false, "\t"),

/** The dataset is about different computers' properties based on https://archive.ics.uci.edu/ml/datasets/Computer+Hardware. */
CLEARED_MACHINES("cleared_machines.csv", false, ";"),
CLEARED_MACHINES("modules/ml/src/main/resources/datasets/cleared_machines.csv", false, ";"),

/**
* The health data is related to death rate based on doctor availability, hospital availability,
* annual per capita income, and population density people per square mile.
*/
MORTALITY_DATA("mortalitydata.csv", false, ";"),
MORTALITY_DATA("modules/ml/src/main/resources/datasets/mortalitydata.csv", false, ";"),

/**
* The preprocessed Glass dataset from the Machine Learning Repository https://archive.ics.uci.edu/ml/datasets/Glass+Identification
* There are 3 classes with labels: 1 {building_windows_float_processed}, 3 {vehicle_windows_float_processed}, 7 {headlamps}.
* Feature names: 'Na-Sodium', 'Mg-Magnesium', 'Al-Aluminum', 'Ba-Barium', 'Fe-Iron'.
*/
GLASS_IDENTIFICATION("glass_identification.csv", false, ";"),
GLASS_IDENTIFICATION("modules/ml/src/main/resources/datasets/glass_identification.csv", false, ";"),

/** The Wine recognition data. Could be found <a href="https://archive.ics.uci.edu/ml/machine-learning-databases/wine/">here</a>. */
WINE_RECOGNITION("wine.txt", false, ","),
WINE_RECOGNITION("modules/ml/src/main/resources/datasets/wine.txt", false, ","),

/** The Boston house-prices dataset. Could be found <a href="https://archive.ics.uci.edu/ml/machine-learning-databases/housing/">here</a>. */
BOSTON_HOUSE_PRICES("boston_housing_dataset.txt", false, ","),
BOSTON_HOUSE_PRICES("modules/ml/src/main/resources/datasets/boston_housing_dataset.txt", false, ","),

/** Example from book Barber D. Bayesian reasoning and machine learning. Chapter 10. */
ENGLISH_VS_SCOTTISH("english_vs_scottish_binary_dataset.csv", true, ","),
ENGLISH_VS_SCOTTISH("modules/ml/src/main/resources/datasets/english_vs_scottish_binary_dataset.csv", true, ","),

/** Wholesale customers dataset. Could be found <a href="https://archive.ics.uci.edu/ml/datasets/Wholesale+customers">here</a>. */
WHOLESALE_CUSTOMERS("wholesale_customers.csv", true, ","),
WHOLESALE_CUSTOMERS("modules/ml/src/main/resources/datasets/wholesale_customers.csv", true, ","),

/** Fraud detection problem [part of whole dataset]. Could be found <a href="https://www.kaggle.com/mlg-ulb/creditcardfraud/">here</a>. */
FRAUD_DETECTION("fraud_detection.csv", false, ","),
FRAUD_DETECTION("modules/ml/src/main/resources/datasets/fraud_detection.csv", false, ","),

/** A dataset with discrete and continuous features. */
MIXED_DATASET("mixed_dataset.csv", true, ",");
MIXED_DATASET("modules/ml/src/main/resources/datasets/mixed_dataset.csv", true, ",");

/** Filename. */
private final String filename;
Expand Down
Expand Up @@ -17,6 +17,7 @@

package org.apache.ignite.ml.util;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Paths;
Expand All @@ -31,13 +32,10 @@
import org.apache.ignite.IgniteCache;
import org.apache.ignite.cache.affinity.rendezvous.RendezvousAffinityFunction;
import org.apache.ignite.configuration.CacheConfiguration;
import org.apache.ignite.internal.util.typedef.internal.A;
import org.apache.ignite.internal.util.IgniteUtils;
import org.apache.ignite.ml.math.exceptions.knn.FileParsingException;
import org.apache.ignite.ml.math.primitives.vector.Vector;
import org.apache.ignite.ml.math.primitives.vector.VectorUtils;
import org.springframework.core.io.Resource;
import org.springframework.core.io.support.PathMatchingResourcePatternResolver;
import org.springframework.core.io.support.ResourcePatternResolver;

/**
* Common utility code used in some ML examples to set up test cache.
Expand All @@ -46,10 +44,6 @@ public class SandboxMLCache {
/** */
private final Ignite ignite;

/** Resource resolver. */
private static final ResourcePatternResolver RESOURCE_RESOLVER =
new PathMatchingResourcePatternResolver(SandboxMLCache.class.getClassLoader());

/** */
public SandboxMLCache(Ignite ignite) {
this.ignite = ignite;
Expand All @@ -74,6 +68,7 @@ public IgniteCache<Integer, double[]> fillCacheWith(double[][] data) {
return cache;
}


/**
* Loads dataset as a list of rows.
*
Expand All @@ -84,10 +79,15 @@ public IgniteCache<Integer, double[]> fillCacheWith(double[][] data) {
public List<String> loadDataset(MLSandboxDatasets dataset) throws IOException {
List<String> res = new ArrayList<>();

Resource[] resources = RESOURCE_RESOLVER.getResources("classpath*:*/" + dataset.getFileName());
A.ensure(resources.length == 1, "Cannot find resource");
String fileName = dataset.getFileName();

File file = IgniteUtils.resolveIgnitePath(fileName);

if (file == null)
throw new FileNotFoundException(fileName);

Scanner scanner = new Scanner(file);

Scanner scanner = new Scanner(resources[0].getInputStream());
if (dataset.hasHeader() && scanner.hasNextLine())
scanner.nextLine();

Expand All @@ -99,26 +99,31 @@ public List<String> loadDataset(MLSandboxDatasets dataset) throws IOException {
return res;
}


/**
* Fills cache with data and returns it.
*
* @param dataset The chosen dataset.
* @return Filled Ignite Cache.
* @throws FileNotFoundException If file not found.
*/
public IgniteCache<Integer, Vector> fillCacheWith(MLSandboxDatasets dataset) throws IOException {
public IgniteCache<Integer, Vector> fillCacheWith(MLSandboxDatasets dataset) throws FileNotFoundException {

IgniteCache<Integer, Vector> cache = getCache();

String fileName = dataset.getFileName();
Resource[] resources = RESOURCE_RESOLVER.getResources("classpath*:*/" + fileName);
A.ensure(resources.length == 1, "Cannot find resource");

Scanner scanner = new Scanner(resources[0].getInputStream());
File file = IgniteUtils.resolveIgnitePath(fileName);

if (file == null)
throw new FileNotFoundException(fileName);

Scanner scanner = new Scanner(file);

int cnt = 0;
while (scanner.hasNextLine()) {
String row = scanner.nextLine();
if (dataset.hasHeader() && cnt == 0) {
if(dataset.hasHeader() && cnt == 0) {
cnt++;
continue;
}
Expand All @@ -129,11 +134,9 @@ public IgniteCache<Integer, Vector> fillCacheWith(MLSandboxDatasets dataset) thr
NumberFormat format = NumberFormat.getInstance(Locale.FRANCE);

for (int i = 0; i < cells.length; i++)
try {
if (cells[i].equals(""))
data[i] = Double.NaN;
else
data[i] = Double.valueOf(cells[i]);
try{
if(cells[i].equals("")) data[i] = Double.NaN;
else data[i] = Double.valueOf(cells[i]);
} catch (java.lang.NumberFormatException e) {
try {
data[i] = format.parse(cells[i]).doubleValue();
Expand Down

0 comments on commit 4b8b7ff

Please sign in to comment.