In [None]:
import sklearn.preprocessing

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Feature matrices of both splits (semicolon-separated CSV files).
X_train = pd.read_csv(r"data/training_dataset.csv", sep=";")
X_test = pd.read_csv(r"data/test_dataset.csv", sep=";")

# Labels for the training split. Because `names=` is given without `header=`,
# pandas reads every line of this file as data and uses the training feature
# names as the column labels.
Y_train = pd.read_csv(
    r"data/training_solution.csv",
    sep=";",
    names=X_train.columns,
)

# Both splits must expose the same columns in the same order.
assert np.array_equal(X_train.columns, X_test.columns)

# Independent copy of the shared column index.
cols = X_train.columns.copy()

## RQ: How can the data be described?

Check the summary statistics (count, mean, standard deviation, and the five-number summary: min, quartiles, max) for each feature.

In [None]:
X_train.describe()  # per-column summary statistics of the train set

In [None]:
Y_train.sum(axis=0)  # column-wise sum of the labels, i.e. number of failure labelings per feature (assumes 0/1 labels -- TODO confirm)

In [None]:
X_test.describe()  # per-column summary statistics of the test set

## RQ: Is the test data sampled from the same distribution?

Compare the raw values and histograms of the train and test sets to see whether each feature appears to be drawn from the same distribution.

In [None]:
# Compare train and test per feature, one figure per feature:
# left panel = overlaid histograms, right panel = raw values in row order.
for col in X_train.columns:
    fig, (ax_hist, ax_raw) = plt.subplots(1, 2, figsize=(25, 5))

    # (1) histogram -- the bin edges are computed over both splits together so
    # that the two histograms share the same bins and are directly comparable
    bins = np.histogram_bin_edges(np.concatenate([X_train[col], X_test[col]]), bins=50)
    ax_hist.hist(X_train[col], alpha=0.5, bins=bins, label="train")
    ax_hist.hist(X_test[col], alpha=0.5, bins=bins, label="test")

    ax_hist.legend()
    ax_hist.set_title("{} histogram".format(col))

    # (2) raw values, plotted against the row index
    ax_raw.plot(X_train[col], label="train")
    ax_raw.plot(X_test[col], label="test")

    # Shade the area under the training curve for the rows flagged in Y_train.
    # The full x range is passed together with a boolean `where=` mask so that
    # only contiguous flagged rows are filled. Passing just the flagged
    # positions as x would make matplotlib connect separate failure runs and
    # shade the non-failure rows in between.
    # NOTE: a run consisting of a single flagged row has zero width and is
    # therefore not visible as a band.
    errorband = np.where(Y_train[col])[0]
    if len(errorband) > 0:
        is_failure = np.zeros(len(X_train), dtype=bool)
        is_failure[errorband] = True
        ax_raw.fill_between(X_train.index, 0, X_train[col], where=is_failure,
                            facecolor='yellow', alpha=0.5, label="failures")

    ax_raw.legend()
    ax_raw.set_title("{} raw values".format(col))

    plt.show()

## RQ: Are the features correlated?

Plot the raw train and test values (min-max scaled and stacked per feature) to visually check for correlations.

In [None]:
def visualize_correlation(x, name=""):
    """Draw every column of ``x`` as its own horizontal lane in one figure.

    Each column is passed through ``sklearn.preprocessing.minmax_scale`` and
    shifted upwards by ``i + 0.5``, where ``i`` is its 0-based position, so
    the lanes are stacked above each other and lane ``i`` sits around y-tick
    ``i + 1``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame whose columns are plotted against the row position.
    name : str, optional
        Suffix appended to the figure title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(35, 15))

    for lane, feature in enumerate(x.columns):
        scaled = sklearn.preprocessing.minmax_scale(x[feature])
        ax.plot(scaled + lane + 0.5, label=feature)

    # One y tick per lane, one x tick every 20 rows.
    n_features = len(x.columns)
    ax.set_yticks(np.arange(1, n_features + 1))
    ax.set_xticks(np.arange(len(x), step=20))

    # Vertical grid lines, drawn below the curves.
    ax.set_axisbelow(True)
    ax.grid(axis="x")

    ax.legend()
    ax.set_title("feature correlation {}".format(name))
    plt.show()
    
# Same lane plot for the train split first, then the test split.
for _frame, _tag in ((X_train, "train"), (X_test, "test")):
    visualize_correlation(_frame, _tag)

In [None]:
def visualize_correlation_coefficients(x, name=""):
    """Show the absolute pairwise column correlation of ``x`` as a heatmap.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame whose columns are correlated with ``x.corr()``.
    name : str, optional
        Suffix appended to the figure title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(10, 10))

    # An absolute correlation coefficient lies in [0, 1]. Pinning the colour
    # limits to that range (instead of autoscaling to the data of each call)
    # gives every heatmap the same colour scale, so plots produced for
    # different frames can be compared directly.
    im = ax.imshow(np.abs(x.corr()), cmap="viridis", vmin=0, vmax=1)

    # Label both axes with the 1-based column position rather than the name.
    ticks = np.arange(0, len(x.columns))
    labels = ticks + 1
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)

    ax.set_title("absolute feature correlation {}".format(name))
    fig.colorbar(im, ax=ax)
    plt.show()
    
# Same heatmap for the train split first, then the test split.
for _frame, _tag in ((X_train, "train"), (X_test, "test")):
    visualize_correlation_coefficients(_frame, _tag)