# Developing a Quality Assurance Classifier for Engineering Corps ;-)

In [143]:
#Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk

## Load data into Pandas to understand what we are dealing with.

In [23]:
# Load the class labels (with their timestamps) and the raw sensor readings.
# Both files are space-delimited and are read without a header row.
# `infer_datetime_format=True` was removed: without `parse_dates` it never had any
# effect, and the keyword is deprecated in pandas >= 2.0. The 'Date and Time'
# column is therefore still loaded as plain strings, exactly as before.
labels = pd.read_csv('labels.data', names=['labels', 'Date and Time'], sep=' ')
measurements = pd.read_csv('measurements.data', header=None, sep=' ')
labels.head()

Unnamed: 0,labels,Date and Time
0,-1,19/07/2008 11:55:00
1,-1,19/07/2008 12:32:00
2,1,19/07/2008 13:17:00
3,-1,19/07/2008 14:43:00
4,-1,19/07/2008 15:22:00


### Above we have the labels for failure, as well as the date and time of each reading. Both were loaded successfully.

In [22]:
# Preview the first five rows of the raw sensor readings.
measurements.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,580,581,582,583,584,585,586,587,588,589
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,,,0.5005,0.0118,0.0035,2.363,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,0.006,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,0.0044,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432


### It should be noted that:
1. There are 590 columns of sensor data here, which does not match the 591 different sensors stated in the brief sheet. Upon visual inspection of the data file I cannot see any discrepancy in the way pandas has loaded it.
2. The data is currently expressed in many different scales and so feature scaling will be required, prior to any machine learning.
3. There are missing (NaN) values in the sensor data that must be accounted for.

In [29]:
# Sanity-check the sensor table: report its shape and confirm every column is float64.
n_samples = measurements.shape[0]
print('The size of the measurements dataframe is ' + str(measurements.shape))
float_check = measurements.dtypes == np.float64
print('The size of float_check is ' + str(float_check.shape) + " and the sum of columns that are floats are " + str(float_check.sum()))
# Verify the conclusion before stating it, so the message below can never be
# printed for data that does not actually support it.
assert float_check.all(), 'Non-float columns found: ' + str(list(float_check[~float_check].index))
print('Meaning all sensor data in the measurements file is numeric and there are ' + str(n_samples) + ' samples per sensor.')

The size of the measurements dataframe is (1567, 590)
The size of float_check is (590,) and the sum of columns that are floats are 590
Meaning all sensor data in the measurements file is numeric and there are 1567 samples per sensor.


In [33]:
# Report the shape and dtypes of the labels table.
print('The size of the labels dataframe is ' + str(labels.shape))
print('The data types contained in the labels dataframe is \n' + str(labels.dtypes))
# Actually perform the sanity check that the message below describes: one label
# row per row of sensor readings.
assert labels.shape[0] == measurements.shape[0], (
    'Row count mismatch: ' + str(labels.shape[0]) + ' labels vs '
    + str(measurements.shape[0]) + ' measurement rows'
)
print('The number of sensor samples in labels, matches the samples in measurements which is a good sanity check. It also means each timestamp corresponds to the sensor measurements.')

The size of the labels dataframe is (1567, 2)
The data types contained in the labels dataframe is 
labels            int64
Date and Time    object
dtype: object
The number of sensor samples in labels, matches the samples in measurements which is a good sanity check. It also means each timestamp corresponds to the sensor measurements.


In [37]:
# Quantify how much of the sensor table is missing (NaN).
missing_count = measurements.isnull().sum(axis=0).sum()  # per-column NaN counts, then grand total
total_count = measurements.values.size
print('Missing values = ' + str(missing_count))
print('Total values = ' + str(total_count))
percentage_missing = 100 * missing_count / total_count
print('Percentage values missing = ' + str(percentage_missing) + '%')

Missing values = 41951
Total values = 924530
Percentage values missing = 4.537548808583821%


## Note:
1. 4.5% of all the data is missing.
2. I will make an assumption to replace this missing data with the mean values in each column.
3. I will do this after mean normalisation so as to not affect the standard deviation.
4. I believe this will not affect the results too much because I will be mean-normalising the data anyway.

# Mean Normalisation

In [40]:
# Take an independent, C-ordered copy of the readings so later array operations
# cannot write through to the `measurements` DataFrame.
sensor_data = np.array(measurements.values, copy=True, order='C')

In [56]:
def mean_normalisation(X):
    """Standardise each column of X to zero mean and unit variance, ignoring NaNs.

    Parameters
    ----------
    X : array-like
        Data to scale; statistics are taken along axis 0 (one value per column).
        NaN entries are skipped when computing the statistics and remain NaN in
        the output.

    Returns
    -------
    mu : ndarray
        Per-column mean, computed with ``np.nanmean``.
    sigma : ndarray
        Per-column standard deviation, computed with ``np.nanstd`` (ddof=0).
        This is the raw statistic, so it is 0 for constant columns.
    X_norm : ndarray
        ``(X - mu) / sigma``. Columns whose sigma is 0 are divided by 1 instead,
        so they come out as 0 (already centred) rather than NaN.

    Notes
    -----
    A column that is entirely NaN still yields NaN for mu, sigma and X_norm.
    """
    X = np.asarray(X)
    mu = np.nanmean(X, axis=0)
    sigma = np.nanstd(X, axis=0)
    # A constant column has sigma == 0; dividing by it would give 0/0 = NaN and a
    # RuntimeWarning. Dividing by 1 leaves the centred values (all zeros) intact.
    safe_sigma = np.where(sigma == 0, 1.0, sigma)
    X_norm = (X - mu) / safe_sigma
    return mu, sigma, X_norm

In [89]:
# Standardise every sensor column, keeping the per-column statistics that were used.
mu, sigma, sensor_data_norm = mean_normalisation(sensor_data)

# Impute: every NaN left in the scaled array is set to 0, which on the
# standardised scale is the column mean.
missing_mask = np.isnan(sensor_data_norm)
nan_rows, nan_cols = np.nonzero(missing_mask)
sensor_data_norm[missing_mask] = 0
print(sensor_data_norm)

[[ 0.22387917  0.8478245  -0.43431977 ...  0.          0.
   0.        ]
 [ 1.10501484 -0.38205392  1.01258264 ...  0.41172174  0.25004455
   1.15631999]
 [-1.11202304  0.79731564 -0.47913456 ...  3.62590582  3.32035899
  -0.17909141]
 ...
 [-0.48429031 -1.44398414  0.19498206 ... -0.89439547 -0.97110324
  -0.59818675]
 [-1.62412795  0.44984948 -0.79698678 ...  0.91145354  0.7733936
  -0.06579841]
 [-0.94476352 -0.56094985 -0.17295887 ... -0.03122236 -0.2733045
   0.40606805]]


  after removing the cwd from sys.path.


# Extract and process labels

In [67]:
# Re-encode the raw labels as 0/1: strictly positive values become 1, everything
# else (negative or zero) becomes 0. The original integer dtype is kept.
# NOTE(review): an earlier markdown cell describes these as labels "for failure",
# while this cell reports the positive class as "success" - confirm which is meant.
raw_labels = labels['labels'].values
sensor_labels = (raw_labels > 0).astype(raw_labels.dtype)
print(sensor_labels)
# Percentage of samples in the positive class.
success_rate = 100 * sensor_labels.sum() / sensor_labels.size
print('%success = ' + str(success_rate))

[0 0 1 ... 0 0 0]
%success = 6.636885768985322


## Note:
1. The labelled data is very skewed: only about 6.6% of the samples are successes. This will make a typical classifier hard to train, and extra care must be taken when assessing precision and recall.
2. It could be worth removing the successes and training an anomaly detection algorithm instead, with success as the anomaly.

# Start with a Supervised Learning Problem
Let's split the data set into train, test and cross-validation sets, ensuring there is an even proportion of successes in each set.

In [91]:
# NOTE(review): this whole cell is commented out, so none of the names it would
# define (sensor_data_norm_shuffled, sensor_labels_shuffled, success_locs,
# failure_locs) exist when the notebook is run top to bottom on a fresh kernel.
# NOTE(review): no random seed is set before np.random.shuffle, so the split would
# differ on every run if this code were re-enabled.
# Shuffle the sensor data and labels, note you are ignoring effects of time.
# idx = np.arange(sensor_data_norm.shape[0])
# np.random.shuffle(idx)
# sensor_data_norm_shuffled = sensor_data_norm[idx, :]
# sensor_labels_shuffled = sensor_labels[idx]

# success_locs = np.where(sensor_labels_shuffled == 1)
# failure_locs = np.where(sensor_labels_shuffled == 0)
# success_locs = success_locs[0]
# failure_locs = failure_locs[0]



In [132]:
# num_success_trainset = np.floor(success_locs.shape[0]*0.6).astype(int)
# num_success_cross_val = np.floor(success_locs.shape[0]*0.2).astype(int)

# num_failure_trainset = np.floor(failure_locs.shape[0]*0.6).astype(int)
# num_failure_cross_val = np.floor(failure_locs.shape[0]*0.2).astype(int)
# print(num_success_trainset + num_failure_trainset)
# print(failure_locs[:num_failure_trainset].shape)

939
(877,)


In [138]:
# trainset_X = sensor_data_norm_shuffled[success_locs[:num_success_trainset], :]
# #print(sensor_data_norm_shuffled[failure_locs[:num_failure_trainset], :].shape)
# trainset_X = np.vstack((trainset_X, sensor_data_norm_shuffled[failure_locs[:num_failure_trainset], :]))
# print(trainset_X.shape)

(939, 590)


In [142]:
# NOTE(review): two problems to fix before re-enabling this cell:
#   1. `num_failure_cross` is not defined in any visible cell; the earlier cell
#      defines `num_failure_cross_val` and `num_failure_trainset`.
#   2. `end_cross_val` is built from the *success* counts but is also used as the
#      upper bound when slicing `failure_locs`, so the failure slice does not cover
#      the intended rows (the recorded output shape is only (20, 590)).
#      Presumably the failure slice should run from num_failure_trainset to
#      num_failure_trainset + num_failure_cross_val - TODO confirm.
# end_cross_val = num_success_cross_val+num_success_trainset
# cross_val_X = sensor_data_norm_shuffled[success_locs[num_success_trainset:end_cross_val], :]
# cross_val_X = np.vstack((cross_val_X, sensor_data_norm_shuffled[failure_locs[num_failure_cross:end_cross_val],:]))
# print(cross_val_X.shape)

(20, 590)


In [None]:
# NOTE(review): this slice starts at index 0, the same success indices the training
# set takes (success_locs[:num_success_trainset]), so the test set would overlap
# the training data. Presumably it should start after the cross-validation
# indices - TODO confirm.
# testset_X = sensor_data_norm_shuffled[success_locs[:num_success_cross_val], :]