### Defining some basic information about the dataset

In [49]:
import pandas as pd
import glob
from random import randint
import numpy as np

# Remove columns that have a STD less than
MINIMUM_STD = 0.00001

# Define columns
DF_COLUMNS = ["ENGINE_NUMBER", "TIME_IN_CYCLES"] + \
             ["OPERATIONAL_SETTING_{}".format(x) for x in range(1,4)] + \
             ["SENSOR_MEASUREMENT_{}".format(x) for x in range(1,24)]

# Define data paths and data names
DATA_PATH = "/home/ec2-user/SageMaker/aws-sagemaker-test/data/"
DS_FILENAME = DATA_PATH + "{}_FD00{}.txt"


### Defining funcitons to load data and to filter out columns where STD is less than MINIMUM_STD

In [59]:
# Functions to load some data
def load_data(data_path, filter_data=False):  
    """
    Load data in
    """
    data = pd.read_csv(data_path, sep=' ', header=None, names=DF_COLUMNS)
    data = data.drop(DF_COLUMNS[-2:], axis=1)
#     data['TIME'] = pd.date_range('1/1/2000', periods=data.shape[0], freq='600s')
    if filter_data:
        data = drop_bad_columns(data)
    return data

def load_rul(data_path):  
    df = pd.read_csv(data_path, header=None, names=['RUL'])
    df['ENGINE_NUMBER'] = np.arange(1, len(df) + 1)
    return df

# Function to filter data that doesn't meet a certain criteria
def drop_bad_columns(dataframe):
    """
    Remove columns where the STD is less than MINIMUM_STD (only sensor data... not settings)
    """
    df = dataframe.describe().T.reset_index()
    for _,data in df.iterrows():
        if abs(data['std']) <= MINIMUM_STD and 'SENSOR' in data['index']:
            del dataframe[data['index']]
    return dataframe.reset_index(drop=True)

### Load train, test, RUL data for dataset 1 and 2

In [60]:
# Load dataset 1
train001 = load_data(DS_FILENAME.format('train', '1'), filter_data=True)
test001 = load_data(DS_FILENAME.format('test', '1'), filter_data=True)
rul001 = load_rul(DS_FILENAME.format('RUL', '1'))

# Load dataset 2
train002 = load_data(DS_FILENAME.format('train', '2'), filter_data=True)
test002 = load_data(DS_FILENAME.format('test', '2'), filter_data=True)
rul002 = load_rul(DS_FILENAME.format('RUL', '2'))

In [61]:
train001.head()

Unnamed: 0,ENGINE_NUMBER,TIME_IN_CYCLES,OPERATIONAL_SETTING_1,OPERATIONAL_SETTING_2,OPERATIONAL_SETTING_3,SENSOR_MEASUREMENT_2,SENSOR_MEASUREMENT_3,SENSOR_MEASUREMENT_4,SENSOR_MEASUREMENT_6,SENSOR_MEASUREMENT_7,SENSOR_MEASUREMENT_8,SENSOR_MEASUREMENT_9,SENSOR_MEASUREMENT_11,SENSOR_MEASUREMENT_12,SENSOR_MEASUREMENT_13,SENSOR_MEASUREMENT_14,SENSOR_MEASUREMENT_15,SENSOR_MEASUREMENT_17,SENSOR_MEASUREMENT_20,SENSOR_MEASUREMENT_21
0,1,1,-0.0007,-0.0004,100.0,641.82,1589.7,1400.6,21.61,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,642.15,1591.82,1403.14,21.61,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,642.35,1587.99,1404.2,21.61,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,390,38.95,23.3442
3,1,4,0.0007,0.0,100.0,642.35,1582.79,1401.87,21.61,554.45,2388.11,9049.48,47.13,522.86,2388.08,8133.83,8.3682,392,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,642.37,1582.85,1406.22,21.61,554.0,2388.06,9055.15,47.28,522.19,2388.04,8133.8,8.4294,393,38.9,23.4044


In [65]:
train001[''ENGINE_NUMBER'].value_counts()

AttributeError: 'DataFrame' object has no attribute 'value_counts'