### Defining some basic information about the dataset

In [91]:
import pandas as pd
import glob
from random import randint
import numpy as np
import boto3

# Sensor columns whose standard deviation is at or below this value are dropped
MINIMUM_STD = 0.00001

# Column names applied to the raw files: two identifiers, three operational
# settings and 23 sensor slots (load_data discards the final two after reading)
DF_COLUMNS = (
    ["ENGINE_NUMBER", "TIME_IN_CYCLES"]
    + ["OPERATIONAL_SETTING_{}".format(idx) for idx in range(1, 4)]
    + ["SENSOR_MEASUREMENT_{}".format(idx) for idx in range(1, 24)]
)

# Input directory, output directory and the input file-name template
# (placeholders: split name, dataset number)
DATA_PATH = "/home/ec2-user/SageMaker/aws-sagemaker-test/data/"
OUTDATA_PATH = "/home/ec2-user/SageMaker/aws-sagemaker-test/engine_data/"
DS_FILENAME = DATA_PATH + "{}_FD00{}.txt"


### Defining functions to load data and to filter out columns where STD is at or below MINIMUM_STD

In [59]:
# Functions to load some data
def load_data(data_path, filter_data=False):
    """
    Read one space-delimited, header-less data file into a DataFrame.

    Columns are named from DF_COLUMNS; the last two names in that list are
    removed again right after reading.

    Parameters
    ----------
    data_path : path or file-like object accepted by ``pd.read_csv``
    filter_data : bool, default False
        When true, the frame is additionally passed through
        ``drop_bad_columns`` before being returned.

    Returns
    -------
    pandas.DataFrame
    """
    frame = pd.read_csv(data_path, sep=' ', header=None, names=DF_COLUMNS)
    frame = frame.drop(columns=DF_COLUMNS[-2:])
    return drop_bad_columns(frame) if filter_data else frame

def load_rul(data_path):
    """
    Read a header-less file of RUL values, one value per row.

    Returns a DataFrame with the values in column ``RUL`` and an
    ``ENGINE_NUMBER`` column that numbers the rows 1..N in file order.
    """
    rul = pd.read_csv(data_path, header=None, names=['RUL'])
    engine_ids = np.arange(1, len(rul) + 1)
    return rul.assign(ENGINE_NUMBER=engine_ids)

# Function to filter data that doesn't meet a certain criteria
def drop_bad_columns(dataframe, minimum_std=None):
    """
    Return a copy of ``dataframe`` without near-constant sensor columns.

    A column is dropped when it is numeric, its name contains ``'SENSOR'``
    and its standard deviation is at or below the threshold. Columns without
    ``'SENSOR'`` in their name (identifiers, operational settings) are always
    kept, as are columns whose standard deviation is NaN.

    The input frame is left untouched; the caller must use the return value.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    minimum_std : float, optional
        Threshold to compare against. Defaults to the module-level
        ``MINIMUM_STD`` when omitted.

    Returns
    -------
    pandas.DataFrame
        New frame with the offending columns removed and a fresh 0..N-1 index.
    """
    if minimum_std is None:
        minimum_std = MINIMUM_STD

    # One vectorized pass over the numeric columns instead of a full describe()
    column_std = dataframe.select_dtypes(include=[np.number]).std()

    # NaN <= threshold is False, so columns with an undefined std are kept
    constant_sensors = [
        name for name, std in column_std.items()
        if 'SENSOR' in str(name) and std <= minimum_std
    ]

    # drop() returns a new frame, so the caller's DataFrame is not mutated
    return dataframe.drop(columns=constant_sensors).reset_index(drop=True)

### Load train, test, and RUL data for datasets 1 and 2

In [159]:
# The low-variance sensor filter is fitted on the TRAINING split only; the test
# split is then restricted to exactly the columns that survived in training.
# Filtering each split independently could leave train and test with different
# feature columns, which would silently misalign features once the frames are
# written out without headers.

# Load dataset 1
train001 = load_data(DS_FILENAME.format('train', '1'), filter_data=True)
test001 = load_data(DS_FILENAME.format('test', '1'))[train001.columns]
rul001 = load_rul(DS_FILENAME.format('RUL', '1'))

# Load dataset 2
train002 = load_data(DS_FILENAME.format('train', '2'), filter_data=True)
test002 = load_data(DS_FILENAME.format('test', '2'))[train002.columns]
rul002 = load_rul(DS_FILENAME.format('RUL', '2'))

### Feature engineering on training

In [160]:
def compute_rul(df):
    """
    Label every row with its remaining useful life (RUL).

    RUL is the engine's highest ``TIME_IN_CYCLES`` minus the row's own
    ``TIME_IN_CYCLES``, so each engine's final recorded cycle gets 0.

    Returns a new DataFrame with ``RUL`` as the first column and without
    ``ENGINE_NUMBER``; the input frame is not modified.
    """
    # Highest cycle reached by each engine
    last_cycle = (
        df.groupby('ENGINE_NUMBER')['TIME_IN_CYCLES']
        .max()
        .rename('MAX_CYCLES')
        .reset_index()
    )

    # Attach the per-engine maximum to every row and derive the label
    labelled = df.merge(last_cycle, on='ENGINE_NUMBER')
    labelled['RUL'] = labelled['MAX_CYCLES'] - labelled['TIME_IN_CYCLES']

    # RUL first, helper and identifier columns removed, other columns in order
    excluded = ('RUL', 'MAX_CYCLES', 'ENGINE_NUMBER')
    kept = [name for name in labelled.columns if name not in excluded]
    return labelled[['RUL'] + kept]


# Replace each training frame with its RUL-labelled version.
# NOTE: not re-runnable as-is — compute_rul groups by ENGINE_NUMBER and also
# removes that column from its result, so calling it again on the already
# processed frames fails. Reload the data before re-running this cell.
train001 = compute_rul(train001)
train002 = compute_rul(train002)

### Feature engineering on testing

In [161]:
def combine_test_rul(test, rul):
    """
    Reduce test data to each engine's final cycle and attach its RUL.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain ``ENGINE_NUMBER`` and ``TIME_IN_CYCLES``.
    rul : pandas.DataFrame
        Must contain ``ENGINE_NUMBER`` and ``RUL``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``test`` whose ``TIME_IN_CYCLES`` equals that engine's
        maximum, joined with ``rul`` on ``ENGINE_NUMBER``. ``RUL`` is the
        first column and ``ENGINE_NUMBER`` is removed. The join is pandas'
        default inner merge, so engines missing from either input are dropped.
    """
    # Use the string 'max': handing the Python builtin `max` to transform() is
    # deprecated in recent pandas and emits a FutureWarning.
    last_cycle = test.groupby('ENGINE_NUMBER')['TIME_IN_CYCLES'].transform('max')

    # Only keep the last row per engine number
    final_rows = test[test['TIME_IN_CYCLES'] == last_cycle]

    merged = pd.merge(final_rows, rul, on='ENGINE_NUMBER')

    # RUL first, identifier removed, remaining columns in their existing order
    kept = [col for col in merged.columns if col not in ('RUL', 'ENGINE_NUMBER')]
    return merged[['RUL'] + kept]

# Replace each test frame with its last-cycle-per-engine rows joined to the RUL labels.
# NOTE: not re-runnable as-is — combine_test_rul groups by ENGINE_NUMBER and
# also removes that column from its result, so calling it again on the already
# processed frames fails. Reload the data before re-running this cell.
test001 = combine_test_rul(test001, rul001)
test002 = combine_test_rul(test002, rul002)

### Write out new data to S3

In [163]:
# Default S3 bucket that receives the prepared files
bucket = 'bryan-predictive-maintenance'


def write_to_csv(df, fname, bucket_name=None):
    """
    Save ``df`` as a CSV file at ``fname`` and upload that file to S3.

    The CSV is written without the index and without a header row. The S3
    object key is ``sagemaker/<basename of fname>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    fname : str
        Local destination path ('/'-separated).
    bucket_name : str, optional
        Target bucket. Defaults to the module-level ``bucket`` when omitted.
    """
    # Save the file locally; the upload below sends this exact file
    df.to_csv(fname, index=False, header=False)

    # Create connection
    s3conn = boto3.client('s3')

    # Object key: fixed prefix plus the local file's base name
    outfile = 'sagemaker/{}'.format(fname.split('/')[-1])
    target_bucket = bucket if bucket_name is None else bucket_name

    # Open in binary mode (put_object expects bytes or a binary file object)
    # and inside a context manager so the handle is closed after the upload
    with open(fname, 'rb') as payload:
        s3conn.put_object(
            Body=payload,
            Bucket=target_bucket,
            Key=outfile
        )
    
# Persist and upload every prepared split under its own file name
for split_frame, csv_name in [
    (train001, 'train001.csv'),
    (train002, 'train002.csv'),
    (test001, 'test001.csv'),
    (test002, 'test002.csv'),
]:
    write_to_csv(split_frame, OUTDATA_PATH + csv_name)