In [167]:
import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import os
import io
import json
import boto3
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, Imputer

class RemoveNA(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.created = True
    def fit(self, X, y=None):
        return self #Don't know what this is for right now
    def transform(self, X, y=None):
        X = X.dropna(axis=0, how='any')
        return X

class StandardScaleDF(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.scaler = StandardScaler()
    def fit(self, X, y=None):
        return self #Don't know what this is for right now
    def transform(self, X, y=None):
        X.loc[:,'Scaled'] = self.scaler.fit_transform(X['Value'].values.reshape(-1, 1))
        return X

class RemoveExtraHeaders(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.pattern = '^Value'
    def fit(self, X, y=None):
        return self #Don't know what this is for right now
    def transform(self, X, y=None):
        for tag in X.columns:
            X.loc[:,'Value'] = X['Value'].apply(lambda x: self.convertState(x))
        return X
    def convertState(self, value):
        if not pd.isnull(value):
            result = re.match(self.pattern, str(value))
            if result:
                return np.nan
        return value

class RemoveSystemStates(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.pattern = '^State: (\d+).+'
    def fit(self, X, y=None):
        return self #Don't know what this is for right now
    def transform(self, X, y=None):
        for tag in X.columns:
            X.loc[:, 'Value'] = X['Value'].apply(lambda x: self.convertState(x))
        return X
    def convertState(self, value):
        if not pd.isnull(value):
            result = re.match(self.pattern, str(value))
            if result:
                return np.nan
        return value

class SampleRawTimeSeries(BaseEstimator, TransformerMixin):
    def __init__(self, sample_rate):
        self.sample_rate = sample_rate
    def fit(self, X, y=None):
        return self #Don't know what this is for right now
    def transform(self, X, y=None):
        X.loc[:, 'TimeStamp'] = pd.to_datetime(X['TimeStamp'])
        X.loc[:, 'Value'] = pd.to_numeric(X['Value'])
        resampled_data = X.resample(self.sample_rate, on='TimeStamp').mean()
        resampled_data.interpolate(method='linear', inplace=True)
        return resampled_data

class NewIndexDF(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.created = True
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X.reset_index()  
    
class ChangeColtoIndexDF(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X.set_index(self.column)


sample_pipeline = Pipeline([
    ('removeStates', RemoveSystemStates()),
    ('removeHeaders', RemoveExtraHeaders()),
    ('removeNA', RemoveNA()),
    ('sample', SampleRawTimeSeries(sample_rate='20T')),
    ('newIndex', NewIndexDF()),
])

scale_pipeline = Pipeline([
    ('changeIndex', ChangeColtoIndexDF('TimeStamp')),
    ('scaleDF', StandardScaleDF()),
])

def sample_data(filename):
    data = pd.read_csv(os.path.normpath(filename))
    data = sample_pipeline.fit_transform(data)
    return data


file = "C:\DataScience\predictive-maintenance\DataExtracts\PumpData\\18VE3305B2.csv"

sampledData = sample_data(file)

numPeriods = 2050

goodPeriods = [
    "1/13/15 00:00",
    "3/24/15 00:00",
    "7/14/16 23:00",
    "11/2/16 01:00",
    "4/19/17 00:00"
]

badPeriods = [
    "10/13/15 00:00",
    "11/16/15 00:00",
    "5/6/16 00:00"
]

def GenerateDataSet(periodStart, numPeriods, sampledData):
    indexNum = sampledData[sampledData['TimeStamp'] == periodStart].index[0]
    return scale_pipeline.fit_transform(sampledData.loc[indexNum:indexNum + numPeriods - 1])

goodData = {}
goodCounter = 0
badData = {}
badCounter = 0

for period in goodPeriods:
    goodData[period] = GenerateDataSet(period, numPeriods, sampledData)
    goodData[period].to_csv("C:\DataScience\predictive-maintenance\DataExtracts\PumpData\GoodNormalized" + str(goodCounter) + ".csv")
    goodCounter += 1
    
for period in badPeriods:
    badData[period] = GenerateDataSet(period, numPeriods, sampledData)
    badData[period].to_csv("C:\DataScience\predictive-maintenance\DataExtracts\PumpData\BadNormalized" + str(badCounter) + ".csv")
    badCounter += 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [67]:
import datetime
import pandas as pd
import numpy as np
import re
import itertools
import os
import io
import json
import boto3
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.metrics.pairwise import euclidean_distances
from collections import Counter

dataFiles = {
    'Good': [
        "C:\DataScience\predictive-maintenance\DataExtracts\PumpData\GoodNormalized0.csv",
        "C:\DataScience\predictive-maintenance\DataExtracts\PumpData\GoodNormalized1.csv",
        "C:\DataScience\predictive-maintenance\DataExtracts\PumpData\GoodNormalized2.csv",
        "C:\DataScience\predictive-maintenance\DataExtracts\PumpData\GoodNormalized3.csv",
        "C:\DataScience\predictive-maintenance\DataExtracts\PumpData\GoodNormalized4.csv"
    ],
    'Bad': [
        "C:\DataScience\predictive-maintenance\DataExtracts\PumpData\BadNormalized0.csv",
        "C:\DataScience\predictive-maintenance\DataExtracts\PumpData\BadNormalized1.csv",
        "C:\DataScience\predictive-maintenance\DataExtracts\PumpData\BadNormalized2.csv"
    ]
}

observationStringLength = 4
SAXcharacters = {
                    "a": 4.5, 
                    "b": 2.5,
                    "c": 1.5,
                    "d": .5,
                    "e": 0,
                    "f": -.75,
                    "g": -1.75,
                    "h": False
                }

def GenerateSparseDF(SAXcharacters, observationStringLength):
    possibilities = []
    for combination in itertools.product(SAXcharacters, repeat=observationStringLength):
        possibilities.append(''.join(map(str, combination)))
    sparseDF = pd.DataFrame(index=possibilities)
    return sparseDF

def chunkify(lst,n):
    return [ lst[i::n] for i in range(n) ]

def SAXconvert(lst, SAXcharacters):
    saxString = ''
    #This is custom SAX threshold logic. I need to figure out a way to make this configurable somehow.
    for i in lst:
        for char in SAXcharacters:
            tempFlag = True
            if SAXcharacters[char]:
                if i > SAXcharacters[char]:
                    saxString += char
                    tempFlag = False
                    break
        if tempFlag:
            saxString += char
    return saxString

def GetSAXCounts(data, observationStringLength, SAXcharacters):
    newLength = (len(data)-(len(data)%observationStringLength))
    data = data.loc[0:newLength]
    numChunks = int(len(data)/observationStringLength)
    chunks = chunkify(data['Scaled'], numChunks)
    SAXStringCounts = pd.DataFrame.from_dict(Counter([ SAXconvert(chunk, SAXcharacters) for chunk in chunks ]), orient='index')
    return SAXStringCounts

sparseSAXMatrix = GenerateSparseDF(SAXcharacters, observationStringLength)

for classifier in dataFiles:
    iterator = 0
    for file in dataFiles[classifier]:
        data = pd.read_csv(file)
        SAXdf = GetSAXCounts(data, observationStringLength, SAXcharacters)
        SAXdf.rename(index=str, columns={0:classifier + str(iterator)}, inplace=True)
        sparseSAXMatrix = sparseSAXMatrix.merge(SAXdf, how='left', left_index=True, right_index=True)
        iterator += 1

sparseSAXMatrix.fillna(0, inplace=True)

distances = euclidean_distances(sparseSAXMatrix.transpose(), sparseSAXMatrix.transpose())

In [44]:
distances = np.concatenate(distances).tolist()

In [69]:
distances[-1]

array([  71.28814768,   56.05354583,   48.62098312,   55.15432893,
         51.82663408,   73.67496183,  129.73819792,    0.        ])

In [41]:
distances.remove(0.0)

In [46]:
np.mean(list(filter(lambda a: a != 0, distances)))

76.98744688324031

In [48]:
np.std(list(filter(lambda a: a != 0, distances)))

41.92600821992712

In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances
class BagOShapesClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self):
        self.dummy = "dummy"

    #def check_X(self, X):
        #Is this in the correct format?
        #The inputs need to have been individually changed so that the mean is 0 and the std dev is 1

    def GenerateSparseDF(self):
        possibilities = []
        for combination in itertools.product(list(self.SAXparameters.keys()), repeat=self.SAXstringlength):
            possibilities.append(''.join(map(str, combination)))
        sparseDF = pd.DataFrame(index=possibilities)
        return sparseDF

    def chunkify(self, lst, n):
        return [ lst[i::n] for i in range(n) ]

    def SAXconvert(self, lst):
        saxString = ''
        #This is custom SAX threshold logic. I need to figure out a way to make this configurable somehow.
        for i in lst:
            for char in self.SAXparameters:
                tempFlag = True
                if self.SAXparameters[char]:
                    if i > self.SAXparameters[char]:
                        saxString += char
                        tempFlag = False
                        break
            if tempFlag:
                saxString += char
        return saxString

    def GetSAXCounts(self, data):
        newLength = (len(data)-(len(data)%self.SAXstringlength))
        data = data.loc[0:newLength]
        numChunks = int(len(data)/self.SAXstringlength)
        chunks = chunkify(data['Scaled'], numChunks)
        SAXStringCounts = pd.DataFrame.from_dict(Counter([ self.SAXconvert(chunk) for chunk in chunks ]), orient='index')
        return SAXStringCounts

    def coefCalculation(self):
        zeroDistances = euclidean_distances(self.sparseSAXMatrix.transpose(), self.sparseSAXMatrix.transpose())
        self.test = zeroDistances
        zeroDistances = np.concatenate(zeroDistances).tolist()
        self.distances = list(filter(lambda a: a != 0, zeroDistances))
        return np.mean(self.distances), np.std(self.distances)

    def fit(self, X_list, SAXparameters, SAXstringlength, std_devSensitivity):
        #X = self.check_X(X)
        self.SAXparameters = SAXparameters
        self.SAXstringlength = SAXstringlength
        self.sensitivity = std_devSensitivity
        self.Xlist_ = X_list
        self.sparseSAXMatrix = self.GenerateSparseDF()
        iterator = 0
        for timeSeries in self.Xlist_:
            SAXdf = self.GetSAXCounts(timeSeries)
            SAXdf.rename(index=str, columns={0:str(iterator)}, inplace=True)
            self.sparseSAXMatrix = sparseSAXMatrix.merge(SAXdf, how='left', left_index=True, right_index=True)
            iterator += 1

        self.sparseSAXMatrix.fillna(0, inplace=True) 
        self.avg_codistance, self.std_dev = self.coefCalculation()
        return self

    def predict(self, X):
        #I'm pretty sure that I have everything set up, now it's just deliver the prediction with the new input data!
        # Check is fit had been called
        #check_is_fitted(self, ['X_', 'y_'])

        # Input validation
        #X = check_array(X)
        classification = True
        SAXdf = self.GetSAXCounts(X)
        SAXdf.rename(index=str, columns={0:'input'}, inplace=True)
        SAXtestdf = sparseSAXMatrix.merge(SAXdf, how='left', left_index=True, right_index=True)
        SAXtestdf.fillna(0, inplace=True)
        distances = euclidean_distances(SAXtestdf.transpose(), SAXtestdf.transpose())[-1].tolist()
        distances = list(filter(lambda a: a != 0, distances))
        avg = np.mean(distances)
        std_devs = np.abs(avg - self.avg_codistance)/self.std_dev
        if std_devs > self.sensitivity:
            classification = False

        #TODO: calculate confidence and use that for sensitivity instead of #of std devs
        
        return {
                'classification': classification,
                'confidence': np.nan
                }

In [2]:
SAXcharacters = {
                    "a": 4.5, 
                    "b": 2.5,
                    "c": 1.5,
                    "d": .5,
                    "e": 0,
                    "f": -.75,
                    "g": -1.75,
                    "h": False
                }

GoodData = []

for file in [
        "C:\DataScience\predictive-maintenance\DataExtracts\PumpData\GoodNormalized0.csv",
        "C:\DataScience\predictive-maintenance\DataExtracts\PumpData\GoodNormalized1.csv",
        "C:\DataScience\predictive-maintenance\DataExtracts\PumpData\GoodNormalized2.csv",
        "C:\DataScience\predictive-maintenance\DataExtracts\PumpData\GoodNormalized3.csv",
        "C:\DataScience\predictive-maintenance\DataExtracts\PumpData\GoodNormalized4.csv"
    ]:
    GoodData.append(pd.read_csv(file))

model = BagOShapesClassifier()
model.fit(X_list=GoodData, SAXparameters=SAXcharacters, SAXstringlength=4, std_devSensitivity=2)

NameError: name 'chunkify' is not defined

In [78]:
model.predict(pd.read_csv("C:\DataScience\predictive-maintenance\DataExtracts\PumpData\BadNormalized2.csv"))

{'classification': True, 'confidence': nan}

In [86]:
model.sparseSAXMatrix

Unnamed: 0,Good0,Good1,Good2,Good3,Good4,Bad0,Bad1,Bad2,4
aaaa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaac,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaae,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaaf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaag,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaba,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aabb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
model.test

array([[   0.        ,   46.021734  ,   49.53786431,   39.44616585,
          47.32863826,   80.22468448,  153.45357604,   71.28814768,
          47.32863826],
       [  46.021734  ,    0.        ,   34.84250278,   40.        ,
          35.29872519,   85.42833254,  156.26899884,   56.05354583,
          35.29872519],
       [  49.53786431,   34.84250278,    0.        ,   35.86084215,
          31.17691454,   83.48652586,  149.49916388,   48.62098312,
          31.17691454],
       [  39.44616585,   40.        ,   35.86084215,    0.        ,
          34.92849839,   82.25569889,  151.93419628,   55.15432893,
          34.92849839],
       [  47.32863826,   35.29872519,   31.17691454,   34.92849839,
           0.        ,   86.40601831,  155.53777676,   51.82663408,    0.        ],
       [  80.22468448,   85.42833254,   83.48652586,   82.25569889,
          86.40601831,    0.        ,   90.35485598,   73.67496183,
          86.40601831],
       [ 153.45357604,  156.26899884,  149.49916