<a href="https://colab.research.google.com/github/arushi-lu/deep_learning/blob/main/CNN_Opp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive (Colab-only helper). The dataset paths
# used further down in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import math
import h5py

In [3]:
# Human-readable names for the raw label ids used in this notebook.
activities = {
    # ids that appear in the Locomotion column
    1: 'stand', 2: 'walk', 4: 'sit', 5: 'lie',
    # ids that appear in the HL_Activity column
    101: 'relaxing',
    102: 'coffee time',
    103: 'early morning',
    104: 'cleanup',
    105: 'sandwich time',
}

In [4]:
def read_files(data_dir='/content/drive/MyDrive/OpportunityUCIDataset/dataset'):
    """Load the ADL recordings of subjects S1-S4 into one DataFrame.

    Only the 16 ``S<subject>-ADL<run>.dat`` files are read; the ``S*-Drill.dat``
    recordings in the same directory are intentionally not loaded.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the ``.dat`` files and ``column_names.txt`` (one
        column name per line). Defaults to the Google Drive location used by
        this notebook.

    Returns
    -------
    pandas.DataFrame
        All rows of all files stacked in file order with a fresh 0..N-1 index.
        Columns are the names from ``column_names.txt`` followed by
        ``file_index`` (position of the source file in the read order), which
        is therefore always the last column.
    """
    import os  # local import: the notebook's import cell does not provide it

    # pick partial data from dataset: subjects 1-4, ADL runs 1-4
    list_of_files = [os.path.join(data_dir, 'S{}-ADL{}.dat'.format(subject, run))
                     for subject in range(1, 5)
                     for run in range(1, 5)]

    # a file with all column names was created
    with open(os.path.join(data_dir, 'column_names.txt'), 'r') as f:
        col_names = f.read().splitlines()
    print(len(col_names))

    # Collect the per-file frames and concatenate once; concatenating inside
    # the loop re-copies all previously read rows on every iteration.
    frames = []
    for i, file in enumerate(list_of_files):
        print(file," is reading...")
        procData = pd.read_table(file, header=None, sep=r'\s+')
        procData.columns = col_names
        procData['file_index'] = i # put the file index at the end of the row
        frames.append(procData)

    # ignore_index=True already yields a clean 0..N-1 index.
    dataCollection = pd.concat(frames, ignore_index=True)

    return dataCollection

In [5]:
def dataCleaning(dataCollection):
    """Drop unusable columns and fill every remaining missing value.

    Steps, in order:
      1. keep only columns whose fraction of missing values is below 10%;
      2. drop ``MILLISEC`` and the ``LL_*`` / ``ML_Both_Arms`` columns, which
         are not used downstream;
      3. coerce every cell to a number (non-numeric cells become NaN);
      4. linearly interpolate NaNs column-wise, in both directions so that
         NaNs at the very start of a column are filled as well.

    Parameters
    ----------
    dataCollection : pandas.DataFrame
        Raw frame; must contain the six columns dropped in step 2
        (otherwise ``drop`` raises ``KeyError``).

    Returns
    -------
    pandas.DataFrame
        A new, fully numeric frame; the input frame is not modified.

    Notes
    -----
    Interpolation runs over the whole frame at once, so it also applies to any
    label columns present and does not stop at file boundaries.
    """
    dataCollection = dataCollection.loc[:,dataCollection.isnull().mean()< 0.1] #drop the columns which has NaN over 10%
    dataCollection = dataCollection.drop(['MILLISEC', 'LL_Left_Arm','LL_Left_Arm_Object','LL_Right_Arm','LL_Right_Arm_Object', 'ML_Both_Arms'],
                                        axis = 1)  # removal of columns not related, may include others.

    dataCollection = dataCollection.apply(pd.to_numeric, errors = 'coerce') #removal of non numeric data in cells

    print(dataCollection.isna().sum().sum())#count all NaN
    print(dataCollection.shape)
    # The default interpolate() only fills forward, which leaves leading NaNs
    # in place; limit_direction='both' also back-fills them from the first
    # valid value so no NaN reaches the segmentation / training steps.
    dataCollection = dataCollection.interpolate(limit_direction='both')
    print(dataCollection.isna().sum().sum())#count all NaN
    print("data cleaned!")
    return dataCollection

In [6]:
def reset_label(dataCollection, locomotion):
    """Rewrite raw label ids in place so that they form a compact 0-based range.

    When ``locomotion`` is truthy the ``Locomotion`` column is remapped
    (5 -> 0, 4 -> 3; ids 1 and 2 are left as they are), giving labels
    [0, 1, 2, 3]. Otherwise the ``HL_Activity`` column is remapped
    (101..105 -> 0..4).

    The frame passed in is modified and the same object is returned.
    """
    if locomotion:
        # Only ids 5 and 4 need to move; 1 and 2 already sit in the target range.
        remap = {5: 0, 4: 3}
        for old_id, new_id in remap.items():
            rows = dataCollection.Locomotion == old_id
            dataCollection.loc[rows, 'Locomotion'] = new_id
        return dataCollection

    # High-level activities: 101 -> 0, 102 -> 1, ..., 105 -> 4.
    for new_id, old_id in enumerate(range(101, 106)):
        rows = dataCollection.HL_Activity == old_id
        dataCollection.loc[rows, 'HL_Activity'] = new_id
    return dataCollection

In [7]:
def segment_locomotion(dataCollection, window_size): # segment the data and create a dataset with locomotion classes as labels
    """Cut the cleaned frame into fixed-size windows labelled by locomotion.

    Rows whose ``Locomotion`` is 0 are discarded, the remaining ids are
    remapped by ``reset_label`` and the frame is scanned with a sliding window
    of ``window_size`` rows and 50% overlap. A window is emitted only when
    every row in it carries the same locomotion label and its first and last
    rows come from the same source file.

    Parameters
    ----------
    dataCollection : pandas.DataFrame
        Numeric frame containing a ``Locomotion`` column. The LAST column is
        used as the source-file index (``file_index`` as appended by
        ``read_files``) and is assumed to be non-decreasing down the frame.
    window_size : int
        Number of consecutive rows per window; must be >= 1.

    Returns
    -------
    dict
        ``'inputs'``: array of the windows, each holding the columns that
        precede ``Locomotion``; ``'labels'``: int array with one remapped
        locomotion id per window.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    #remove locomotions with 0 (drop() returns a new frame, so the relabelling
    #below does not touch the caller's frame)
    dataCollection = dataCollection.drop(dataCollection[dataCollection.Locomotion == 0].index)
    # reset labels
    dataCollection= reset_label(dataCollection,True)
    loco_i = dataCollection.columns.get_loc("Locomotion")
    #convert the data frame to numpy array
    data = dataCollection.to_numpy()
    #segment the data
    n = len(data)
    # 50% overlap; never 0, otherwise window_size == 1 would loop forever.
    step = max(1, window_size // 2)
    X = []
    y = []
    start = 0
    while start + window_size - 1 < n:
        end = start + window_size-1
        label = data[start][loco_i]
        # Check the whole window, not only its two end rows, so a short burst
        # of another activity in the middle cannot slip through.
        same_activity = np.all(data[start:(end+1), loco_i] == label)
        same_file = data[start][-1] == data[end][-1]
        if same_activity and same_file:
            X.append(data[start:(end+1),0:loco_i])
            y.append(label)
            start += step
        else: # mixed activities or mixed files: find the next start point
            while start + window_size-1 < n:
                # Stop at the first row where the label OR the source file
                # changes; ignoring the file change would skip the beginning
                # of the next file whenever it starts with the same label.
                if (data[start][loco_i] != data[start+1][loco_i]
                        or data[start][-1] != data[start+1][-1]):
                    break
                start += 1
            start += 1
    inputs = np.asarray(X)
    labels = np.asarray(y,dtype=int)
    print(inputs.shape, labels.shape)
    return {'inputs' : inputs, 'labels': labels}


In [8]:
def segment_high_level(dataCollection, window_size): # segment the data and create a dataset with high level activities as labels
    """Cut the cleaned frame into fixed-size windows labelled by high-level activity.

    Rows whose ``HL_Activity`` is 0 are discarded, the remaining ids are
    remapped by ``reset_label`` and the frame is scanned with a sliding window
    of ``window_size`` rows and 50% overlap. A window is emitted only when
    every row in it carries the same high-level label and its first and last
    rows come from the same source file.

    Parameters
    ----------
    dataCollection : pandas.DataFrame
        Numeric frame containing an ``HL_Activity`` column. The LAST column is
        used as the source-file index (``file_index`` as appended by
        ``read_files``) and is assumed to be non-decreasing down the frame.
    window_size : int
        Number of consecutive rows per window; must be >= 1.

    Returns
    -------
    dict
        ``'inputs'``: array of the windows; ``'labels'``: int array with one
        remapped high-level activity id per window. The feature slice stops
        one column short of ``HL_Activity``, i.e. it also leaves out the
        column directly before it (presumably ``Locomotion`` -- confirm the
        column order if the cleaning step changes).

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    #remove rows whose high level activity is 0 (drop() returns a new frame,
    #so the relabelling below does not touch the caller's frame)
    dataCollection = dataCollection.drop(dataCollection[dataCollection.HL_Activity == 0].index)
    # reset labels
    dataCollection= reset_label(dataCollection,False)
    HL_Activity_i = dataCollection.columns.get_loc("HL_Activity")
    #convert the data frame to numpy array
    data = dataCollection.to_numpy()
    #segment the data
    n = len(data)
    # 50% overlap; never 0, otherwise window_size == 1 would loop forever.
    step = max(1, window_size // 2)
    X = []
    y = []
    start = 0
    while start + window_size - 1 < n:
        end = start + window_size-1
        label = data[start][HL_Activity_i]
        # Check the whole window, not only its two end rows, so a short burst
        # of another activity in the middle cannot slip through.
        same_activity = np.all(data[start:(end+1), HL_Activity_i] == label)
        same_file = data[start][-1] == data[end][-1]
        if same_activity and same_file:
            X.append(data[start:(end+1),0:(HL_Activity_i-1)])# slice before locomotion
            y.append(label)
            start += step
        else: # mixed activities or mixed files: find the next start point
            while start + window_size-1 < n:
                # Stop at the first row where the label OR the source file
                # changes; ignoring the file change would skip the beginning
                # of the next file whenever it starts with the same label.
                if (data[start][HL_Activity_i] != data[start+1][HL_Activity_i]
                        or data[start][-1] != data[start+1][-1]):
                    break
                start += 1
            start += 1
    inputs = np.asarray(X)
    labels = np.asarray(y,dtype=int)
    print(inputs.shape, labels.shape)
    return {'inputs' : inputs, 'labels': labels}

In [9]:
def save_data(data,file_name): # save the data in h5 format
    """Write every entry of ``data`` as a dataset of a new HDF5 file.

    Parameters
    ----------
    data : mapping
        Maps dataset names to array-like values; each key is printed and
        stored under that name.
    file_name : str
        Path of the HDF5 file to create (opened in ``'w'`` mode, so an
        existing file is overwritten).
    """
    # The context manager closes the file even if create_dataset raises,
    # so a failed write does not leave a dangling open handle.
    with h5py.File(file_name,'w') as f:
        for key in data:
            print(key)
            f.create_dataset(key,data = data[key])
    print('Done.')

In [10]:
if __name__ == "__main__":
    # Output files: one for the locomotion-labelled windows, one for the
    # high-level-activity-labelled windows.
    loco_filename = "loco_2.h5"
    hl_filename = "hl_2.h5"
    window_size = 25  # rows per window

    # Load and clean once; both segmentations start from the same cleaned frame.
    df = read_files()
    df = dataCleaning(df)

    # Locomotion dataset.
    data_loco = segment_locomotion(df, window_size)
    save_data(data_loco, loco_filename)

    # High-level activity dataset.
    data_hl = segment_high_level(df, window_size)
    save_data(data_hl, hl_filename)


250
/content/drive/MyDrive/OpportunityUCIDataset/dataset/S1-ADL1.dat  is reading...
/content/drive/MyDrive/OpportunityUCIDataset/dataset/S1-ADL2.dat  is reading...
/content/drive/MyDrive/OpportunityUCIDataset/dataset/S1-ADL3.dat  is reading...
/content/drive/MyDrive/OpportunityUCIDataset/dataset/S1-ADL4.dat  is reading...
/content/drive/MyDrive/OpportunityUCIDataset/dataset/S2-ADL1.dat  is reading...
/content/drive/MyDrive/OpportunityUCIDataset/dataset/S2-ADL2.dat  is reading...
/content/drive/MyDrive/OpportunityUCIDataset/dataset/S2-ADL3.dat  is reading...
/content/drive/MyDrive/OpportunityUCIDataset/dataset/S2-ADL4.dat  is reading...
/content/drive/MyDrive/OpportunityUCIDataset/dataset/S3-ADL1.dat  is reading...
/content/drive/MyDrive/OpportunityUCIDataset/dataset/S3-ADL2.dat  is reading...
/content/drive/MyDrive/OpportunityUCIDataset/dataset/S3-ADL3.dat  is reading...
/content/drive/MyDrive/OpportunityUCIDataset/dataset/S3-ADL4.dat  is reading...
/content/drive/MyDrive/OpportunityUC

In [12]:
import tensorflow as tf
from sklearn import metrics
import h5py
import matplotlib.pyplot as plt
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Input, Conv2D, Dense, Flatten, Dropout, SimpleRNN, GRU, LSTM, GlobalMaxPooling1D,GlobalMaxPooling2D,MaxPooling2D,BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import itertools

In [13]:
class models():
    """Load a windowed dataset from an HDF5 file and train a CNN classifier on it.

    Typical use: ``m = models(path); m.read_h5(); m.cnn_model()``.

    Attributes set by ``read_h5``: ``X``, ``y`` (full arrays) and
    ``x_train``, ``x_test``, ``y_train``, ``y_test`` (60/40 split).
    Attributes set by ``cnn_model``: ``model`` (compiled Keras model) and
    ``r`` (the object returned by ``fit``).
    """

    def __init__(self, path):
        """Remember the path of the HDF5 file holding 'inputs' and 'labels'."""
        self.path = path


    def read_h5(self):
        """Read 'inputs' and 'labels' from ``self.path`` and split them 60/40.

        Raises ``KeyError`` if either dataset is missing from the file.
        """
        # Use the path given to the constructor (not a same-named global) and
        # close the file as soon as the arrays have been copied into memory.
        with h5py.File(self.path, 'r') as f:
            self.X = np.array(f['inputs'])
            self.y = np.array(f['labels'])
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.4, random_state = 1)

        print("X = ", self.X.shape)
        print("y =",self.y.shape)
        print(set(self.y))

    def cnn_model(self):
        """Build, compile and train the CNN; return the Keras fit result.

        Must be called after ``read_h5``. The held-out 40% split is used as
        validation data during the 50 training epochs. Labels are expected to
        be integer class ids, as required by the sparse categorical
        cross-entropy loss.
        """
        K = len(set(self.y))  # number of output classes
        # Conv2D needs a trailing channel axis. Add it only while the data is
        # still 3-D so that calling cnn_model() again does not stack extra axes.
        if self.x_train.ndim == 3:
            self.x_train = np.expand_dims(self.x_train, -1)
        if self.x_test.ndim == 3:
            self.x_test = np.expand_dims(self.x_test,-1)

        i = Input(shape=self.x_train[0].shape)
        x = Conv2D(32, (3,3), strides = 2, activation = 'relu',padding='same',kernel_regularizer=regularizers.l2(0.0005))(i)
        x = BatchNormalization()(x)
        x = MaxPooling2D((2,2))(x)
        x = Dropout(0.2)(x)
        x = Conv2D(64, (3,3), strides = 2, activation = 'relu',padding='same',kernel_regularizer=regularizers.l2(0.0005))(x)
        x = BatchNormalization()(x)
        x = Dropout(0.4)(x)
        x = Conv2D(128, (3,3), strides = 2, activation = 'relu',padding='same',kernel_regularizer=regularizers.l2(0.0005))(x)
        x = BatchNormalization()(x)
        x = MaxPooling2D((2,2))(x)
        x = Dropout(0.2)(x)
        x = Flatten()(x)
        x = Dropout(0.2)(x)
        x = Dense(1024,activation = 'relu')(x)
        x = Dropout(0.2)(x)
        x = Dense(K, activation = 'softmax')(x)
        self.model = Model(i,x)
        # `learning_rate` is the supported keyword; the old `lr` alias is deprecated.
        self.model.compile(optimizer = Adam(learning_rate=0.001),
              loss = 'sparse_categorical_crossentropy',
              metrics = ['accuracy'])

        # Explicit train/validation arrays are used instead of Keras' validation_split.
        self.r = self.model.fit(self.x_train, self.y_train, validation_data = (self.x_test, self.y_test), epochs = 50, batch_size = 32 )
        # summary() prints by itself and returns None, so it is not wrapped in print().
        self.model.summary()
        return self.r

In [14]:
if __name__ == "__main__":
    # Run configuration.
    model_name = "cnn"  # intended choices per the original note: cnn/dnn/rnn
    loco = False        # True: locomotion labels; False: high level activity labels

    # Choose the dataset file that matches the label type.
    path = "loco_2.h5" if loco else "hl_2.h5"

    oppo = models(path)

    print("read h5 file....")
    oppo.read_h5()

    # Only the CNN branch is wired up in this cell.
    if model_name == "cnn":
        oppo.cnn_model()


read h5 file....
X =  (34181, 25, 220)
y = (34181,)
{0, 1, 2, 3, 4}




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 25, 220, 1)]      0         
                                                                 
 conv2d (Conv2D)             (None, 13, 110, 32)       320       
                                                                