## Task 1 : Hand Gesture Classification using EMG

### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly as py
import plotly.graph_objs as go
from plotly import tools
from plotly.subplots import make_subplots
import h5py 

py.offline.init_notebook_mode(connected=True)

%matplotlib inline

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, normalize
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.utils import class_weight
import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Embedding, GRU, Dropout, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import (ModelCheckpoint,
                             TensorBoard, ReduceLROnPlateau,
                             CSVLogger, EarlyStopping)
from keras.backend.tensorflow_backend import set_session
from keras.utils.np_utils import to_categorical

### Load and Visualize Data

In [None]:
# Raw recordings for subject 01; both trial files live in the same folder.
SUBJECT_DIR = r"C:\Users\Aditi\Google Drive\Internship\Insai\EMG_data_for_gestures-master\EMG_data_for_gestures-master\01"
file1 = SUBJECT_DIR + "\\" + "1_raw_data_13-12_22.03.16.txt"
file2 = SUBJECT_DIR + "\\" + "2_raw_data_13-13_22.03.16.txt"


In [None]:
# Trial 2 of subject 1: tab-separated text file.
df = pd.read_csv(file2, sep="\t")
df.head()

In [None]:
# Plot EMG data: overlay all eight channels of `df` on one interactive figure.
channel_names = ['channel' + str(idx) for idx in range(1, 9)]
data = [
    go.Scatter(x=df['time'], y=df[col], name=col, mode='lines')
    for col in channel_names
]

layout = go.Layout(title='Person 1 Data (All Channels)')
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)


#### Quick Note
Plotly, in my opinion, is preferable over matplotlib for a few reasons
<br> Most importantly, it has dynamic zoom and pan. This feature is essentially perfect when working with time-series data. <br>
It is a little more complex to code, but worth it.

In [None]:
# Two stacked panels: the channel signals on top, the gesture label track below.
fig = make_subplots(
    rows=2,
    cols=1,
    row_heights=[0.7, 0.3],
    subplot_titles=("Signal Channels", "Gesture Classes"),
)
fig.update_layout(title="Subject 1, Trial 2, EMG Data")

# Top panel: one line per EMG channel.
for ch in range(1, 9):
    channel_trace = go.Scatter(
        x=df['time'],
        y=df['channel{}'.format(ch)],
        name='Channel {}'.format(ch),
        mode='lines',
    )
    fig.add_trace(channel_trace, row=1, col=1)

# Bottom panel: the class label at every time step.
label_trace = go.Scatter(x=df['time'], y=df['class'], name='Gesture Class')
fig.add_trace(label_trace, row=2, col=1)

# Comment out the next line in case the full graph with its legend is not visible.
fig.update_layout(autosize=False, width=1100, height=700)

fig.show()

In [None]:
# Trial 1 of subject 1: tab-separated text file.
df1 = pd.read_csv(file1, sep="\t")
df1.head()

In [None]:
# Two stacked panels: the channel signals on top, the gesture label track below.
fig = make_subplots(
    rows=2,
    cols=1,
    row_heights=[0.7, 0.3],
    subplot_titles=("Signal Channels", "Gesture Classes"),
)
fig.update_layout(title="Subject 1, Trial 1, EMG Data")

# Top panel: one line per EMG channel.
for ch in range(1, 9):
    channel_trace = go.Scatter(
        x=df1['time'],
        y=df1['channel{}'.format(ch)],
        name='Channel {}'.format(ch),
        mode='lines',
    )
    fig.add_trace(channel_trace, row=1, col=1)

# Bottom panel: the class label at every time step.
label_trace = go.Scatter(x=df1['time'], y=df1['class'], name='Gesture Class')
fig.add_trace(label_trace, row=2, col=1)

# Comment out the next line in case the full graph with its legend is not visible.
fig.update_layout(autosize=False, width=1100, height=700)

fig.show()

In [None]:
# Per-column summary statistics for the frame read from `file2`.
df.describe()

In [None]:
# Per-column summary statistics for the frame read from `file1`.
df1.describe()

- From the descriptions, we can see that all the channels have a similar range and scale.
- This means that normalization and scaling are not strictly necessary.

In [None]:
# Number of rows per gesture class in the `file1` recording.
class_dist = pd.pivot_table(df1, index=['class'], aggfunc='size')
print(class_dist)

In [None]:
# Number of rows per gesture class in the `file2` recording.
# This overwrites the `class_dist` computed for `df1` in the previous cell.
class_dist = pd.pivot_table(df, index=['class'], aggfunc='size')
print(class_dist)

In [None]:
# Spread of the per-class row counts, skipping the first entry of `class_dist`
# (presumably class 0, the unmarked data -- verify against the printed table).
# `class_dist` was last assigned from `df`, so this covers that recording only.
np.std(class_dist[1:])

- The imbalance in the data is mostly due to the "Unmarked Data" from Class 0
- Rest of the data seems to be more or less of equal ratio

## Logistic Regression

#### Using each channel separately as a feature
Adding both trials together since they are from the same subject

In [None]:
# Stack both recordings (rows of `df` first, then `df1`); the original row
# indices are kept, so index values repeat across the two trials.
df_total = pd.concat((df, df1))

# Labels as a plain array; features are every column except time and class.
y = df_total['class'].to_numpy()
X = df_total.drop(columns=['time', 'class'])
X.head()

In [None]:
# Rescale each feature column (axis=0) with sklearn's `normalize`; the result
# replaces the DataFrame `X`. Not strictly necessary here, but preferred.
X = normalize(X, axis=0)

In [None]:
# Sanity check: features and labels must have the same number of rows.
X.shape, y.shape

In [None]:
# Multi-class strategy passed to every LogisticRegression in this notebook.
# 'ovr' fits one binary classifier per class.
multi_class = 'ovr'
# Alternative setting, left disabled:
# multi_class = 'multinomial'

In [None]:
# Baseline: logistic regression on every row, class 0 included.
log_reg = LogisticRegression(
    solver='liblinear',
    random_state=42,
    multi_class=multi_class,
)
clf = log_reg.fit(X, y)

In [None]:
# Mean accuracy on the same X, y the classifier was fitted on
# (training accuracy, not a held-out estimate).
clf.score(X,y)

- Tried multiple combinations of parameters <br>
- Final accuracy does not change. <br>
- Logistic Regression thus gives a max accuracy of about 65% <br>

#### Analysis
This could be due to a number of reasons:
- It is likely that the imbalance in the data has led to this
- Additionally, with a logistic classifier and time series data, a high accuracy cannot be expected

### Trial 2
Dropping the class 0 entirely

In [None]:
# Peek at the first row to recall the column layout of the combined frame.
df_total.head(1)

In [None]:
# Keep only the rows that carry a gesture label (class != 0).
labelled = df_total[df_total['class'] != 0]
y = labelled['class'].to_numpy()
# Features are taken straight from the frame here; no normalize() step is applied.
X = labelled.drop(columns=['time', 'class'])
X.shape, y.shape

In [None]:
# Same baseline classifier, now fitted without the class-0 rows.
log_reg = LogisticRegression(
    solver='liblinear',
    random_state=42,
    multi_class=multi_class,
)
clf = log_reg.fit(X, y)

In [None]:
# Mean accuracy on the same X, y the classifier was fitted on
# (training accuracy, not a held-out estimate).
clf.score(X,y)

#### Analysis

- From the vast difference in the accuracies, it can be inferred that the previous classifier only did well because it mostly classified the inputs as Class 0.
- Due to the large difference in distributions, the classifier gets best accuracy by simply calling all inputs as Class 0
- This can be rectified by assigning weights to the classes

In [None]:
# L1-regularised variant with class weights set to 'balanced'.
log_reg = LogisticRegression(
    solver='liblinear',
    max_iter=100,
    random_state=42,
    multi_class=multi_class,
    penalty='l1',
    class_weight='balanced',
)
clf = log_reg.fit(X, y)

In [None]:
# Mean accuracy on the same X, y the classifier was fitted on
# (training accuracy, not a held-out estimate).
clf.score(X,y)


<b>The accuracy still does not seem to move higher than 65%.</b>


### Recurrent Neural Network

<b>Approach:</b> The data is a time series, so use chunks of the data as input.

- For example, class 1 has about 1500 sequential samples at 2 different times
- So, take sections of the data using a moving window of size 150, like: 
        1-150, 2-151, 3-152 and so on.
- For each of these sequences, the output class is 1
- This will help increase the amount of input data, while also exploiting the time-series nature of the dataset
        

Input shape, as built by the code below, will come to be: 
<br> <b>n_samples x n_channels (8) x window_length (150)</b>

- Additionally, the Recurrent Neural Network is one of the best approaches for time-series' predictions.
- While CNNs may be better overall, the noisiness of the EMG data might yield poorer results.
- Treating them as sequence sections that the Gated recurrent network can pick up is a good approach

In [None]:
def data_to_seq(data, seq_length):
    """Return every contiguous window of ``seq_length`` items from ``data``.

    Windows advance one position at a time: the first covers positions
    ``0 .. seq_length-1``, the next ``1 .. seq_length``, and so on.

    Parameters
    ----------
    data : sequence, pandas Series or array
        Values to slice along the first axis.
    seq_length : int
        Number of consecutive items per window.

    Returns
    -------
    numpy.ndarray
        Array whose first axis indexes the ``len(data) - seq_length + 1``
        windows and whose second axis has length ``seq_length``. When
        ``data`` is shorter than ``seq_length`` the first axis has length 0,
        so callers can always rely on the second axis being present.
    """
    values = np.asarray(data)
    n_windows = max(len(values) - seq_length + 1, 0)
    # Row r of `window_idx` holds the positions r, r+1, ..., r+seq_length-1;
    # gathering with it builds all windows at once instead of in a Python loop.
    window_idx = np.arange(n_windows)[:, None] + np.arange(seq_length)[None, :]
    return values[window_idx]

In [None]:
# Build the RNN inputs: for every gesture class, slide a 150-sample window over
# each of the 8 channels and collect the windows as samples of shape (8, 150).
#
# NOTE(review): the windows run over all rows of a class in `df_total`, i.e.
# across both trials and across separate repetitions of the gesture, so some
# windows straddle a discontinuity in time.
X = np.empty((1,8,150))  # uninitialised placeholder row; the following cell drops it via X = X[1:]
y = []
for Class in range(max(df_total['class']) +1):
    df_class = df_total[df_total['class']==Class].drop(['time', 'class'], axis=1)
    # One (n_windows, 150) array per channel, stacked to (8, n_windows, 150).
    class_seq = np.array([data_to_seq(df_class[channel], 150)
                          for channel in df_class.columns])
    print(class_seq.shape)
    if class_seq.ndim != 3 or class_seq.shape[1] == 0:
        # Fewer than 150 rows for this class: nothing to add.
        continue
    y = y + [Class]*class_seq.shape[1]
    # Bring the window axis to the front with a transpose. A plain reshape from
    # (8, n, 150) to (n, 8, 150) only re-chunks memory and would fill each
    # sample with 8 consecutive windows of one channel instead of the same
    # window from all 8 channels.
    X = np.concatenate([X, class_seq.transpose(1, 0, 2)])

print(X.shape)

In [None]:
# Drop the placeholder first row that np.empty created when X was initialised.
# Not idempotent: running this cell a second time discards a real sample.
X = X[1:]

In [None]:
# Integer class list -> one-hot matrix (one column per class index).
# Run once only: re-running would one-hot encode the already encoded labels.
y = to_categorical(np.array(y))

In [None]:
# Sanity check: the number of samples in X must equal the number of label rows in y.
X.shape, y.shape

#### Writing Data to a file and saving, since the above cells take a lot of time to run
(The Data file will be uploaded on GDrive and the link has been given in the readme file)

In [None]:
#Using h5py
hf = h5py.File('Task1_sequences.h5', 'w')
hf.create_dataset('X', data=b)
hf.create_dataset('y', data=y)
hf.create_dataset('info', data='Sequences of sliced data from the original dataset, with slices of length 150 each along with the corresponding class values')
hf.close()

In [None]:
#Read the files like this:
hf = h5py.File('Task1_sequences.h5', 'r')
X = hf['X']
y = hf['y']
# hf.close()

In [None]:
# Sanity check on the data read back from the HDF5 file.
X.shape, y.shape

In [None]:
# Hold out 2% of the windows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(X),
    np.array(y),
    test_size=0.02,
    random_state=42,
)

The Model Architecture is as follows:

In [None]:
# Two stacked bidirectional GRU layers followed by a small dense classifier head.
# One output unit per column of the one-hot labels.
n_classes = y_train.shape[1]

# NOTE(review): with shape (8, 150) the recurrent layers step over the 8
# channels and see 150 values per step -- confirm this is the intended layout.
In = Input(shape=(8,150), dtype=np.float32, name='signal')

x = In
gru1 = GRU(64, return_sequences=True, dropout=0.2, name = 'GRU-1')
x = Bidirectional(gru1, name='BiRNN-1')(x)

# The second recurrent layer returns only its final state for the dense head.
gru2 = GRU(128, return_sequences=False, dropout=0.2, name = 'GRU-2')
x = Bidirectional(gru2, name='BiRNN-2')(x)

x = Dense(64, name='Dense-1', activation='relu')(x)
x = Dense(16, name='Dense-2', activation='relu')(x)

# Each window belongs to exactly one gesture class, so the output is a softmax
# distribution over the classes rather than independent sigmoids.
diagn = Dense(n_classes, activation='softmax', name = 'Dense-3')(x)
model = Model(In, diagn)


In [None]:
# Print the layer-by-layer overview of the model.
model.summary()

In [None]:

# Training hyper-parameters.
# The labels are one-hot vectors with a single active class per sample, so the
# matching loss is categorical cross-entropy. With 'binary_crossentropy',
# Keras resolves metrics=['accuracy'] to a per-output binary accuracy, which
# counts every correctly predicted zero and overstates the real accuracy.
loss = 'categorical_crossentropy'
lr = 0.001
batch_size = 64
opt = Adam(lr)

# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

model.compile(loss=loss, optimizer=opt, metrics=['accuracy'])

In [None]:
# Learning-rate scheduling (ReduceLROnPlateau) and EarlyStopping are imported
# but intentionally not part of the callback list.

# Logging: TensorBoard event files plus a per-epoch CSV log.
logging_callbacks = [
    TensorBoard(log_dir='./logs', batch_size=batch_size, write_graph=False),
    CSVLogger('training.log', append=False),  # Change append to True if continuing training
]

# Checkpoints: save the LAST model after every epoch and keep the BEST one separately.
checkpoint_callbacks = [
    ModelCheckpoint('./backup_model_last.hdf5'),
    ModelCheckpoint('./backup_model_best.hdf5', save_best_only=True),
]

callbacks = logging_callbacks + checkpoint_callbacks


In [None]:
# Balanced per-class weights to counter the dominance of class 0.
# Recover integer class ids from the one-hot rows.
label_ids = np.argmax(y[:], axis=1)
present_classes = np.unique(label_ids)
# Keyword arguments work on both old and new scikit-learn releases; newer
# releases reject the positional form.
balanced = class_weight.compute_class_weight(class_weight='balanced',
                                             classes=present_classes,
                                             y=label_ids)
# Keras expects a {class index: weight} mapping, not a bare array.
class_weights = {int(cls): float(w) for cls, w in zip(present_classes, balanced)}

In [None]:
# Train the network; 5% of the training windows are set aside by Keras for
# validation, and the class weights are applied to the loss.
fit_options = dict(
    batch_size=batch_size,
    epochs=100,
    validation_split=0.05,
    shuffle=True,
    callbacks=callbacks,
    verbose=1,
    class_weight=class_weights,
)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# List the metric names recorded during training; the plots below index into these.
history.history.keys()

In [None]:
# Accuracy per epoch. Older Keras records the metric as 'acc', newer releases
# as 'accuracy'; use whichever key is present.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('Model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# The second curve comes from validation_split, not from the held-out test set.
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Loss per epoch for the training data and the validation split.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# The second curve comes from validation_split, not from the held-out test set.
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
# Leftover inspection cell: shows the type of the object returned by model.fit.
type(history)

In [None]:
# Save the model as it stands at the end of training to an HDF5 file.
model.save("./final_model.hdf5")

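Thus, we have an RNN model with very good accuracy on the validation split; the held-out test set (`X_test`, `y_test`) has not been evaluated in this notebook.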