# About Data

In [1]:
import pandas as pd
import re

In [26]:
data = pd.read_csv('letter_data.csv')

In [27]:
data.isnull().sum()

Label            0
Accelerometer    0
Gyroscope        0
dtype: int64

In [28]:
x = data.drop('Label',axis=1)
y = data['Label']

In [29]:
x.head()

Unnamed: 0,Accelerometer,Gyroscope
0,"[[0.78, -0.09, -0.1], [-0.95, -0.69, -0.15], [...","[[404.91, -465.21, -15.08], [83.01, -498.9, -1..."
1,"[[1.44, 0.68, -1.29], [-3.05, -0.74, -0.56], [...","[[45.78, -538.7, -90.33], [-85.51, -729.74, -7..."
2,"[[-0.36, -0.31, -0.07], [-1.64, -0.47, 1.06], ...","[[60.3, -361.94, 39.0], [112.61, -346.98, 101...."
3,"[[0.73, 0.4, -0.84], [-2.17, -0.75, -1.03], [-...","[[19.65, -506.53, -67.75], [42.05, -668.4, -25..."
4,"[[1.53, 0.32, -1.2], [-0.02, -0.14, -2.16], [-...","[[184.81, -361.15, -25.45], [113.65, -707.15, ..."


In [30]:
y.head()

0    A
1    A
2    A
3    A
4    A
Name: Label, dtype: object

# sampling

In [31]:
import numpy as np

In [32]:
def str2list(stringed_values):
    numeric_values = re.findall(r"[-+]?\d*\.\d+|[-+]?\d+", stringed_values)
    numeric_values_float = [float(value) for value in numeric_values]
    return numeric_values_float

In [33]:
def listoflist(n,list):
    list_of_lists = [list[i:i+n] for i in range(0, len(list), n)]
    return list_of_lists

In [56]:
def preprocessing(acc,gyro):
    """
    takes in acc and gyro string lists form the reader and then
    converts it into 2d list with float values
    scales it using standard scaler using pre defined StandardScaler()
    decreases its dimensionality using pre defined PCA()
    """
    merged_array = np.concatenate(acc, gyro)
    
    new_arr = interpolate_to_60(merged_array)
    new_arr = downsample_sequence(new_arr)
    two_d_list1 = []
    for sublist in new_arr:
        flattened_sublist1 = [item for inner_list in sublist for item in inner_list]
        two_d_list1.append(flattened_sublist1)
        
    df1 = pd.DataFrame(two_d_list1)

    scaler = StandardScaler()
    pca = PCA(n_components=100)  
    df_scaled1 = scaler.transform(df1)
    
    df_reduced1 = pca.transform(df_scaled1)
    df_reduced1 = pd.DataFrame(df_reduced1)

    return df_reduced1

    

In [35]:
acc = x.Accelerometer
acc

0      [[0.78, -0.09, -0.1], [-0.95, -0.69, -0.15], [...
1      [[1.44, 0.68, -1.29], [-3.05, -0.74, -0.56], [...
2      [[-0.36, -0.31, -0.07], [-1.64, -0.47, 1.06], ...
3      [[0.73, 0.4, -0.84], [-2.17, -0.75, -1.03], [-...
4      [[1.53, 0.32, -1.2], [-0.02, -0.14, -2.16], [-...
                             ...                        
886    [[-0.34, -0.36, 0.42], [-0.76, -0.04, 1.19], [...
887    [[0.03, 0.1, 0.18], [-0.52, -0.57, 0.51], [-1....
888    [[-0.02, 0.39, 0.28], [-0.52, -0.05, 0.6], [-0...
889    [[-0.15, 0.38, 0.53], [-0.4, 0.08, 0.56], [-0....
890    [[-0.19, -0.04, 0.4], [-0.71, -0.07, 1.22], [-...
Name: Accelerometer, Length: 891, dtype: object

In [36]:
acc_with_gyro = [x.Accelerometer[i]+x.Gyroscope[i] for i in range(len(acc))]

In [37]:
type(acc_with_gyro)

list

In [38]:
acc_num = [str2list(a) for a in acc_with_gyro]

In [39]:
# acc_num

In [40]:
len(acc_num[0])

360

In [41]:
acc_feature_lists = [listoflist(6,a) for a in acc_num]

In [42]:
acc_feature_lists

[[[0.78, -0.09, -0.1, -0.95, -0.69, -0.15],
  [-3.93, -1.09, 2.78, -1.17, -0.41, 0.98],
  [0.33, -0.26, 0.23, -0.78, 0.08, 0.87],
  [-0.55, -0.01, 0.58, -0.41, 0.25, 0.44],
  [-1.26, -0.13, 1.12, -0.68, -0.29, 0.61],
  [-0.58, -0.36, 0.54, -0.75, -0.2, 0.74],
  [-0.66, -0.09, 0.67, -0.78, -0.22, 0.65],
  [-0.68, -0.1, 0.78, -0.63, -0.23, 0.61],
  [-0.79, -0.21, 0.69, -0.78, -0.11, 0.75],
  [-0.76, -0.08, 0.78, -0.53, 0.05, 0.79],
  [-0.6, -0.05, 0.86, -0.72, -0.16, 0.78],
  [-0.64, -0.05, 0.81, -0.7, -0.1, 0.69],
  [-0.78, -0.23, 0.63, -0.81, -0.17, 0.63],
  [-0.85, -0.18, 0.56, -0.86, -0.23, 0.59],
  [-0.78, -0.2, 0.62, -0.85, -0.14, 0.63],
  [-0.84, -0.09, 0.65, -0.83, -0.05, 0.64],
  [-0.81, -0.12, 0.65, -0.86, -0.22, 0.61],
  [-0.78, -0.16, 0.66, -0.76, -0.15, 0.67],
  [-0.74, -0.15, 0.72, -0.65, -0.16, 0.84],
  [-0.67, -0.22, 0.84, -0.8, -0.28, 0.73],
  [-0.85, -0.32, 0.53, -0.66, -0.28, 0.68],
  [-0.6, -0.45, 0.6, -0.56, -0.26, 0.55],
  [-0.72, -0.11, 0.69, -0.67, -0.02, 0.61],
 

In [43]:
len(acc_feature_lists[0])

60

In [44]:
trimmed_lists = [lst[(len(lst)-35)//2:(len(lst)+35)//2] for lst in acc_feature_lists]

In [45]:
trimmed_lists

[[[-0.78, -0.23, 0.63, -0.81, -0.17, 0.63],
  [-0.85, -0.18, 0.56, -0.86, -0.23, 0.59],
  [-0.78, -0.2, 0.62, -0.85, -0.14, 0.63],
  [-0.84, -0.09, 0.65, -0.83, -0.05, 0.64],
  [-0.81, -0.12, 0.65, -0.86, -0.22, 0.61],
  [-0.78, -0.16, 0.66, -0.76, -0.15, 0.67],
  [-0.74, -0.15, 0.72, -0.65, -0.16, 0.84],
  [-0.67, -0.22, 0.84, -0.8, -0.28, 0.73],
  [-0.85, -0.32, 0.53, -0.66, -0.28, 0.68],
  [-0.6, -0.45, 0.6, -0.56, -0.26, 0.55],
  [-0.72, -0.11, 0.69, -0.67, -0.02, 0.61],
  [-0.89, -0.08, 0.81, -0.86, -0.1, 0.76],
  [-0.57, -0.12, 0.56, -0.77, -0.16, 0.76],
  [-0.79, -0.16, 0.77, -0.73, 0.2, 0.88],
  [-0.71, 0.3, 0.8, -0.75, -0.14, 0.68],
  [-0.76, -0.59, 0.62, -0.75, -0.53, 0.6],
  [-0.76, -0.1, 0.83, -0.86, -0.08, 0.85],
  [-1.07, 0.17, 1.22, -1.04, -0.23, 1.21],
  [404.91, -465.21, -15.08, 83.01, -498.9, -107.48],
  [20.69, -342.71, 114.56, 82.82, 55.42, 36.68],
  [-88.68, 6.47, -16.42, -170.72, -34.61, -8.61],
  [2.99, 33.75, 40.41, 13.06, -13.24, 20.08],
  [31.37, 16.54, 10.5, 

In [46]:
two_d_list = []
for sublist in trimmed_lists:
    flattened_sublist = [item for inner_list in sublist for item in inner_list]
    two_d_list.append(flattened_sublist)

In [47]:
two_d_list

[[-0.78,
  -0.23,
  0.63,
  -0.81,
  -0.17,
  0.63,
  -0.85,
  -0.18,
  0.56,
  -0.86,
  -0.23,
  0.59,
  -0.78,
  -0.2,
  0.62,
  -0.85,
  -0.14,
  0.63,
  -0.84,
  -0.09,
  0.65,
  -0.83,
  -0.05,
  0.64,
  -0.81,
  -0.12,
  0.65,
  -0.86,
  -0.22,
  0.61,
  -0.78,
  -0.16,
  0.66,
  -0.76,
  -0.15,
  0.67,
  -0.74,
  -0.15,
  0.72,
  -0.65,
  -0.16,
  0.84,
  -0.67,
  -0.22,
  0.84,
  -0.8,
  -0.28,
  0.73,
  -0.85,
  -0.32,
  0.53,
  -0.66,
  -0.28,
  0.68,
  -0.6,
  -0.45,
  0.6,
  -0.56,
  -0.26,
  0.55,
  -0.72,
  -0.11,
  0.69,
  -0.67,
  -0.02,
  0.61,
  -0.89,
  -0.08,
  0.81,
  -0.86,
  -0.1,
  0.76,
  -0.57,
  -0.12,
  0.56,
  -0.77,
  -0.16,
  0.76,
  -0.79,
  -0.16,
  0.77,
  -0.73,
  0.2,
  0.88,
  -0.71,
  0.3,
  0.8,
  -0.75,
  -0.14,
  0.68,
  -0.76,
  -0.59,
  0.62,
  -0.75,
  -0.53,
  0.6,
  -0.76,
  -0.1,
  0.83,
  -0.86,
  -0.08,
  0.85,
  -1.07,
  0.17,
  1.22,
  -1.04,
  -0.23,
  1.21,
  404.91,
  -465.21,
  -15.08,
  83.01,
  -498.9,
  -107.48,
  20.69,
  -342.

In [48]:
len(two_d_list)

891

In [49]:
len(two_d_list[0])

210

# DF2 - what we need

In [50]:
df2 = pd.DataFrame(two_d_list)

In [51]:
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,200,201,202,203,204,205,206,207,208,209
0,-0.78,-0.23,0.63,-0.81,-0.17,0.63,-0.85,-0.18,0.56,-0.86,...,-22.03,-8.97,-5.98,-10.07,19.10,-22.28,6.59,13.92,-20.69,5.19
1,-0.90,-0.17,0.49,-1.03,-0.00,0.43,-0.92,-0.07,0.53,-0.91,...,-15.38,12.57,0.37,-17.70,61.71,-21.55,5.98,52.86,-30.21,13.24
2,-0.69,-0.19,0.71,-0.63,-0.16,0.79,-0.55,-0.17,0.87,-0.58,...,9.09,-49.19,10.68,-11.11,-22.46,-0.55,-16.72,20.81,-16.54,-6.53
3,-0.78,-0.09,0.68,-0.67,-0.10,0.75,-0.67,-0.12,0.78,-0.59,...,-10.74,8.36,-6.65,-12.15,35.10,-18.37,3.66,33.87,-23.86,6.41
4,-0.81,-0.09,0.66,-0.80,-0.09,0.67,-0.81,-0.19,0.61,-0.81,...,-2.81,4.27,-17.15,0.00,6.90,-22.89,2.56,6.35,26.43,-1.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,-0.83,0.09,0.37,-0.93,-0.08,0.41,-0.96,-0.14,0.35,-1.11,...,15.69,24.41,-52.86,59.63,3.97,-76.54,82.58,-18.80,-58.29,60.30
887,-1.01,-0.21,0.40,-1.11,0.04,0.34,-1.09,0.13,0.33,-1.18,...,52.12,-11.17,-96.98,83.62,-53.59,-59.88,58.23,-39.49,-49.01,33.94
888,-0.97,0.20,0.28,-1.03,0.31,0.25,-1.04,0.18,0.21,-0.97,...,49.99,-12.33,-39.00,24.05,-46.57,-38.09,16.54,-60.06,-24.11,4.15
889,-0.82,0.01,0.32,-1.09,0.16,0.47,-1.16,0.61,0.30,-1.02,...,-2.99,20.26,-14.95,28.32,32.53,-59.39,49.32,24.41,-63.90,65.12


In [52]:
columns2 = ['Feature'+str(i) for i in range(df2.shape[1])]
df2.columns = columns2

In [53]:
df2

Unnamed: 0,Feature0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,...,Feature200,Feature201,Feature202,Feature203,Feature204,Feature205,Feature206,Feature207,Feature208,Feature209
0,-0.78,-0.23,0.63,-0.81,-0.17,0.63,-0.85,-0.18,0.56,-0.86,...,-22.03,-8.97,-5.98,-10.07,19.10,-22.28,6.59,13.92,-20.69,5.19
1,-0.90,-0.17,0.49,-1.03,-0.00,0.43,-0.92,-0.07,0.53,-0.91,...,-15.38,12.57,0.37,-17.70,61.71,-21.55,5.98,52.86,-30.21,13.24
2,-0.69,-0.19,0.71,-0.63,-0.16,0.79,-0.55,-0.17,0.87,-0.58,...,9.09,-49.19,10.68,-11.11,-22.46,-0.55,-16.72,20.81,-16.54,-6.53
3,-0.78,-0.09,0.68,-0.67,-0.10,0.75,-0.67,-0.12,0.78,-0.59,...,-10.74,8.36,-6.65,-12.15,35.10,-18.37,3.66,33.87,-23.86,6.41
4,-0.81,-0.09,0.66,-0.80,-0.09,0.67,-0.81,-0.19,0.61,-0.81,...,-2.81,4.27,-17.15,0.00,6.90,-22.89,2.56,6.35,26.43,-1.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,-0.83,0.09,0.37,-0.93,-0.08,0.41,-0.96,-0.14,0.35,-1.11,...,15.69,24.41,-52.86,59.63,3.97,-76.54,82.58,-18.80,-58.29,60.30
887,-1.01,-0.21,0.40,-1.11,0.04,0.34,-1.09,0.13,0.33,-1.18,...,52.12,-11.17,-96.98,83.62,-53.59,-59.88,58.23,-39.49,-49.01,33.94
888,-0.97,0.20,0.28,-1.03,0.31,0.25,-1.04,0.18,0.21,-0.97,...,49.99,-12.33,-39.00,24.05,-46.57,-38.09,16.54,-60.06,-24.11,4.15
889,-0.82,0.01,0.32,-1.09,0.16,0.47,-1.16,0.61,0.30,-1.02,...,-2.99,20.26,-14.95,28.32,32.53,-59.39,49.32,24.41,-63.90,65.12


# PCA

In [54]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()
df_scaled = scaler.fit_transform(df2)

In [55]:
pca = PCA(n_components=50)  
df_reduced = pca.fit_transform(df_scaled)

df_reduced = pd.DataFrame(df_reduced)

ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
df_reduced

# Train, Test parts

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.preprocessing import LabelEncoder

encoder= LabelEncoder()
sparse_Encoded = encoder.fit_transform(y)
# sparse_Encoded

In [None]:
X_total = df_reduced.copy()
Y_total = pd.Series(sparse_Encoded)

In [None]:
x_train,x_test,y_train,y_test = train_test_split(X_total,Y_total,test_size=0.2,random_state=42)

In [None]:
x_train.shape

In [None]:
x_test.shape

# Defining model libraries

In [None]:
import tensorflow as tf

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau


# Model

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout

input_shape = (50,1) 

model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(11, activation='softmax')  
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

# Fit the model
model.fit(x_train, y_train, epochs=100, validation_split=0.2,
         # callbacks=[EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True), 
         #            ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=8, min_lr=1e-6)]
         )


In [None]:
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=2)
print(f'\nTest accuracy: {test_acc}')

# Y-prediction

In [None]:
def prediction(x_test1):
    y_pred1 = model.predict(x_test1)
    y_pred_vals1 = np.argmax(y_pred1,axis=1)
    labels1 = encoder.inverse_transform(y_pred_vals1)
    labels_series1 = pd.Series(labels1)
    return labels_series1

In [None]:
a = prediction(x_test)

In [None]:
print(a)

# Trying new data

In [None]:
new_data_df = preprocessing(x.Accelerometer[51],x.Gyroscope[51])
new_data_df

In [None]:
b = prediction(new_data_df)

In [None]:
print(b)