# Load Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob


# Segmentation Function


In [2]:
def segmentation(x_data,overlap_rate,time_window):
    
    #Set 2 list of segment window and its label
    seg_data = []
    y_segmented_list = []

    #Convert overlap rate to step for sliding window
    overlap = int((1 - overlap_rate)*time_window)
    
    #Segment and keep its label
    for i in range(0,x_data.shape[0],overlap):
        seg_data.append(x_data[i:i+time_window])
        y_segmented_list.append(x_data['activity'][i])
        
    return seg_data,y_segmented_list


# Feature Extraction Function


In [9]:
def get_features(x_data):
    #Set features list
    features = []
    #Set columns name list
    DFclist=list(x_data.columns)

    #Calculate features (STD, Average, Max, Min) for each data columns X Y Z 1 -> 29
    for k in DFclist:
        # std
        features.append(x_data[k].std(ddof=0))
        # avg
        features.append(np.average(x_data[k]))
        # max
        features.append(np.max(x_data[k]))
        # min
        features.append(np.min(x_data[k]))
    return features


# Handle dataset


In [4]:
#Load all data file
data_list=glob('drive/MyDrive/Research/Challenge 2021/IC2021/Dataset/*')    

#Set train/test list
train_features_list=[]
train_label_list=[]
test_features_list=[]
test_label_list=[]


for data_file in data_list:
    df1=pd.read_csv(data_file)

    #Handle Missing data by interpolation
    df1_itpl=df1.interpolate()
    df1_itpl=df1_itpl.fillna(0)
    #Segmentation with overlaprate=0 & window=100
    [seg, seg_label]=segmentation(df1_itpl,0.5,350)

    #Run through segment window list to extract features
    for j in range(0,len(seg)):

        #Set user 1, 2 as train data and user 3 as test data
        if (any(seg[j]['subject_id']==1)) or (any(seg[j]['subject_id']==3)):
            
            #extract only xyz columns
            frame1=seg[j].drop(columns=['subject_id','activity'])
            frame1=frame1.reset_index()
            frame1=frame1.drop(columns=['index'])

            #Get features and label for each elements
            train_features_list.append(get_features(frame1))
            train_label_list.append(seg_label[j])
        else:
            #extract only xyz columns
            frame1=seg[j].drop(columns=['subject_id','activity'])
            frame1=frame1.reset_index()
            frame1=frame1.drop(columns=['index'])

            #Similar to test data
            test_features_list.append(get_features(frame1))
            test_label_list.append(seg_label[j])
            
            

In [5]:
#Check nan, infinite data
print(np.any(np.isnan(np.array(train_features_list))))
print(np.all(np.isfinite(np.array(train_features_list))))


False
True


In [6]:
len(train_features_list)


0

# Define Model & Training

In [7]:
from sklearn.ensemble import RandomForestClassifier
            
model_ml = RandomForestClassifier(n_estimators=500,n_jobs=-1)


In [8]:
model_ml.fit(train_features_list,train_label_list)


ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [9]:
for features_point in train_features_list:
  if np.any(np.isnan(features_point)):
    print(train_features_list.index(features_point))
    

In [10]:
print(train_features_list[54][325])


18.562063828571436


In [11]:
print(train_features_list[783])


[272.24347732017566, 357.70788125714284, 665.37262, 11.47626, 285.36739016485734, 137.66796302857145, 506.06268, -224.51991, 41.7497724874643, 1486.5280163142859, 1562.49854, 1439.70459, 260.1879237440185, 313.826884, 613.62494, -11.4566, 244.14091882481054, 87.16081437142857, 418.16232, -202.9156, 31.63127907613555, 1599.9200467428573, 1660.3913599999996, 1563.80798, 200.54208521543868, 248.86463128571424, 474.65726, -7.49327, 190.74114020968582, 24.082028657142864, 297.46338, -174.31129, 22.567023698058968, 1527.2756587428569, 1564.4478800000004, 1482.94678, 98.45856827714545, 313.56303160000004, 419.62439000000006, 164.21536, 247.30073667667506, -87.00147828571427, 272.52454, -330.8012700000001, 16.305879423545345, 1338.486600742857, 1372.12781, 1316.06409, 52.66517800049703, 167.11233408571428, 234.5853, 95.7487, 139.82936502061372, -132.49256765714284, 85.64576, -267.78598, 3.354021793046911, 1310.3976250285714, 1324.41785, 1304.62756, 79.19543919266735, 379.83068808571426, 464.99

# Testing

In [12]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


In [13]:
y_predict = model_ml.predict(test_features_list)
print(classification_report(test_label_list,y_predict))
confusion_matrix(test_label_list, y_predict)


              precision    recall  f1-score   support

           1       0.79      0.05      0.10       203
           2       0.78      0.04      0.07       193
           3       0.00      0.00      0.00       192
           4       0.20      0.54      0.30       190
           5       0.38      0.62      0.47       191
           6       0.80      0.04      0.08       276
           7       0.07      0.01      0.02       218
           8       0.00      0.00      0.00       221
           9       0.18      1.00      0.31       222
          10       0.00      0.00      0.00       200

    accuracy                           0.23      2106
   macro avg       0.32      0.23      0.13      2106
weighted avg       0.33      0.23      0.13      2106



  _warn_prf(average, modifier, msg_start, len(result))


array([[ 11,   2,   0, 154,   6,   0,   5,   0,  25,   0],
       [  0,   7,   0,  27, 141,   0,   2,   0,  16,   0],
       [  3,   0,   0, 149,  22,   0,   2,   0,  16,   0],
       [  0,   0,   0, 103,  27,   0,   1,   0,  59,   0],
       [  0,   0,   0,  36, 119,   0,   0,   0,  36,   0],
       [  0,   0,   0,  35,   0,  12,  14,   0, 214,   1],
       [  0,   0,   0,   0,   0,   2,   2,   0, 214,   0],
       [  0,   0,   0,   0,   0,   0,   3,   0, 201,  17],
       [  0,   0,   0,   0,   0,   1,   0,   0, 221,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0, 200,   0]])