#### 1.  Importing required libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

#### 2. Loading the text file timeseries dataset into pandas dataframe
###### The first column, i.e. the classes from 1 to 3, is considered as y (the target column)
###### The remaining columns of the dataframe (everything except the class column) are used as x

In [2]:

# Read the whitespace-delimited time-series file; it has no header row.
# sep=r"\s+" is the documented equivalent of the deprecated delim_whitespace=True.
df = pd.read_csv("UMD_TEST.txt", sep=r"\s+", header=None)
y = df.iloc[:, 0]        # first column: class label (target)
x = df.iloc[:, 1:151]    # columns 1..150: the time-series values (features)

#### 3. Splitting into Training and testing sets
###### Splitting x and y into X_train, X_test, Y_train, Y_test (test set 33%). X_train is the 67% training set without the class column, and X_test is the 33% test set without the class column. Y_train is the class column (labels) for the rows in X_train, and Y_test is the class column (labels) for the rows in X_test.

In [3]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=0,
)

In [4]:
def _as_positional_frame(data):
    """Wrap `data` in a DataFrame whose columns are renumbered 0..k-1."""
    frame = pd.DataFrame(data)
    frame.columns = range(frame.columns.size)
    return frame


# Make all four splits DataFrames with 0-based column labels (row labels are kept),
# so that series values can be addressed by position 0..k-1 later on.
X_train, Y_train, X_test, Y_test = (
    _as_positional_frame(part) for part in (X_train, Y_train, X_test, Y_test)
)


#### 4. Defining the distance function dtw 
###### It calculates the distance from the absolute differences between elements of the two arrays: the i-th element of array 1 is compared only with the elements of array 2 from position (i-w) (or the 1st element when i-w < 1) up to position (i+w) (or the last element when i+w exceeds the length of array 2).

In [5]:
def dtw(s, t, window):
    """Dynamic time warping distance between two 1-D sequences with a band constraint.

    The cost of aligning s[i] with t[j] is ``abs(s[i] - t[j])``. Only alignments
    with ``|i - j| <= w`` are considered, where ``w = max(window, |len(s) - len(t)|)``.

    Parameters
    ----------
    s, t : array-like of numbers (list, NumPy array, pandas Series, ...)
        The two sequences. They are converted to float arrays, so elements are
        always read by position, regardless of any pandas index labels.
    window : int
        Half-width of the warping band. It is widened to the length difference
        when smaller, because otherwise no warping path could reach the end.

    Returns
    -------
    float
        Accumulated cost of the cheapest warping path inside the band
        (``inf`` if exactly one of the sequences is empty, ``0`` if both are).
    """
    # Positional access: a pandas Series indexed by labels other than 0..n-1 would
    # otherwise raise KeyError (or read the wrong element) on s[i-1].
    s = np.asarray(s, dtype=float)
    t = np.asarray(t, dtype=float)
    n, m = len(s), len(t)
    w = max(window, abs(n - m))  # warping cannot be less than the difference in lengths.

    # Cells outside the band stay at +inf so they can never be chosen as a predecessor.
    dtw_matrix = np.full((n + 1, m + 1), np.inf)
    dtw_matrix[0, 0] = 0

    for i in range(1, n + 1):
        for j in range(max(1, i - w), min(m, i + w) + 1):
            cost = abs(s[i - 1] - t[j - 1])
            # Cheapest of the three admissible predecessors (insertion, deletion, match).
            last_min = np.min([dtw_matrix[i - 1, j], dtw_matrix[i, j - 1], dtw_matrix[i - 1, j - 1]])
            dtw_matrix[i, j] = cost + last_min
    return dtw_matrix[-1, -1]

#### 5. Distance calculation for every row of test set with every row of train set using the dtw (dynamic time warping) function defined above.
###### For each row of X_test, the row of X_train with the shortest DTW distance is determined, and the label of that training row (its value in the target column, Y_train) is used as the prediction. For a given test row, the distances to every row of X_train are collected in a list together with the position of the training row, the smallest distance is selected, and the position of that training row is looked up in Y_train; the Y_train value found there is the predicted label for the test row. This process is repeated for each row of X_test.

In [None]:
# Hand-written 1-nearest-neighbour classifier using the DTW distance (window = 10).
# For each test series, find the training series at minimum DTW distance and take
# its Y_train row as the prediction.
Y_Pred = []

for test_row in range(len(X_test)):
    test_series = X_test.iloc[test_row]  # looked up once per test row, not once per pair
    distances = [
        dtw(X_train.iloc[train_row], test_series, window = 10)
        for train_row in range(len(X_train))
    ]
    # argmin returns the first minimum, the same row a stable ascending sort would
    # put first, without sorting the whole list.
    nearest_row = int(np.argmin(distances))
    Y_Pred.append(Y_train.iloc[nearest_row])
    


#### 6. Creating a dataframe of the predicted Labels.
###### The Y_Pred predicted above are converted to dataframe.

In [194]:
# Each element of Y_Pred is the Y_train row of the nearest training series, so the
# resulting frame holds one predicted label per test row.
Y_Pred_df = pd.DataFrame(Y_Pred)
Y_Pred_df

Unnamed: 0,0
6,1.0
70,2.0
106,3.0
31,1.0
117,3.0
124,3.0
84,2.0
21,1.0
130,3.0
12,1.0


#### 7. The actual labels (classes) of the test set

In [206]:
# Display the true class labels of the held-out test rows.
Y_test

Unnamed: 0,0
7,1.0
89,2.0
97,3.0
26,1.0
110,3.0
128,3.0
59,2.0
22,1.0
129,3.0
16,1.0


#### 8. Checking the accuracy
###### For accuracy, the predicted labels (classes) and the actual labels (classes) are compared. If we observe,the Y_Pred and Y_test values, 46 out of 48 values are predicted correctly, giving an accuracy of 95.83%. Also, in the confusion matrix, we see that 16 rows are correctly labelled as class 1, 15 labels are correctly labelled as Class 2, 15 rows are correctly labelled as Class 3, 1 row is incorrectly labelled as class 2 (correct is Class 1) and  1 row is incorrectly labelled as Class 2 (correct is Class3) 

In [207]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the hand-written 1-NN DTW predictions against the true test labels:
# overall accuracy first, then the per-class confusion matrix.
ac = accuracy_score(Y_test, Y_Pred)
cm = confusion_matrix(Y_test, Y_Pred)
print(ac, cm, sep="\n")

0.9583333333333334
[[16  1  0]
 [ 0 15  0]
 [ 0  1 15]]


#### 9. Using the Sklearn Knn implementation with Euclidean distance metric
###### KNeighborsClassifier is imported and fitted on the X_train and Y_train with metric as 'minkowski' distance and p as 2, which actually means Euclidean distance. Euclidean distance is calculated as the square root of the sum of the squared differences between the two input sets.

In [216]:
# Baseline: scikit-learn 1-nearest-neighbour classifier with Euclidean distance
# (Minkowski metric with p = 2).
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

classifier = KNeighborsClassifier(n_neighbors = 1, metric = 'minkowski', p = 2)
# Y_train is a single-column DataFrame; fit() expects 1-D labels and emits a
# DataConversionWarning for a column vector, so flatten it explicitly.
classifier.fit(X_train, Y_train.values.ravel())

# Predicting the Test set results
Y_pred_sklearn_euclid = classifier.predict(X_test)

# Confusion matrix and accuracy of the Euclidean baseline on the test set
cme = confusion_matrix(Y_test, Y_pred_sklearn_euclid)
ace = accuracy_score(Y_test, Y_pred_sklearn_euclid)
  return self._fit(X, y)


#### 10. The predicted labels identified above.


In [218]:
# Wrap the Euclidean 1-NN predictions (one label per test row) in a DataFrame for display.
Y_pred_sklearn_euclid_df = pd.DataFrame(Y_pred_sklearn_euclid)
Y_pred_sklearn_euclid_df

Unnamed: 0,0
0,1.0
1,2.0
2,3.0
3,2.0
4,3.0
5,3.0
6,2.0
7,1.0
8,2.0
9,1.0


#### 11. Checking the Accuracy
###### The Knn implementation (sklearn) with Euclidean distance as the metric gives an accuracy of 83.3%: 40 of 48 values are correctly predicted. The confusion matrix shows that 13 class1 values are correctly labelled as class1, 14 class2 values are correctly labelled as class2 and 13 class3 values are correctly labelled as class3, while 4 class1 values are incorrectly labelled as class2, 1 class2 value is incorrectly labelled as class3, 1 class3 value is incorrectly labelled as class1 and 2 class3 values are incorrectly labelled as class2.

In [219]:
# Accuracy of the Euclidean 1-NN classifier on the test set.
ace

0.8333333333333334

In [220]:
# Confusion matrix of the Euclidean 1-NN classifier (true labels vs. predicted labels).
cme

array([[13,  4,  0],
       [ 0, 14,  1],
       [ 1,  2, 13]], dtype=int64)

#### 12. Finding best window parameter
##### testing the previous implementation with dtw function, with window values 0,5,10,15,20. Calculating the accuracy and confusion matrix for each window value. The below results of accuracy and confusion matrix show that with window=0 the accuracy is 83.3%,with window 5,the accuracy is 93.75%, with window 10,15 and 20 the accuracy is 95.83%. Classification using dtw with window values 10,15 and 20 is the most accurate in the calculation with window 0,5,10,15,20. with window 0 being the least accurate.


In [225]:
# Re-run the hand-written 1-NN DTW classifier for warping windows 0, 5, 10, 15, 20
# and report accuracy and confusion matrix for each window size.
for w in range(0,21,5):
    Y_Pred = []
    for test_row in range(len(X_test)):
        test_series = X_test.iloc[test_row]  # looked up once per test row, not once per pair
        distances = [
            dtw(X_train.iloc[train_row], test_series, window = w)
            for train_row in range(len(X_train))
        ]
        # argmin returns the first minimum, the same row a stable ascending sort
        # would put first, without sorting the whole list.
        nearest_row = int(np.argmin(distances))
        Y_Pred.append(Y_train.iloc[nearest_row])
    cm = confusion_matrix(Y_test, Y_Pred)
    ac = accuracy_score(Y_test,Y_Pred)
    print('Acuuracy with window ',w, 'is ', ac)
    print(cm)

Acuuracy with window  0 is  0.8333333333333334
[[14  3  0]
 [ 1 14  0]
 [ 1  3 12]]
Acuuracy with window  5 is  0.9375
[[16  1  0]
 [ 1 14  0]
 [ 0  1 15]]
Acuuracy with window  10 is  0.9583333333333334
[[16  1  0]
 [ 0 15  0]
 [ 0  1 15]]
Acuuracy with window  15 is  0.9583333333333334
[[16  1  0]
 [ 0 15  0]
 [ 0  1 15]]
Acuuracy with window  20 is  0.9583333333333334
[[16  1  0]
 [ 0 15  0]
 [ 0  1 15]]


### Task 2- KNN DTW

#### 13.Testing with sklearn KnnClassifier with user defined function dtw as the metric for distance,
###### In the below execution, using KNeighborsClassifier with metric as pyfunc and metric name as dtw,we pass window value as 10 and compare the results with the 1nn DTW implemented with accuracy 95.83%

In [228]:
# scikit-learn 1-nearest-neighbour classifier using the user-defined dtw function
# (window = 10) as the distance metric.
classifier = KNeighborsClassifier(n_neighbors = 1, algorithm='ball_tree', metric='pyfunc', metric_params={'func':dtw,'window':10} )
# Y_train is a single-column DataFrame; fit() expects 1-D labels and emits a
# DataConversionWarning for a column vector, so flatten it explicitly.
classifier.fit(X_train, Y_train.values.ravel())

# Predicting the Test set results
Y_Pred_sklearn_dtw = classifier.predict(X_test)

# Confusion matrix and accuracy of the sklearn DTW classifier on the test set
cm = confusion_matrix(Y_test, Y_Pred_sklearn_dtw)
ac = accuracy_score(Y_test, Y_Pred_sklearn_dtw)

  return self._fit(X, y)


In [229]:
# Show accuracy, then the confusion matrix, each on its own line.
print(ac, cm, sep="\n")

0.9583333333333334
[[16  1  0]
 [ 0 15  0]
 [ 0  1 15]]


#### 14. Comparing with the 1NN DTW results earlier
###### Comparing the classes predicted earlier by the 1NN DTW implementation with those predicted by sklearn using DTW as the metric, they are exactly the same: every class label is predicted identically.

In [230]:
# Re-display the labels predicted by the hand-written 1-NN DTW classifier for comparison.
Y_Pred_df

Unnamed: 0,0
6,1.0
70,2.0
106,3.0
31,1.0
117,3.0
124,3.0
84,2.0
21,1.0
130,3.0
12,1.0


In [232]:
# Wrap the sklearn DTW predictions (one label per test row) in a DataFrame for display.
Y_Pred_sklearn_dtw_df = pd.DataFrame(Y_Pred_sklearn_dtw)
Y_Pred_sklearn_dtw_df

Unnamed: 0,0
0,1.0
1,2.0
2,3.0
3,1.0
4,3.0
5,3.0
6,2.0
7,1.0
8,3.0
9,1.0


#### 15.Comparing with Knn Euclidean distance
###### Comparing KNN Euclidean with Sklearn KNN DTW shows a difference in six labels. Euclidean incorrectly labels three Class 1 labels as Class 2, one Class 3 label as Class 2, one Class 3 label as Class 1 and one Class 2 label as Class 3.

In [241]:
# Build the comparison table directly from the results computed in this notebook
# instead of reading a hand-maintained spreadsheet from an absolute, machine-specific
# path, so the cell reproduces on any machine after "Restart & Run All".
# NOTE(review): assumes the spreadsheet was derived from these same results - confirm.
actual_labels = Y_test.iloc[:, 0].values.astype(int)
pred_1nn_dtw = Y_Pred_df.iloc[:, 0].values.astype(int)
pred_sklearn_euclid = np.asarray(Y_pred_sklearn_euclid).ravel().astype(int)
pred_sklearn_dtw = np.asarray(Y_Pred_sklearn_dtw).ravel().astype(int)


def _same_or_different(left, right):
    """Return 'SAME' or 'DIFFERENT' for each position of two equal-length label arrays."""
    return np.where(left == right, "SAME", "DIFFERENT")


df_comparisons = pd.DataFrame({
    "Actual Class labels": actual_labels,
    "Predicted using 1NN DTW implementation": pred_1nn_dtw,
    "Predicted using sklearn with Euclidean distance": pred_sklearn_euclid,
    "Predicted using sklearn with DTW": pred_sklearn_dtw,
    "Comparison 1NNDTW and sklearn DTW": _same_or_different(pred_1nn_dtw, pred_sklearn_dtw),
    "Comparison sklearn Euclidean and sklearn DTW": _same_or_different(pred_sklearn_euclid, pred_sklearn_dtw),
    "Comparison Actual and 1NN DTW": _same_or_different(actual_labels, pred_1nn_dtw),
    "Comparison Actual and Euclidean": _same_or_different(actual_labels, pred_sklearn_euclid),
})

In [242]:
# Side-by-side view of actual labels, the three sets of predictions, and SAME/DIFFERENT flags.
df_comparisons

Unnamed: 0,Actual Class labels,Predicted using 1NN DTW implementation,Predicted using sklearn with Euclidean distance,Predicted using sklearn with DTW,Comparison 1NNDTW and sklearn DTW,Comparison sklearn Euclidean and sklearn DTW,Comparison Actual and 1NN DTW,Comparison Actual and Euclidean
0,1,1,1,1,SAME,SAME,SAME,SAME
1,2,2,2,2,SAME,SAME,SAME,SAME
2,3,3,3,3,SAME,SAME,SAME,SAME
3,1,1,2,1,SAME,DIFFERENT,SAME,DIFFERENT
4,3,3,3,3,SAME,SAME,SAME,SAME
5,3,3,3,3,SAME,SAME,SAME,SAME
6,2,2,2,2,SAME,SAME,SAME,SAME
7,1,1,1,1,SAME,SAME,SAME,SAME
8,3,3,2,3,SAME,DIFFERENT,SAME,DIFFERENT
9,1,1,1,1,SAME,SAME,SAME,SAME
