In [1]:
import sklearn
from sklearn import metrics
import numpy as np
import pandas as pd
import datetime
from pyts.approximation import SymbolicFourierApproximation
from datetime import datetime
from difflib import get_close_matches
from pygtrie import StringTrie
from os import listdir
from os.path import isfile, join

In [91]:
def classify(row, n_neigh):
    knn = get_close_matches(row["data"], trie, n=n_neigh, cutoff=0)
    true = 0
    weights_for_true = 1/row['positive_number']
    false = 0
    weights_for_negative = 1/row['negative_number']
    for n in knn:
        if trie.get(n):
            true += weights_for_true
        else:
            false += weights_for_negative
    trie[row["data"]] = row["class"]
    trieDate[row["data"]] = row["date"]
    return true > false

In [92]:
time_period = pd.date_range(start='2001-08-07', end='2015-12-31')
mypath = '../data/input_log'
files = list(sorted([f for f in listdir(mypath) if isfile(join(mypath, f))]))

In [93]:
files

['.DS_Store',
 'sfa_2_2.csv',
 'sfa_2_3.csv',
 'sfa_2_4.csv',
 'sfa_3_2.csv',
 'sfa_3_4.csv',
 'sfa_4_2.csv',
 'sfa_4_4.csv',
 'sfa_5_2.csv',
 'sfa_5_4.csv']

In [94]:
for file in files[1:]:
    # read data
    df = pd.read_csv(join(mypath, file))
    df['positive_number'] = df['class'].cumsum()
    df['negative_number'] = (1-df['class']).cumsum()
    df['date'] = df['date'].astype('datetime64')
    df = df[df['date'].isin(time_period)]
    
    # split on train, test, validate sets
    df_train = df.iloc[:int(0.6*df.shape[0])]
    df_validate = df.iloc[int(0.6*df.shape[0]):int(0.8*df.shape[0])]
    df_test = df.iloc[int(0.8*df.shape[0]):]
    
    # do knn for n=range(3,6)
    print()
    print(file)
    for num_neig in range(3, 6):
        # make a prefix tree
        trie = StringTrie()
        trieDate = StringTrie()
        for i in range(0, df_train.shape[0]):
            trie[df_train["data"].iloc[i]] = df_train["class"].iloc[i]
            trieDate[df_train["data"].iloc[i]] = df_train["date"].iloc[i]
            
        # validate
        df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])
        
        # print metrics
        print("Num neigh ", num_neig)
        print(metrics.classification_report(df_validate['class'].values, df_validate['predicted_class'].values))
        
        # write results
        df_validate.to_csv("../data/validation_output/" + file[:-4] + "_"+ str(num_neig)+ ".csv")


sfa_2_2.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  3
              precision    recall  f1-score   support

       False       0.87      0.50      0.64       878
        True       0.17      0.57      0.26       156

    accuracy                           0.51      1034
   macro avg       0.52      0.54      0.45      1034
weighted avg       0.76      0.51      0.58      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  4
              precision    recall  f1-score   support

       False       0.87      0.40      0.55       878
        True       0.16      0.65      0.26       156

    accuracy                           0.44      1034
   macro avg       0.52      0.53      0.41      1034
weighted avg       0.76      0.44      0.51      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  5
              precision    recall  f1-score   support

       False       0.87      0.69      0.77       878
        True       0.19      0.40      0.26       156

    accuracy                           0.64      1034
   macro avg       0.53      0.55      0.51      1034
weighted avg       0.76      0.64      0.69      1034


sfa_2_3.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  3
              precision    recall  f1-score   support

       False       0.87      0.49      0.62       878
        True       0.17      0.59      0.26       156

    accuracy                           0.50      1034
   macro avg       0.52      0.54      0.44      1034
weighted avg       0.76      0.50      0.57      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  4
              precision    recall  f1-score   support

       False       0.87      0.38      0.53       878
        True       0.16      0.68      0.26       156

    accuracy                           0.43      1034
   macro avg       0.52      0.53      0.40      1034
weighted avg       0.76      0.43      0.49      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  5
              precision    recall  f1-score   support

       False       0.86      0.70      0.77       878
        True       0.17      0.35      0.23       156

    accuracy                           0.65      1034
   macro avg       0.52      0.53      0.50      1034
weighted avg       0.75      0.65      0.69      1034


sfa_2_4.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  3
              precision    recall  f1-score   support

       False       0.87      0.49      0.63       878
        True       0.17      0.59      0.27       156

    accuracy                           0.51      1034
   macro avg       0.52      0.54      0.45      1034
weighted avg       0.77      0.51      0.58      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  4
              precision    recall  f1-score   support

       False       0.88      0.39      0.54       878
        True       0.17      0.70      0.27       156

    accuracy                           0.43      1034
   macro avg       0.52      0.54      0.40      1034
weighted avg       0.77      0.43      0.50      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  5
              precision    recall  f1-score   support

       False       0.87      0.72      0.78       878
        True       0.19      0.38      0.25       156

    accuracy                           0.66      1034
   macro avg       0.53      0.55      0.52      1034
weighted avg       0.76      0.66      0.70      1034


sfa_3_2.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  3
              precision    recall  f1-score   support

       False       0.86      0.45      0.59       878
        True       0.16      0.58      0.25       156

    accuracy                           0.47      1034
   macro avg       0.51      0.52      0.42      1034
weighted avg       0.75      0.47      0.54      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  4
              precision    recall  f1-score   support

       False       0.86      0.36      0.51       878
        True       0.16      0.67      0.26       156

    accuracy                           0.41      1034
   macro avg       0.51      0.52      0.38      1034
weighted avg       0.76      0.41      0.47      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  5
              precision    recall  f1-score   support

       False       0.85      0.69      0.76       878
        True       0.15      0.32      0.21       156

    accuracy                           0.63      1034
   macro avg       0.50      0.50      0.49      1034
weighted avg       0.75      0.63      0.68      1034


sfa_3_4.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  3
              precision    recall  f1-score   support

       False       0.88      0.49      0.63       878
        True       0.18      0.62      0.27       156

    accuracy                           0.51      1034
   macro avg       0.53      0.55      0.45      1034
weighted avg       0.77      0.51      0.57      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  4
              precision    recall  f1-score   support

       False       0.88      0.37      0.52       878
        True       0.17      0.71      0.27       156

    accuracy                           0.42      1034
   macro avg       0.52      0.54      0.40      1034
weighted avg       0.77      0.42      0.49      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  5
              precision    recall  f1-score   support

       False       0.86      0.71      0.78       878
        True       0.18      0.35      0.23       156

    accuracy                           0.66      1034
   macro avg       0.52      0.53      0.51      1034
weighted avg       0.76      0.66      0.70      1034


sfa_4_2.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  3
              precision    recall  f1-score   support

       False       0.84      0.45      0.59       878
        True       0.14      0.51      0.22       156

    accuracy                           0.46      1034
   macro avg       0.49      0.48      0.40      1034
weighted avg       0.73      0.46      0.53      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  4
              precision    recall  f1-score   support

       False       0.85      0.35      0.50       878
        True       0.15      0.65      0.25       156

    accuracy                           0.40      1034
   macro avg       0.50      0.50      0.37      1034
weighted avg       0.74      0.40      0.46      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  5
              precision    recall  f1-score   support

       False       0.86      0.69      0.76       878
        True       0.17      0.37      0.23       156

    accuracy                           0.64      1034
   macro avg       0.51      0.53      0.50      1034
weighted avg       0.76      0.64      0.68      1034


sfa_4_4.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  3
              precision    recall  f1-score   support

       False       0.86      0.45      0.59       878
        True       0.16      0.58      0.25       156

    accuracy                           0.47      1034
   macro avg       0.51      0.51      0.42      1034
weighted avg       0.75      0.47      0.54      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  4
              precision    recall  f1-score   support

       False       0.86      0.34      0.49       878
        True       0.16      0.70      0.26       156

    accuracy                           0.40      1034
   macro avg       0.51      0.52      0.37      1034
weighted avg       0.76      0.40      0.46      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  5
              precision    recall  f1-score   support

       False       0.86      0.65      0.74       878
        True       0.16      0.38      0.23       156

    accuracy                           0.61      1034
   macro avg       0.51      0.52      0.48      1034
weighted avg       0.75      0.61      0.66      1034


sfa_5_2.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  3
              precision    recall  f1-score   support

       False       0.88      0.51      0.64       878
        True       0.18      0.60      0.27       156

    accuracy                           0.52      1034
   macro avg       0.53      0.55      0.46      1034
weighted avg       0.77      0.52      0.59      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  4
              precision    recall  f1-score   support

       False       0.89      0.39      0.54       878
        True       0.18      0.73      0.28       156

    accuracy                           0.44      1034
   macro avg       0.53      0.56      0.41      1034
weighted avg       0.78      0.44      0.50      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  5
              precision    recall  f1-score   support

       False       0.86      0.70      0.77       878
        True       0.17      0.34      0.22       156

    accuracy                           0.64      1034
   macro avg       0.51      0.52      0.50      1034
weighted avg       0.75      0.64      0.69      1034


sfa_5_4.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  3
              precision    recall  f1-score   support

       False       0.86      0.45      0.59       878
        True       0.16      0.57      0.24       156

    accuracy                           0.47      1034
   macro avg       0.51      0.51      0.42      1034
weighted avg       0.75      0.47      0.54      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])


Num neigh  4
              precision    recall  f1-score   support

       False       0.87      0.36      0.51       878
        True       0.16      0.71      0.27       156

    accuracy                           0.41      1034
   macro avg       0.52      0.53      0.39      1034
weighted avg       0.77      0.41      0.47      1034

Num neigh  5
              precision    recall  f1-score   support

       False       0.85      0.68      0.76       878
        True       0.16      0.35      0.22       156

    accuracy                           0.63      1034
   macro avg       0.51      0.51      0.49      1034
weighted avg       0.75      0.63      0.67      1034



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validate["predicted_class"] = df_validate.apply(classify, axis=1 , args=[num_neig])
