# Advanced Machine Learning 2nd Project
### Authors: Guilherme Cepeda - 62931, Pedro Serrano - 54853


In [9]:
#imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix
from sklearn.metrics import accuracy_score
from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from tslearn.piecewise import PiecewiseAggregateApproximation, SymbolicAggregateApproximation


### Load Data 

In [6]:
#creates a dataframe from a file
df_trainset = pd.read_csv("worms_trainset.csv")

df_testset = pd.read_csv("worms_testset.csv")

#info
print(df_trainset.info())

#info
print(df_testset.info())

print(df_trainset.shape)
print(df_testset.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Columns: 901 entries, 1.0 to -0.50553484
dtypes: float64(901)
memory usage: 1.2 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76 entries, 0 to 75
Columns: 901 entries, 1.0 to -0.32321224.1
dtypes: float64(901)
memory usage: 535.1 KB
None
(180, 901)
(76, 901)


In [15]:
df_trainset

Unnamed: 0,1.0,1.6605051,1.7390916,1.8127664,1.847148,1.9011762,1.9355578,1.9060879,1.9453811,1.9257345,...,-0.36309684.3,-0.4220367.2,-0.39747843.2,-0.40239008.3,-0.40239008.4,-0.40239008.5,-0.44168332.1,-0.43186001.1,-0.49571153.1,-0.50553484
0,1.0,-0.379133,0.242145,-0.517195,-0.033979,0.587299,-0.517195,-0.172040,0.035052,0.518269,...,-3.485523,-3.623585,-2.311998,-3.278430,-2.864245,-3.278430,-3.002307,-2.864245,-2.726183,-3.071337
1,1.0,0.534425,0.444349,0.399312,0.511906,0.669539,0.714577,0.511906,0.692058,0.489387,...,3.799677,3.934791,3.754639,4.024867,3.799677,3.777158,3.777158,3.574488,3.844715,3.574488
2,1.0,-2.438882,-2.412564,-2.438882,-2.333611,-2.267818,-2.307294,-2.412564,-2.162547,-2.241500,...,-1.715148,-1.767783,-1.794101,-1.688830,-1.701989,-1.701989,-1.767783,-1.767783,-1.754624,-1.767783
3,1.0,1.601259,1.601259,1.589440,1.589440,1.589440,1.589440,1.589440,1.577622,1.577622,...,3.445002,3.208625,3.244082,3.125893,3.031342,2.948610,2.854059,2.783146,2.794965,2.854059
4,1.0,0.998721,0.931575,1.014708,1.014708,1.014708,1.027498,1.056275,1.097841,1.113828,...,-1.194709,-1.156340,-1.114773,-1.073207,-1.092391,-1.076404,-1.015653,-0.986877,-1.044430,-1.025246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,2.0,-0.816431,-0.804662,-0.706590,-0.612441,-0.624210,-0.624210,-0.628132,-0.635978,-0.518292,...,-1.769692,-1.757923,-1.734386,-1.714772,-1.714772,-1.718695,-1.726540,-1.734386,-1.742232,-1.730463
176,2.0,-3.739104,-3.719033,-3.731076,-3.715019,-3.755160,-3.751146,-3.803328,-3.799314,-3.731076,...,0.186612,0.150486,0.126402,0.182598,0.190626,0.174570,0.210696,0.210696,0.210696,0.250836
177,2.0,-1.010301,-1.151468,-1.201885,-1.232135,-1.332969,-1.353136,-1.403553,-1.383386,-1.393470,...,1.702134,1.702134,1.712217,1.712217,1.712217,1.722301,1.722301,1.732384,1.732384,1.732384
178,2.0,1.511671,1.577663,1.569414,1.618907,1.618907,1.602410,1.635405,1.511671,1.552916,...,0.455808,0.505302,0.455808,0.381568,0.406315,0.315577,0.257834,0.266083,0.183594,0.142349


### Exploratory Data Analysis (EDA)

In [29]:
#check for duplicates training set
print("Train set duplicates:",df_trainset.duplicated().sum())

#check for duplicates test set
print("Test set duplicates:",df_testset.duplicated().sum())

#check for null values in the entire train set dataframe
print(df_trainset.isnull().any().any())

#check for null values in the entire test set dataframe
print(df_testset.isnull().any().any())


Train set duplicates: 23
Test set duplicates: 0
False
False


In [24]:
#statistical info of the train data
df_trainset.describe()


Unnamed: 0,1.0,1.6605051,1.7390916,1.8127664,1.847148,1.9011762,1.9355578,1.9060879,1.9453811,1.9257345,...,-0.36309684.3,-0.4220367.2,-0.39747843.2,-0.40239008.3,-0.40239008.4,-0.40239008.5,-0.44168332.1,-0.43186001.1,-0.49571153.1,-0.50553484
count,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,...,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0
mean,1.583333,-0.064291,-0.065017,-0.063687,-0.054981,-0.057459,-0.065366,-0.071222,-0.075473,-0.07004,...,0.099688,0.104324,0.105246,0.091169,0.090643,0.078197,0.073547,0.067658,0.067752,0.058794
std,0.494382,1.228183,1.212972,1.207888,1.206972,1.206928,1.205454,1.210743,1.208166,1.204137,...,1.283225,1.287452,1.267068,1.26823,1.267895,1.277308,1.263614,1.260002,1.281711,1.282489
min,1.0,-3.739104,-3.719033,-3.731076,-3.715019,-3.75516,-3.751146,-3.803328,-3.799314,-3.731076,...,-3.485523,-3.623585,-3.027088,-3.27843,-3.00373,-3.27843,-3.002307,-2.97036,-2.957012,-3.071337
25%,1.0,-0.854008,-0.85464,-0.80923,-0.812794,-0.863853,-0.82571,-0.859066,-0.883388,-0.886304,...,-0.922936,-0.885312,-0.87703,-0.877714,-0.859223,-0.860616,-0.853297,-0.934296,-0.921727,-0.936957
50%,2.0,-0.003233,-0.034666,-0.060338,-0.037785,-0.021782,-0.019039,0.001231,-0.042209,-0.014604,...,0.109461,0.073305,0.03448,0.049945,0.036629,0.021704,0.01985,0.047611,0.03426,0.033797
75%,2.0,0.750203,0.699233,0.749601,0.831119,0.85415,0.837433,0.845306,0.8344,0.836678,...,1.136707,1.173762,1.119435,1.084915,1.090188,1.09796,1.051917,1.065731,1.096654,1.09242
max,2.0,3.482405,3.29594,3.109476,2.923011,2.736547,2.603414,2.583677,2.563941,2.542011,...,3.799677,3.934791,3.754639,4.024867,3.799677,3.777158,3.777158,3.574488,3.844715,3.574488


In [25]:
#statistical info of the test data
df_testset.describe()

Unnamed: 0,1.0,-0.7785892,-0.74443593,-0.72546189,-0.70269304,-0.71407746,-0.72166708,-0.73305151,-0.71787227,-0.73684631,...,0.082832227,0.052473763.1,-0.042396439,-0.09931856,-0.12588222.1,-0.19039395.1,-0.22834203.3,-0.20557319.1,-0.27387973,-0.32321224.1
count,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,...,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0
mean,1.578947,-0.192069,-0.189175,-0.181068,-0.178807,-0.17688,-0.17095,-0.17242,-0.180324,-0.180246,...,-0.159047,-0.152441,-0.15985,-0.144918,-0.13777,-0.132926,-0.123438,-0.121947,-0.115584,-0.10817
std,0.497009,1.484918,1.440988,1.429251,1.39545,1.376559,1.361046,1.348735,1.341573,1.317002,...,1.089897,1.074808,1.074747,1.069314,1.067825,1.073469,1.061297,1.066026,1.062887,1.057805
min,1.0,-4.539753,-4.34627,-4.447618,-4.355484,-4.198854,-4.189641,-4.088292,-3.996157,-3.894809,...,-2.860785,-2.599652,-2.559477,-2.418867,-2.401751,-2.437829,-2.469898,-2.526019,-2.550071,-2.558089
25%,1.0,-1.181181,-1.163783,-1.130743,-1.055898,-1.049756,-1.03208,-1.014404,-1.013783,-1.076068,...,-1.056722,-1.011115,-0.960379,-0.934866,-0.905485,-0.833503,-0.817354,-0.837798,-0.88916,-0.925454
50%,2.0,-0.210207,-0.153675,-0.131455,-0.138355,-0.136446,-0.139662,-0.172429,-0.192525,-0.163554,...,-0.155121,-0.144021,-0.172759,-0.124781,-0.143766,-0.12001,-0.14493,-0.166109,-0.146595,-0.079036
75%,2.0,0.761011,0.77078,0.767794,0.669938,0.575837,0.594272,0.662421,0.659972,0.574929,...,0.491084,0.584827,0.58382,0.59097,0.573023,0.579136,0.582531,0.614534,0.630994,0.688149
max,2.0,3.735155,3.604026,3.717671,3.437931,3.542833,3.534091,3.402963,3.455414,3.210642,...,2.303505,2.281459,2.272302,2.366447,2.466869,2.592396,2.504527,2.510804,2.535909,2.523356


#### Check for Outliers
Identify outliers and anomalies in the data.

In [30]:
#calculate the z-score for each point of the training set
z_scores = np.abs((df_trainset - df_trainset.mean()) / df_trainset.std())

#define a threshold value
threshold = 3 # its considered an outiler when the value of the point is 3 * mean of the training set, so the threshold is 3

#Identify the outliers
outliers = df_trainset[z_scores > threshold]

#Count the number of outliers
num_outliers = outliers.count().sum()


print(f"outliers \n {outliers} \n") # non null values represent the outliers
print(f"outliers count \n {num_outliers} \n")

outliers 
      1.0  1.6605051  1.7390916  1.8127664  1.847148  1.9011762  1.9355578  \
0    NaN        NaN        NaN        NaN       NaN        NaN        NaN   
1    NaN        NaN        NaN        NaN       NaN        NaN        NaN   
2    NaN        NaN        NaN        NaN       NaN        NaN        NaN   
3    NaN        NaN        NaN        NaN       NaN        NaN        NaN   
4    NaN        NaN        NaN        NaN       NaN        NaN        NaN   
..   ...        ...        ...        ...       ...        ...        ...   
175  NaN        NaN        NaN        NaN       NaN        NaN        NaN   
176  NaN        NaN  -3.719033  -3.731076 -3.715019   -3.75516  -3.751146   
177  NaN        NaN        NaN        NaN       NaN        NaN        NaN   
178  NaN        NaN        NaN        NaN       NaN        NaN        NaN   
179  NaN        NaN        NaN        NaN       NaN        NaN        NaN   

     1.9060879  1.9453811  1.9257345  ...  -0.36309684.3  -0.422

### Data Processing

In [None]:
# MAYBE NORMALIZE THE DATA TO COMPARE TIME SERIES NOT SURE YET , BUT CHECK TP6 AAA
# eliminate duplicates, check if it has an impact on the results afterwards

### Best Model/Representation Method for Classification


The KNeighborsTimeSeriesClassifier model implements the k-nearest neighbor for time series. 

We have three possible metrics, as seen below in comments
* 1-NN with Euclidean distance
* 1-NN with DTW
* 1-NN with SAX, in this case you need to set two other parameters: `n_segments` and `alphabet_size_avg`. The first parameter means the number of Piecewise Aggregate Approximation pieces to compute (start by fixing it at 16) and the latter is the number of SAX symbols to use (start by fixing it at 10). To fix these parameters, you need to use the parameter `metric_params` in the class of the classifier and provide a dictionary with the two parameters required.

We are going to use the accuracy score (from scikit-learn) to compare the methods. Also, our data is already splitted in train and test set, so we don't need to worry about splitting our data.

In [None]:
#K nearest Neighbors for time series

c_e = KNeighborsTimeSeriesClassifier(n_neighbors = 1, metric = 'euclidean')
#c_dtw = KNeighborsTimeSeriesClassifier(n_neighbors = 1, metric = 'dtw')
#dict = {'n_segments' : 16 , 'alphabet_size_avg': 10}
#c = KNeighborsTimeSeriesClassifier(n_neighbors = 1, metric = 'sax', metric_params = dict)

c_e.fit(X_train,y_train)
preds = c_e.predict(X_test)

accuracy = accuracy_score(y_test, preds)

print(accuracy)

Now, we are going to explore some representation methods, namely the Piecewise Aggregate Approximation (PAA) and the Symbolic Aggregate Approximation (SAX).

In [None]:
#piecewise aggregate Approximation

paa = PiecewiseAggregateApproximation(n_segments=16)

paa_X_train_data = paa.fit_transform(X_train)
print(paa_X_train_data.shape)
paa_X_test_data = paa.fit_transform(X_test)
print(paa_X_test_data.shape)


sk_c = KNeighborsClassifier(n_neighbors = 1)
sk_c.fit(paa_X_train_data[:,:,0] ,y_train)

preds = sk_c.predict(paa_X_test_data[:,:,0])

accuracy_paa = accuracy_score(y_test, preds)

print(accuracy_paa)


#symbolic aggregate Approximation

sax = SymbolicAggregateApproximation(n_segments=16, alphabet_size_avg=40)
sax_X_train_data = sax.fit_transform(X_train)
print(sax_X_train_data.shape)
sax_X_test_data = sax.fit_transform(X_test)
print(sax_X_test_data.shape)


sk_c = KNeighborsClassifier(n_neighbors = 1)

sk_c.fit(sax_X_train_data[:,:,0] ,y_train)

preds = sk_c.predict(sax_X_test_data[:,:,0])

accuracy_sax = accuracy_score(y_test, preds)

print(accuracy_sax)


In [None]:
#Question 2 of the project is most likely to be done using the info in the TP7 timeseries forecasting i believe 

#histograms in EDA its  NO GO, too many rows in the datasets
#Same for presenting the plot for outliers, FIND ANOTHER WAY, counting them its a solution

#DONT KNOW if i have to check the outliers of the test set also 
#VERIFY if a matrix of correlations is possibleor not, sounds difficult as we dont have a target variable

#Scaling data might need to be done VERIFY  the 4 Scalers PowerTransformer, StandardScaler , MinMaxScaler and the last one is the normalizer i believe it might be useful as it works with rows check AA notes/slides, no need for imputation though
#in the Project statement when she says find the best classifier model, in the TP6 she only uses the KNeighborsClassifier CHECK if its possible to use other classification models or if its even needed to

