# Advanced Machine Learning 2nd Project
### Authors: Guilherme Cepeda - 62931, Pedro Serrano - 54853


In [52]:
#imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix
from sklearn.metrics import accuracy_score
from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from tslearn.piecewise import PiecewiseAggregateApproximation, SymbolicAggregateApproximation
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

### Load Data 

In [2]:
# Load the train and test sets into DataFrames.

# The CSV files have no header row, so the column names are built explicitly:
# 'class' (the target) followed by the 900 time steps t1..t900 -> 901 columns.
# A dedicated name is used so the builtin `list` is not shadowed for the rest
# of the notebook.
column_names = ['class'] + [f't{i}' for i in range(1, 901)]

# names=column_names uses this list as the header of each dataframe
df_trainset = pd.read_csv("worms_trainset.csv", names=column_names)

df_testset = pd.read_csv("worms_testset.csv", names=column_names)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line to the output.
df_trainset.info()

df_testset.info()

print(df_trainset.shape)
print(df_testset.shape)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Columns: 901 entries, class to t900
dtypes: float64(901)
memory usage: 1.2 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Columns: 901 entries, class to t900
dtypes: float64(901)
memory usage: 542.1 KB
None
(181, 901)
(77, 901)


In [6]:
# Sanity check: show the header of both frames ('class' followed by t1..t900)
train_columns = df_trainset.columns.tolist()
test_columns = df_testset.columns.tolist()

print(train_columns)
print("\n")
print(test_columns)

['class', 't1', 't2', 't3', 't4', 't5', 't6', 't7', 't8', 't9', 't10', 't11', 't12', 't13', 't14', 't15', 't16', 't17', 't18', 't19', 't20', 't21', 't22', 't23', 't24', 't25', 't26', 't27', 't28', 't29', 't30', 't31', 't32', 't33', 't34', 't35', 't36', 't37', 't38', 't39', 't40', 't41', 't42', 't43', 't44', 't45', 't46', 't47', 't48', 't49', 't50', 't51', 't52', 't53', 't54', 't55', 't56', 't57', 't58', 't59', 't60', 't61', 't62', 't63', 't64', 't65', 't66', 't67', 't68', 't69', 't70', 't71', 't72', 't73', 't74', 't75', 't76', 't77', 't78', 't79', 't80', 't81', 't82', 't83', 't84', 't85', 't86', 't87', 't88', 't89', 't90', 't91', 't92', 't93', 't94', 't95', 't96', 't97', 't98', 't99', 't100', 't101', 't102', 't103', 't104', 't105', 't106', 't107', 't108', 't109', 't110', 't111', 't112', 't113', 't114', 't115', 't116', 't117', 't118', 't119', 't120', 't121', 't122', 't123', 't124', 't125', 't126', 't127', 't128', 't129', 't130', 't131', 't132', 't133', 't134', 't135', 't136', 't137', 't

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
# of the training set, including the 'class' target column
df_trainset.describe()

Unnamed: 0,class,t1,t2,t3,t4,t5,t6,t7,t8,t9,...,t891,t892,t893,t894,t895,t896,t897,t898,t899,t900
count,181.0,181.0,181.0,181.0,181.0,181.0,181.0,181.0,181.0,181.0,...,181.0,181.0,181.0,181.0,181.0,181.0,181.0,181.0,181.0,181.0
mean,1.58011,-0.054762,-0.055049,-0.05332,-0.044472,-0.046638,-0.054311,-0.060297,-0.064308,-0.059014,...,0.097132,0.101416,0.102469,0.088442,0.087919,0.075541,0.070701,0.064898,0.064639,0.055676
std,0.49491,1.231458,1.217009,1.212576,1.21189,1.212344,1.211266,1.216288,1.214133,1.209916,...,1.280118,1.284467,1.264096,1.265234,1.264899,1.274256,1.260681,1.257045,1.278832,1.279609
min,1.0,-3.739104,-3.719033,-3.731076,-3.715019,-3.75516,-3.751146,-3.803328,-3.799314,-3.731076,...,-3.485523,-3.623585,-3.027088,-3.27843,-3.00373,-3.27843,-3.002307,-2.97036,-2.957012,-3.071337
25%,1.0,-0.854008,-0.831337,-0.80441,-0.80441,-0.855404,-0.822698,-0.85016,-0.883371,-0.8709,...,-0.904257,-0.8828,-0.87703,-0.874598,-0.849661,-0.846069,-0.849122,-0.934296,-0.920635,-0.931001
50%,2.0,0.00437,-0.034666,-0.055647,-0.033979,-0.012762,0.003513,0.003733,-0.038728,-0.014604,...,0.101768,0.072891,0.021509,0.045957,0.011206,0.016229,-0.004682,0.035938,0.029757,0.029757
75%,2.0,0.764857,0.70145,0.795012,0.845768,0.863678,0.849089,0.857694,0.859221,0.853187,...,1.130934,1.173048,1.112793,1.082254,1.079085,1.094289,1.049968,1.053081,1.072255,1.081842
max,2.0,3.482405,3.29594,3.109476,2.923011,2.736547,2.603414,2.583677,2.563941,2.542011,...,3.799677,3.934791,3.754639,4.024867,3.799677,3.777158,3.777158,3.574488,3.844715,3.574488


In [54]:
# Separate the target from the features for both sets:
# y is the 'class' column, X is every remaining column (t1..t900)
TARGET = 'class'

X_train, y_train = df_trainset.drop(columns=TARGET), df_trainset[TARGET]
X_test, y_test = df_testset.drop(columns=TARGET), df_testset[TARGET]

print("X_train:\n")
print(X_train)
for label, part in (("y_train", y_train), ("X_test", X_test), ("y_test", y_test)):
    print(f"\n{label}:\n")
    print(part)
print(X_train.shape)


X_train:

           t1        t2        t3        t4        t5        t6        t7  \
0    1.660505  1.739092  1.812766  1.847148  1.901176  1.935558  1.906088   
1   -0.379133  0.242145 -0.517195 -0.033979  0.587299 -0.517195 -0.172040   
2    0.534425  0.444349  0.399312  0.511906  0.669539  0.714577  0.511906   
3   -2.438882 -2.412564 -2.438882 -2.333611 -2.267818 -2.307294 -2.412564   
4    1.601259  1.601259  1.589440  1.589440  1.589440  1.589440  1.589440   
..        ...       ...       ...       ...       ...       ...       ...   
176 -0.816431 -0.804662 -0.706590 -0.612441 -0.624210 -0.624210 -0.628132   
177 -3.739104 -3.719033 -3.731076 -3.715019 -3.755160 -3.751146 -3.803328   
178 -1.010301 -1.151468 -1.201885 -1.232135 -1.332969 -1.353136 -1.403553   
179  1.511671  1.577663  1.569414  1.618907  1.618907  1.602410  1.635405   
180  0.732443  0.698494  0.694251  0.673033  0.711225  0.723955  0.728199   

           t8        t9       t10  ...      t891      t892      t

### Exploratory Data Analysis (EDA)

In [15]:
# Duplicated rows are not treated as a problem here; they are only counted so
# that we are aware of them.
for label, frame in (("Train", df_trainset), ("Test", df_testset)):
    print(f"{label} set duplicates:", frame.duplicated().sum())

# Null check over each whole dataframe (train first, then test):
# prints True if at least one value is missing, False otherwise.
for frame in (df_trainset, df_testset):
    print(frame.isnull().any().any())


Train set duplicates: 23
Test set duplicates: 0
False
False


In [43]:
# Summary statistics of the training features only
# (X_train is df_trainset with the 'class' column dropped)
X_train.describe()


Unnamed: 0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,...,t891,t892,t893,t894,t895,t896,t897,t898,t899,t900
count,181.0,181.0,181.0,181.0,181.0,181.0,181.0,181.0,181.0,181.0,...,181.0,181.0,181.0,181.0,181.0,181.0,181.0,181.0,181.0,181.0
mean,-0.054762,-0.055049,-0.05332,-0.044472,-0.046638,-0.054311,-0.060297,-0.064308,-0.059014,-0.060587,...,0.097132,0.101416,0.102469,0.088442,0.087919,0.075541,0.070701,0.064898,0.064639,0.055676
std,1.231458,1.217009,1.212576,1.21189,1.212344,1.211266,1.216288,1.214133,1.209916,1.214343,...,1.280118,1.284467,1.264096,1.265234,1.264899,1.274256,1.260681,1.257045,1.278832,1.279609
min,-3.739104,-3.719033,-3.731076,-3.715019,-3.75516,-3.751146,-3.803328,-3.799314,-3.731076,-3.807342,...,-3.485523,-3.623585,-3.027088,-3.27843,-3.00373,-3.27843,-3.002307,-2.97036,-2.957012,-3.071337
25%,-0.854008,-0.831337,-0.80441,-0.80441,-0.855404,-0.822698,-0.85016,-0.883371,-0.8709,-0.896727,...,-0.904257,-0.8828,-0.87703,-0.874598,-0.849661,-0.846069,-0.849122,-0.934296,-0.920635,-0.931001
50%,0.00437,-0.034666,-0.055647,-0.033979,-0.012762,0.003513,0.003733,-0.038728,-0.014604,-0.003048,...,0.101768,0.072891,0.021509,0.045957,0.011206,0.016229,-0.004682,0.035938,0.029757,0.029757
75%,0.764857,0.70145,0.795012,0.845768,0.863678,0.849089,0.857694,0.859221,0.853187,0.819685,...,1.130934,1.173048,1.112793,1.082254,1.079085,1.094289,1.049968,1.053081,1.072255,1.081842
max,3.482405,3.29594,3.109476,2.923011,2.736547,2.603414,2.583677,2.563941,2.542011,2.522274,...,3.799677,3.934791,3.754639,4.024867,3.799677,3.777158,3.777158,3.574488,3.844715,3.574488


In [46]:
# Summary statistics of the test features only
# (X_test is df_testset with the 'class' column dropped)
X_test.describe()

Unnamed: 0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,...,t891,t892,t893,t894,t895,t896,t897,t898,t899,t900
count,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,...,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
mean,-0.199686,-0.196386,-0.188138,-0.185611,-0.183856,-0.178102,-0.179701,-0.187306,-0.187474,-0.183142,...,-0.155905,-0.14978,-0.158325,-0.144326,-0.137616,-0.133673,-0.124801,-0.123033,-0.117639,-0.110963
std,1.47663,1.432874,1.421172,1.387524,1.368842,1.353518,1.341355,1.334124,1.309845,1.304831,...,1.083054,1.067969,1.067737,1.062268,1.060778,1.066403,1.05436,1.059032,1.056025,1.051109
min,-4.539753,-4.34627,-4.447618,-4.355484,-4.198854,-4.189641,-4.088292,-3.996157,-3.894809,-3.811888,...,-2.860785,-2.599652,-2.559477,-2.418867,-2.401751,-2.437829,-2.469898,-2.526019,-2.550071,-2.558089
25%,-1.164998,-1.161052,-1.129835,-1.041658,-1.043875,-1.023699,-1.009225,-1.010499,-1.065569,-1.005302,...,-1.049814,-1.010397,-0.942657,-0.93257,-0.902307,-0.820113,-0.789454,-0.803023,-0.869985,-0.92285
50%,-0.232664,-0.221682,-0.179368,-0.185413,-0.166084,-0.149459,-0.185413,-0.199637,-0.184009,-0.148845,...,-0.146548,-0.121943,-0.168377,-0.117868,-0.125882,-0.164813,-0.228342,-0.183413,-0.170054,-0.099024
75%,0.758231,0.767264,0.747506,0.634862,0.513345,0.544512,0.638011,0.65582,0.542029,0.504104,...,0.476362,0.584308,0.573599,0.580146,0.549823,0.562566,0.55512,0.614483,0.622132,0.67913
max,3.735155,3.604026,3.717671,3.437931,3.542833,3.534091,3.402963,3.455414,3.210642,3.184416,...,2.303505,2.281459,2.272302,2.366447,2.466869,2.592396,2.504527,2.510804,2.535909,2.523356


#### Check for Outliers
Identify outliers and anomalies in the data.

In [14]:
# Absolute z-score of every value of the training set, computed column-wise:
# distance from the column mean expressed in column standard deviations
z_scores = np.abs((df_trainset - df_trainset.mean()) / df_trainset.std())

# A value is considered an outlier when it lies more than 3 standard
# deviations away from its column mean, i.e. |z| > 3
threshold = 3

# Boolean-mask indexing keeps the outlying values and replaces the rest by NaN
outliers = df_trainset[z_scores > threshold]

# Counting the non-null cells gives the total number of outlying values
num_outliers = outliers.count().sum()


print(f"outliers \n {outliers} \n") # non null values represent the outliers
print(f"outliers count \n {num_outliers} \n")

# NOTE: the outliers are left untouched for now. The data set is normalized
# first; if the classification results turn out to be poor, come back to this.
# (Kept as a comment: a bare string at the end of a cell is evaluated and
# echoed as the cell output.)

outliers 
      class  t1        t2        t3        t4       t5        t6        t7  \
0      NaN NaN       NaN       NaN       NaN      NaN       NaN       NaN   
1      NaN NaN       NaN       NaN       NaN      NaN       NaN       NaN   
2      NaN NaN       NaN       NaN       NaN      NaN       NaN       NaN   
3      NaN NaN       NaN       NaN       NaN      NaN       NaN       NaN   
4      NaN NaN       NaN       NaN       NaN      NaN       NaN       NaN   
..     ...  ..       ...       ...       ...      ...       ...       ...   
176    NaN NaN       NaN       NaN       NaN      NaN       NaN       NaN   
177    NaN NaN -3.719033 -3.731076 -3.715019 -3.75516 -3.751146 -3.803328   
178    NaN NaN       NaN       NaN       NaN      NaN       NaN       NaN   
179    NaN NaN       NaN       NaN       NaN      NaN       NaN       NaN   
180    NaN NaN       NaN       NaN       NaN      NaN       NaN       NaN   

           t8        t9  ...  t891  t892  t893      t894  t895  

'\nFor now, vou esquever isto\nVou normalizar o data set, e depois, se tivermos más classificações, volot a isto\n\n'

In [41]:
# Pairwise correlation matrix of every column of the training set
corr_matrix = df_trainset.corr()

# Correlation of each time step with the 'class' target, strongest positive
# first; values close to 0 indicate the least relevant features
class_correlations = corr_matrix['class'].sort_values(ascending=False)

print("\nCorrelations Matrix\n")
print(class_correlations)


Correlations Matrix

class    1.000000
t56      0.183297
t54      0.181402
t55      0.177794
t57      0.175693
           ...   
t219    -0.206140
t216    -0.209983
t215    -0.211973
t218    -0.216237
t217    -0.217713
Name: class, Length: 901, dtype: float64


### Data Processing

In [32]:
# Normalize the data so that the time series can be compared with each other

def _z_normalize(series_frame):
    """Return `series_frame` rescaled by a fresh TimeSeriesScalerMeanVariance(mu=0, std=1)."""
    return TimeSeriesScalerMeanVariance(mu=0, std=1).fit_transform(series_frame)


X_train_normalized = _z_normalize(X_train)
X_test_normalized = _z_normalize(X_test)

print(X_train_normalized)
print("\n")
print(X_test_normalized)


[[[ 1.66142837]
  [ 1.74005857]
  [ 1.81377433]
  ...
  [-0.43210013]
  [-0.49598715]
  [-0.50581593]]

 [[-0.37934378]
  [ 0.24227965]
  [-0.51748231]
  ...
  [-2.86583748]
  [-2.72769891]
  [-3.07304522]]

 [[ 0.53472256]
  [ 0.44459656]
  [ 0.39953356]
  ...
  [ 3.57647509]
  [ 3.84685304]
  [ 3.57647509]]

 ...

 [[-1.01086234]
  [-1.15210843]
  [-1.20255346]
  ...
  [ 1.73334731]
  [ 1.73334731]
  [ 1.73334731]]

 [[ 1.51251182]
  [ 1.57853991]
  [ 1.57028643]
  ...
  [ 0.26623091]
  [ 0.18369575]
  [ 0.14242817]]

 [[ 0.73284978]
  [ 0.69888245]
  [ 0.69463653]
  ...
  [-0.6088596 ]
  [-0.60036777]
  [-0.61735143]]]


[[[-0.77902211]
  [-0.74484985]
  [-0.72586526]
  ...
  [-0.20568749]
  [-0.27403201]
  [-0.32339195]]

 [[ 0.01728157]
  [-0.07194975]
  [-0.08358862]
  ...
  [ 0.78932646]
  [ 0.7970857 ]
  [ 0.80484494]]

 [[ 0.08350654]
  [ 0.07477053]
  [ 0.07913854]
  ...
  [ 0.49409897]
  [ 0.48973097]
  [ 0.47662696]]

 ...

 [[-4.54227759]
  [-4.34868671]
  [-4.45009146]
  

### Best Model/Representation Method for Classification


The KNeighborsTimeSeriesClassifier model implements the k-nearest neighbor for time series. 

We have three possible metrics, as seen below in comments
* 1-NN with Euclidean distance
* 1-NN with DTW
* 1-NN with SAX, in this case you need to set two other parameters: `n_segments` and `alphabet_size_avg`. The first parameter means the number of Piecewise Aggregate Approximation pieces to compute (start by fixing it at 16) and the latter is the number of SAX symbols to use (start by fixing it at 10). To fix these parameters, you need to use the parameter `metric_params` in the class of the classifier and provide a dictionary with the two parameters required.

We are going to use the accuracy score (from scikit-learn) to compare the methods. Also, our data is already split into a train set and a test set, so we don't need to split it ourselves.

In [108]:
# k-nearest neighbours for time series (1-NN with DTW is the active choice)
#
# Alternatives that were tried, with the accuracy noted at the time:
#   KNeighborsTimeSeriesClassifier(n_neighbors = 3, metric = 'euclidean')  -> 0.6103896103896104
#   KNeighborsTimeSeriesClassifier(n_neighbors = 1, metric = 'dtw')        -> 0.6233766233766234
#   KNeighborsTimeSeriesClassifier(n_neighbors = 1, metric = 'sax',
#       metric_params = {'n_segments' : 16 , 'alphabet_size_avg': 10})     -> 0.5974025974025974
# Other candidates still to be tested here: SVC(C=50,gamma='auto'),
# RandomForestClassifier(n_estimators=12, random_state=0), LogisticRegression(C = 0.01),
# DecisionTreeClassifier(criterion = 'gini'),
# KNeighborsClassifier(n_neighbors = 5, algorithm = 'ball_tree',weights = 'distance'), GaussianNB()
c = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='dtw')

c.fit(X_train, y_train)
preds = c.predict(X_test)

# Score the predictions against the true test labels
precision, recall, f1, mcc = (
    score(y_test, preds)
    for score in (precision_score, recall_score, f1_score, matthews_corrcoef)
)

for label, value in (
    ("Precision", precision),
    ("Recall", recall),
    ("F1 score", f1),
    ("Matthews correlation coefficient", mcc),
):
    print(f"The {label} is: {value:7.4f}")
print()
print("This is the Confusion Matrix")
print(pd.DataFrame(confusion_matrix(y_test, preds)))

The Precision is:  0.5455
The Recall is:  0.7273
The F1 score is:  0.6234
The Matthews correlation coefficient is:  0.2727

This is the Confusion Matrix
    0   1
0  24   9
1  20  24


Now, we are going to explore some representation methods, namely the Piecewise Aggregate Approximation (PAA) and the Symbolic Aggregate Approximation (SAX).

In [57]:
# Piecewise Aggregate Approximation (PAA) followed by a plain 1-NN classifier

paa = PiecewiseAggregateApproximation(n_segments=12)

# The transformer is fitted on the training series only; the test series are
# then pushed through that same fitted transformer (transform, not
# fit_transform) so nothing is ever fitted on test data.
paa_X_train_data = paa.fit_transform(X_train_normalized)
print(paa_X_train_data.shape)
paa_X_test_data = paa.transform(X_test_normalized)
print(paa_X_test_data.shape)


# The PAA output is indexed with [:,:,0] to hand the scikit-learn classifier a
# 2-D (series, segments) array
sk_c = KNeighborsClassifier(n_neighbors = 1)
sk_c.fit(paa_X_train_data[:,:,0] ,y_train)

preds = sk_c.predict(paa_X_test_data[:,:,0])

accuracy_paa = accuracy_score(y_test, preds)

print(accuracy_paa)



(181, 10, 1)
(77, 10, 1)
0.6233766233766234


In [113]:
# Symbolic Aggregate Approximation (SAX) followed by a plain 1-NN classifier

sax = SymbolicAggregateApproximation(n_segments=32, alphabet_size_avg=50)

# Fit on the training series only and reuse the fitted transformer for the
# test series (transform, not fit_transform) so nothing is fitted on test data.
# NOTE(review): the raw X_train / X_test are used here rather than the
# normalized arrays computed in the Data Processing section - confirm intended.
sax_X_train_data = sax.fit_transform(X_train)
print(sax_X_train_data.shape)
sax_X_test_data = sax.transform(X_test)
print(sax_X_test_data.shape)


sk_c = KNeighborsClassifier(n_neighbors = 1)

# [:,:,0] hands the scikit-learn classifier a 2-D (series, segments) array
sk_c.fit(sax_X_train_data[:,:,0] ,y_train)

preds = sk_c.predict(sax_X_test_data[:,:,0])

print(sax_X_test_data[:,:,0].shape)

accuracy_sax = accuracy_score(y_test, preds)

print(accuracy_sax)

(181, 32, 1)
(77, 32, 1)
(77, 32)
0.6753246753246753




In [None]:
def _as_2d(data):
    """Return `data` as a 2-D (n_samples, n_features) array.

    Works for both the 2-D output of the scikit-learn scalers and the 3-D
    (n_series, length, 1) output of the tslearn transformers.
    """
    data = np.asarray(data)
    return data.reshape(data.shape[0], -1)


# Execute list of scalers, representation methods, models and present the results
def test_models (scalers, representation_models, models, X_train, y_train, X_test,y_test, verbose, show_model):
    """Fit and score every scaler / representation / model combination.

    Parameters
    ----------
    scalers, representation_models, models : list of (name, estimator) tuples
        Candidates for each stage of the pipeline.
    X_train, y_train, X_test, y_test
        Training and test data; every transformer is fitted on the training
        data only and then applied to both sets.
    verbose : bool
        When True the representation methods are applied after scaling and
        shown in the final table. When False the representation step is
        skipped entirely and each scaler/model pair is evaluated once (recorded
        with the placeholder representation ``("none", None)``).
    show_model : bool
        When True the progress and scores of every model are printed as they
        are computed.

    Returns
    -------
    list of tuple
        The result tuples in evaluation order (the table printed at the end is
        sorted by decreasing F1, the returned list is not).
    """
    results =[]
    ct = 0

    # Without the representation step there is nothing to iterate over:
    # looping over representation_models would only refit identical models and
    # label the results with a representation that was never applied.
    rep_candidates = representation_models if verbose else [("none", None)]

    for name_scaler, scaler in scalers:
        # scaling depends on the scaler only, so it is done once per scaler
        scaler.fit(X_train)
        X_scaled_train = scaler.transform(X_train)
        X_scaled_test  = scaler.transform(X_test)

        for name_rep_method, rep_method in rep_candidates:
            # representation methods (fitted on the training data only)
            if rep_method is not None:
                rep_method.fit(X_scaled_train)
                Xt_train = rep_method.transform(X_scaled_train)
                Xt_test  = rep_method.transform(X_scaled_test)
            else:
                Xt_train, Xt_test = X_scaled_train, X_scaled_test

            # The classifiers get 2-D input. Plain [:,:,0] indexing would fail
            # on the 2-D arrays returned by the scikit-learn scalers.
            Xt_train = _as_2d(Xt_train)
            Xt_test  = _as_2d(Xt_test)

            for name_mod, model in models:
                model.fit(Xt_train, y_train)
                preds = model.predict(Xt_test) #PREDICTION

                #present model number
                if show_model:
                    ct += 1
                    print("\nModel %d" % ct)

                #save results
                results = save_results (name_scaler, scaler, name_rep_method, rep_method, name_mod, model, results,y_test, preds, show_model)

    results_sorted = sorted(results, key=lambda x: x[8], reverse=True) #f1 sorted decreasing
    display_results(results_sorted, verbose)
    return results



# Save the model scores and present intermediate results (w/ show_model)
# Returns the list with the saved results
def save_results(name_scaler, scaler,name_rep_method, rep_method, name_mod, model, results, y_test, preds, show_model):
    """Score one set of predictions and append the scores to `results`.

    Precision, recall, F1 and the Matthews correlation coefficient are computed
    from `y_test` and `preds`. When `show_model` is True the scores and the
    confusion matrix are also printed.

    The tuple appended to `results` is
    (name_scaler, scaler, name_rep_method, rep_method, name_mod, model,
     precision, recall, f1, mcc).

    Returns the same `results` list that was passed in (it is extended in place).
    """
    # Calculate the precision, recall, f1 and mcc scores
    precision = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    mcc = matthews_corrcoef(y_test, preds)

    if show_model:
        # rep_method is the representation method, so it is labelled as such
        print(f"Scaler: {scaler} representation: {rep_method} classifier: {name_mod} {model}")
        print("The Precision is: %7.4f" % precision)
        print("The Recall is: %7.4f" % recall)
        print("The F1 score is: %7.4f" % f1)
        print("The Matthews correlation coefficient is: %7.4f" % mcc)
        print()
        print("This is the Confusion Matrix")
        print(pd.DataFrame(confusion_matrix(y_test, preds)))


    results.append((name_scaler,
                    scaler,
                    name_rep_method,
                    rep_method,
                    name_mod,
                    model,
                    precision,
                    recall,
                    f1,
                    mcc,
                    ))
    return results

# Display the model final results. Receives the ordered results to present
def display_results (results, verbose):
    """Print one two-line entry per result tuple.

    Parameters
    ----------
    results : iterable of tuple
        Tuples laid out as (name_scaler, scaler, name_rep_method, rep_method,
        name_mod, model, precision, recall, f1, mcc); they are printed in the
        order received.
    verbose : bool
        When True the representation method is shown next to the scaler.
    """
    noshow = ""
    print (f"\n--------------------------Results for Classification Models Performance--------------------------")
    for res in results:
        scaler, rep_method, name_mod = res[1], res[3], res[4]
        precision, recall, f1, mcc = res[6:10]

        # first line: the scores, identical in both modes
        print(f"{name_mod.ljust(25)} | precision     {precision:.4f} | recall     {recall:.4f} | f1     {f1:.4f}| mcc     {mcc:.4f}")

        # second line: the preprocessing, aligned under the model name
        details = f"{noshow.ljust(25)} | scaler {scaler}"
        if verbose:
            # res[3] is the representation method, so it is labelled as such
            details += f" | representation {rep_method}"
        print(details)
    

In [None]:


# Defining a list of scalers
# (every tuple is followed by a comma: without it Python tries to *call* the
# previous tuple with the next one and raises a TypeError)
scalers = [
    ('PowerTransformer', PowerTransformer()),
    ('MinMaxScaler', MinMaxScaler()),
    ('StandardScaler', StandardScaler()),
    ('TimeSeriesScalerMeanVariance', TimeSeriesScalerMeanVariance(mu=0, std=1)),
]

# Representation methods; each label states the n_segments actually used
representation_models = [
    ('PiecewiseAggregateApproximation_ns12', PiecewiseAggregateApproximation(n_segments=12)),
    ('PiecewiseAggregateApproximation_ns16', PiecewiseAggregateApproximation(n_segments=16)),
    ('SymbolicAggregateApproximation_ns10', SymbolicAggregateApproximation(n_segments=10, alphabet_size_avg=40)),
    ('SymbolicAggregateApproximation_ns32', SymbolicAggregateApproximation(n_segments=32, alphabet_size_avg=40)),
]

# Parameters of the SAX metric of the time-series k-NN
# (named so that the builtin `dict` is not shadowed)
sax_metric_params = {'n_segments' : 16 , 'alphabet_size_avg': 10}

#TO USE WITH THE REPRESENTATION METHODS
# Defining a list of classification models; each label states the parameter actually used
classification_models = [
    ('LogisticRegression', LogisticRegression(C = 0.01)),
    # the trees are seeded so that the scores are reproducible between runs
    ('DecisionTree_maxd10', DecisionTreeClassifier(max_depth = 10, random_state=0)),
    ('DecisionTree_minsl5', DecisionTreeClassifier(min_samples_leaf = 5, random_state=0)),
    ('DecisionTree_critgini', DecisionTreeClassifier(criterion = 'gini', random_state=0)),
    ('DecisionTree_critentropy', DecisionTreeClassifier(criterion = 'entropy', random_state=0)),
    ('GaussianNB', GaussianNB()),
    ('KNN_K1_wdist', KNeighborsClassifier(n_neighbors = 1,weights = 'distance')),
    ('KNN_K1_eu', KNeighborsClassifier(n_neighbors = 1, metric = 'euclidean')),
    # 'dtw' and 'sax' are tslearn metrics; scikit-learn's KNeighborsClassifier
    # does not provide them, so the tslearn classifier is used for these two.
    # NOTE(review): the SAX metric splits each series into n_segments=16 pieces;
    # it may not be applicable to series already reduced to fewer points by the
    # representation methods above, nor to data that is not z-normalized -
    # confirm, or drop this entry from the second run.
    ('KNN_K1_dtw', KNeighborsTimeSeriesClassifier(n_neighbors = 1, metric = 'dtw')),
    ('KNN_K1_sax', KNeighborsTimeSeriesClassifier(n_neighbors = 1, metric = 'sax',metric_params = sax_metric_params)),
    ('RandomForestClassifier_ne50',RandomForestClassifier(n_estimators=50, random_state=0)),
    ('RandomForestClassifier_ne10',RandomForestClassifier(n_estimators=10, random_state=0)),
    ('SVC_c50',SVC(C=50,gamma='auto')),
    ('SVC_c10',SVC(C=10,gamma='auto')),
]

VERBOSE = False #false will not present the representation methods
SHOW_MODEL = True #True to present/print the progress of the model performance

#First run will be just with the scalers and the classification models
# (the returned list is kept in a variable so it is not dumped as cell output)
results_scalers_only = test_models (scalers, representation_models, classification_models, X_train, y_train, X_test,y_test, VERBOSE, SHOW_MODEL)

VERBOSE = True

#Second run, for representation models
results_with_representation = test_models (scalers, representation_models, classification_models, X_train, y_train, X_test,y_test, VERBOSE, SHOW_MODEL)

In [None]:
#Question 2 of the project is most likely to be done using the info in the TP7 timeseries forecasting i believe 

#histograms in EDA its  NO GO, too many rows in the datasets
#Same for presenting the plot for outliers, FIND ANOTHER WAY, counting them its a solution

#TODO: find out whether the outliers of the test set also have to be checked
#TODO: verify whether a correlation matrix is possible or not; it sounds difficult as we don't have a target variable

#Scaling data might need to be done VERIFY  the 4 Scalers PowerTransformer, StandardScaler , MinMaxScaler and the last one is the normalizer i believe it might be useful as it works with rows check AA notes/slides, no need for imputation though
#in the Project statement when she says find the best classifier model, in the TP6 she only uses the KNeighborsClassifier CHECK if its possible to use other classification models or if its even needed to

#SEE IF SCALING IS NEEDED
