In [None]:
import tensorflow as tf
import numpy as np
import os
import random
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rc('font', size=16) 
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
import warnings
import logging


# Short aliases for the Keras namespaces used by the rest of the notebook.
tfk = tf.keras
tfkl = tfk.layers
print(tf.__version__)

In [None]:
# Seed every random number generator used below so that runs are repeatable.
seed = 42

# NOTE(review): setting PYTHONHASHSEED at runtime presumably only reaches child
# processes, not the interpreter that is already running - confirm if it matters.
os.environ['PYTHONHASHSEED'] = str(seed)

# Python, NumPy and TensorFlow generators all share the same seed value.
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)

In [None]:
# Load the samples and their class ids from the Kaggle input directory.
x_train_val, y_train_val = (
    np.load(npy_path)
    for npy_path in (
        '/kaggle/input/time-series/x_train.npy',
        '/kaggle/input/time-series/y_train.npy',
    )
)

In [None]:
# Report the shapes of the loaded arrays.
print(f"X_train shape {x_train_val.shape}")
print(f"Y_train shape {y_train_val.shape}")

In [None]:
# Class id -> class name; the id is the position of the name in this list.
labels = dict(enumerate([
    "Wish",
    "Another",
    "Comfortably",
    "Money",
    "Breathe",
    "Time",
    "Brain",
    "Echoes",
    "Wearing",
    "Sorrow",
    "Hey",
    "Shine",
]))

In [None]:
# UNDERSTANDING THE DATA : VISUALIZE

# At first glance, each sample is a time series of 36 points, and every time step
# holds one value for each of 6 different features; a sample is therefore a
# 36-point time series of 6 features.

def plot_example(random_index, x, y):
    """Plot every feature of one sample in its own subplot.

    Parameters
    ----------
    random_index : int
        Position of the sample to draw in ``x`` and ``y``.
    x : array-like
        Samples; ``x[random_index]`` must be 2-D, indexed as (time step, feature).
    y : array-like
        Class ids; ``y[random_index]`` is looked up in the global ``labels``
        dict to build the figure title.

    The grid has 3 columns and as many rows as needed, so 6 features give the
    same 2x3 / 20x10 figure as before while other feature counts no longer
    index outside the grid.
    """
    example = x[random_index]
    example_label = y[random_index]
    n_features = example.shape[1]

    n_cols = 3
    n_rows = max(1, -(-n_features // n_cols))  # ceiling division
    # squeeze=False keeps axs 2-D even when there is a single row.
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
    fig.suptitle('Category : ' + labels[example_label])
    for i in range(n_features):
        ax = axs[i // n_cols, i % n_cols]
        ax.set_title('Feature ° ' + str(i + 1))
        ax.plot(example[:, i])
    # Hide the grid cells that have no feature to show.
    for ax in axs.flat[n_features:]:
        ax.set_visible(False)




# Draw a random sample index. The exclusive upper bound is the number of samples,
# so every sample can be selected and the bound follows the data.
random_index = np.random.randint(0, len(x_train_val))
plot_example(random_index, x_train_val, y_train_val)


In [None]:
# CLASS REPARTITIONS



def class_rep(y_train_val):
    """Count the samples of each class, print the total and draw a bar chart.

    ``y_train_val`` is an iterable of class ids that are translated to names
    through the global ``labels`` dict. Returns a dict mapping each class name
    to its number of occurrences.
    """
    class_names = ("Wish", "Another", "Comfortably", "Money", "Breathe", "Time",
                   "Brain", "Echoes", "Wearing", "Sorrow", "Hey", "Shine")
    class_repartitions = dict.fromkeys(class_names, 0)

    for class_id in y_train_val:
        class_repartitions[labels[class_id]] += 1

    print("TOTAL : ", sum(class_repartitions.values()))
    plt.figure(figsize=(20,20))
    plt.bar(class_repartitions.keys(), class_repartitions.values(), color='g')
    return class_repartitions

# Count the samples per class for the labels loaded from y_train.npy and show the counts.
class_repartitions=class_rep(y_train_val)
class_repartitions
# ===> UNBALANCED DATASET

In [None]:
# Class loss weights (the dataset is strongly imbalanced), kept for optional later use.
# Each weight is (1 / class count) * (total samples / number of classes), so classes
# with fewer samples get larger weights.
n_total = x_train_val.shape[0]
n_classes = len(labels)

class_loss_weights = {
    class_number: (1 / class_repartitions[labels[class_number]]) * (n_total / n_classes)
    for class_number in range(n_classes)
}

class_loss_weights

In [None]:
# WORKING THE DATA IN AMOUNT

In [None]:
# PREPROCESSING

In [None]:
# Studying each feature/variable separately (statistics, distribution, boxplots ...)

def flatten(x_train_val, y_train_val):
    """Stack all time steps of all samples into one 2-D table.

    Parameters
    ----------
    x_train_val : array-like of shape (n_samples, n_steps, n_features)
        Samples to flatten; the number of steps and features is read from the
        array.
    y_train_val : sequence
        One class id per sample, indexable by sample position.

    Returns
    -------
    (numpy.ndarray, list)
        An array of shape (n_samples * n_steps, n_features) whose rows keep the
        original sample order and time order, and a list with the class id of
        each row (every sample's id repeated ``n_steps`` times).

    Raises
    ------
    ValueError
        If a non-empty input is not 3-D.
    """
    # Empty input: same empty result as a loop that never runs.
    if len(x_train_val) == 0:
        return np.array([]), []

    samples = np.asarray(x_train_val)
    if samples.ndim != 3:
        raise ValueError(
            "expected an array of shape (n_samples, n_steps, n_features), got %r"
            % (samples.shape,)
        )
    n_samples, n_steps, n_features = samples.shape

    # copy() guarantees the caller gets a fresh array, never a view of the input.
    x_train_val_flattened = samples.reshape(n_samples * n_steps, n_features).copy()
    y_train_val_flattened = [
        y_train_val[i] for i in range(n_samples) for _ in range(n_steps)
    ]
    return x_train_val_flattened, y_train_val_flattened

# One row per time step: feature columns 0-5, then the class id and class name.
x_train_val_flattened, y_train_val_flattened = flatten(x_train_val, y_train_val)
df = pd.DataFrame(x_train_val_flattened).assign(**{
    "class": y_train_val_flattened,
    "class_name": [labels[class_id] for class_id in y_train_val_flattened],
})

In [None]:
# Display the flattened frame built above (feature columns 0-5 plus "class" and "class_name").
df

In [None]:
# Summary statistics of the six feature columns.

df.loc[:, list(range(6))].describe()

In [None]:
# MinMaxScaling ===> not usable here because outliers squash almost all values into a very narrow interval.
# Taking outliers into account for scaling : ROBUST SCALING !!!

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit one RobustScaler on the flattened table (one row per time step), so every
# feature gets a single median (center_) and a single scale (scale_).
# NOTE(review): the scaler is fitted before the train/validation split, so the
# validation samples contribute to the statistics - confirm this is acceptable.
transformer = RobustScaler()
transformer.fit(x_train_val_flattened)
medians = transformer.center_
IQR = transformer.scale_
print("medians values : ", medians)
print("IQR ranges : ", IQR)

# Scale every sample in a single call: collapse (samples, steps) into rows,
# transform, then restore the original 3-D shape.
n_features = x_train_val.shape[-1]
x_train_val_scaled = transformer.transform(
    x_train_val.reshape(-1, n_features)
).reshape(x_train_val.shape)

print()
print("Verification on a sample: ")
print(" original : ", x_train_val[0][0])
print(" robust scaled : ", x_train_val_scaled[0][0])
print(" should be : ", ((x_train_val[0][0] - medians) / IQR).round(7))
print()

# Actually check the result against the closed form before claiming success.
assert np.allclose(
    x_train_val_scaled, (x_train_val - medians) / IQR, equal_nan=True
), "robust scaling does not match (x - median) / IQR"
print("Robust Scaling works well after verification.")

In [None]:
# Verification of the scaling impact on some examples

# The exclusive upper bound is the number of samples, so every sample can be selected.
random_index = np.random.randint(0, len(x_train_val))

print("ORIGINAL :")
plot_example(random_index, x_train_val, y_train_val)

In [None]:
# Same sample index as the "ORIGINAL" plot, drawn from the robust-scaled array for comparison.
print("SCALED :")
plot_example(random_index,x_train_val_scaled,y_train_val)

In [None]:
# SPLITTING

from sklearn.model_selection import train_test_split

# Hold out 10% of the scaled samples for validation. Stratifying on the labels
# keeps the class proportions the same in both splits.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val_scaled,
    y_train_val,
    test_size=0.1,
    random_state=seed,
    stratify=y_train_val,
)

In [None]:
# Multiplying the data with a sliding window (stride) over the whole dataset (by class and variable)

In [None]:
# Window length (in time steps) of the training sequences built below. It may be larger or
# smaller than the original sample length of 36; for the last run it was left unchanged.

window_size=36

In [None]:
# Recount the class distribution on the training split only; the stride computation below reads it.
class_repartitions=class_rep(y_train)
class_repartitions

In [None]:
# Choose one stride per class so the windowed dataset ends up roughly balanced:
# an under-represented class gets a small stride (many overlapping windows), an
# over-represented class gets a large stride (few windows).

# Reference point: the largest class windowed with a stride of half a window.
largest_class_count = max(class_repartitions.values())
reference_stride = window_size / 2
target_count = largest_class_count * window_size / reference_stride

MIN_STRIDE = 4

strides = {}
for class_id in range(12):
    class_count = class_repartitions[labels[class_id]]
    candidate = int(class_count * window_size / target_count) + 1
    # Move up to the next stride that divides the window evenly.
    while window_size % candidate:
        candidate += 1
    # Never go below the minimum stride.
    strides[class_id] = max(candidate, MIN_STRIDE)

strides

In [None]:
def build_sequences(df, window, stride):
    """Cut a single-class frame into fixed-length windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in time order with feature columns ``0`` to ``5`` and a ``'class'``
        column; the label of the first row is used for every window.
    window : int
        Number of consecutive rows in each window.
    stride : int
        Step between the starts of two consecutive windows; must be positive
        and divide ``window``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Windows of shape (n_windows, window, n_features) and their labels of
        shape (n_windows,). When the frame holds fewer than ``window`` rows the
        windows array has shape (0, window, n_features), so it can still be
        concatenated with non-empty results.

    Raises
    ------
    ValueError
        If ``stride`` is not positive (a negative stride would never terminate).
    AssertionError
        If ``window`` is not a multiple of ``stride``.
    """
    if stride <= 0:
        raise ValueError("stride must be a positive integer, got %r" % (stride,))
    # Sanity check to avoid runtime errors
    assert window % stride == 0

    # Take only meaningful features
    temp = df[[0,1,2,3,4,5]].values

    starts = range(0, len(temp) - window + 1, stride)
    if len(starts) == 0:
        # Nothing to cut (empty or too-short frame): no label is read.
        return np.empty((0, window, temp.shape[1]), dtype=temp.dtype), np.array([])

    # Every window of a single-class frame carries the label of its first row.
    label = df['class'].values[0]
    dataset = np.array([temp[start:start + window] for start in starts])
    window_labels = np.array([label] * len(starts))
    return dataset, window_labels

In [None]:
# In order to build the new concatenated series properly, we first "flatten" the dataset to get one big concatenated time series for each
# feature (size 2429*36), keeping the class attached to each point. All of that is stored in a dataframe df, whose rows preserve the order of
# the "big" time series points (so we don't "destroy" anything). Then we split df into several datasets by filtering on the class, so that
# each new dataset contains a single class, and on each of those datasets we apply "build_sequences", which finally gives us the new
# training samples.

# Flatten the training split: one row per time step, with its class attached.
x_train_flattened, y_train_flattened = flatten(x_train, y_train)
df = pd.DataFrame(x_train_flattened)
df["class"] = y_train_flattened
df["class_name"] = [labels[y] for y in y_train_flattened]

# Window each class separately with its own stride. The parts are collected in
# lists and concatenated once, so every class (class 0 included) is added
# exactly one time.
X_parts, Y_parts = [], []
for c in sorted(strides):
    df_c = df[df["class"] == c]
    X_c, Y_c = build_sequences(df_c, window=window_size, stride=strides[c])
    # Printed next to the shape expected from the window/stride arithmetic.
    print (X_c.shape, " = ? ", (int((len(df_c)-window_size)/strides[c]) + 1 ,window_size,6)," ; ",Y_c.shape)
    X_parts.append(X_c)
    Y_parts.append(Y_c)

X_train = np.concatenate(X_parts)
Y_train = np.concatenate(Y_parts)

print()
print (X_train.shape," ; ",Y_train.shape)

In [None]:
# We check that the process went correctly: if it did, the features in the first rows of the new training dataset X_train should
# appear progressively shifted.

# Plot the first four windows of the new training set.
for window_index in range(4):
    plot_example(window_index, X_train, Y_train)

In [None]:
# Class distribution of the windowed training set.
class_rep(Y_train)

In [None]:
# Encoding target variable
# num_classes is pinned to the size of the label map so both one-hot matrices
# have the same width even when a split does not contain the highest class id.

Y_train_categorical = tfk.utils.to_categorical(Y_train, num_classes=len(labels))
y_val_categorical = tfk.utils.to_categorical(y_val, num_classes=len(labels))


print(Y_train_categorical.shape, y_val_categorical.shape)

In [None]:
# MODEL BUILDING : WE FIRST TRY A "1D VGG"

In [None]:
# Per-sample input shape and number of one-hot classes, both read from the prepared arrays.
input_shape = X_train.shape[1:]
classes = Y_train_categorical.shape[-1]
# NOTE(review): batch_size is reassigned to 64 in the training cells, so this value is not the one passed to fit().
batch_size = 128
epochs = 200

In [None]:
def build_VGG(input_shape, classes=12, learning_rate=0.001):
    """Build and compile a VGG-style 1-D convolutional classifier.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, passed to the first Conv1D layer.
    classes : int, optional
        Number of units of the softmax output layer (default 12).
    learning_rate : float, optional
        Learning rate given to the Adam optimizer (default 0.001).

    Returns
    -------
    A compiled ``tfk.Sequential`` model using categorical cross-entropy and the
    accuracy metric.

    The feature extractor is five blocks of "same"-padded Conv1D layers
    (64x2, 128x2, 256x3, 512x3, 512x3 filters), each block followed by a
    max-pooling of size 2, then global average pooling and a
    Dropout / Dense(512) / Dropout head.
    """
    vgg = tfk.Sequential()

    # (number of filters, number of stacked convolutions) for each block.
    conv_blocks = [(64, 2), (128, 2), (256, 3), (512, 3), (512, 3)]
    is_first_layer = True
    for filters, n_convs in conv_blocks:
        for _ in range(n_convs):
            conv_kwargs = dict(filters=filters, kernel_size=3, padding="same", activation="relu")
            if is_first_layer:
                # Only the very first layer declares the input shape.
                conv_kwargs["input_shape"] = input_shape
                is_first_layer = False
            vgg.add(tfkl.Conv1D(**conv_kwargs))
        vgg.add(tfkl.MaxPooling1D(pool_size=2, strides=2))

    vgg.add(tfkl.GlobalAveragePooling1D())
    vgg.add(tfkl.Dropout(0.4))
    vgg.add(tfkl.Dense(512, activation='relu'))
    vgg.add(tfkl.Dropout(0.4))
    vgg.add(tfkl.Dense(classes, activation='softmax'))

    vgg.compile(
        optimizer=tfk.optimizers.Adam(learning_rate=learning_rate),
        loss=tfk.losses.CategoricalCrossentropy(),
        metrics=['accuracy']
    )
    return vgg

In [None]:
# Instantiate the 1D VGG for the prepared input shape and print its layer summary.
model = build_VGG(input_shape)
model.summary()

In [None]:
# TRAINING PHASE

In [None]:
batch_size = 64

# Both callbacks watch the validation accuracy: the learning rate is halved
# after 20 epochs without improvement, and training stops after 30 such epochs,
# restoring the best weights seen.
training_callbacks = [
    tfk.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', patience=30, restore_best_weights=True),
    tfk.callbacks.ReduceLROnPlateau(monitor='val_accuracy', mode='max', patience=20, factor=0.5, min_lr=1e-5),
]

# Train the model and keep only the per-epoch metrics dictionary.
fit_result = model.fit(
    x=X_train,
    y=Y_train_categorical,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val_categorical),
    callbacks=training_callbacks,
)
history = fit_result.history

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix and classification metrics of the first model on the validation split.
Y_pred = model.predict(x_val)
y_pred = np.argmax(Y_pred, axis=1)
y_true = np.argmax(y_val_categorical, axis=-1)

# Fix the class order explicitly so the matrix always has one row/column per
# class and the tick labels are the class names, not the keys of the label dict.
class_ids = sorted(labels)
cm = confusion_matrix(y_val, y_pred, labels=class_ids)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[labels[c] for c in class_ids])
disp.plot(cmap=plt.cm.Blues, xticks_rotation='vertical')

# Compute the classification metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
f1 = f1_score(y_true, y_pred, average='macro')
# round() works whether the score comes back as a NumPy scalar or a plain float.
print('Accuracy:', round(accuracy, 4))
print('Precision:', round(precision, 4))
print('Recall:', round(recall, 4))
print('F1:', round(f1, 4))



In [None]:
import shutil

# Save the trained model under 'vgg_1D', then zip the './vgg_1D' path into vgg_1D.zip.
# NOTE(review): assumes model.save() writes a directory at that path - confirm for the Keras version in use.
model.save('vgg_1D')
shutil.make_archive("vgg_1D", 'zip', './vgg_1D')

In [None]:
# SECOND MODEL : A MORE BASIC ONE

In [None]:
def build_1DCNN_classifier(input_shape, classes):
    """Build and compile a small 1-D convolutional classifier.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, given to the input layer.
    classes : int
        Number of units of the softmax output layer.

    Returns
    -------
    A compiled ``tfk.Model`` named ``'model'`` using categorical cross-entropy,
    the Adam optimizer and the accuracy metric.
    """
    # Build the neural network layer by layer
    input_layer = tfkl.Input(shape=input_shape, name='Input')

    # Feature extractor: two convolutions with a pooling in between, then an
    # average over the time axis.
    cnn = tfkl.Conv1D(256, 3, padding='same', activation='relu')(input_layer)
    cnn = tfkl.MaxPooling1D()(cnn)
    cnn = tfkl.Conv1D(256, 3, padding='same', activation='relu')(cnn)
    gap = tfkl.GlobalAveragePooling1D()(cnn)
    dropout1 = tfkl.Dropout(0.5, seed=seed)(gap)

    # Classifier
    classifier = tfkl.Dense(128, activation='relu')(dropout1)
    dropout2 = tfkl.Dropout(0.2, seed=seed)(classifier)
    output_layer = tfkl.Dense(classes, activation='softmax')(dropout2)

    # Connect input and output through the Model class
    model = tfk.Model(inputs=input_layer, outputs=output_layer, name='model')

    # Compile the model; metrics is a list, matching build_VGG.
    model.compile(
        loss=tfk.losses.CategoricalCrossentropy(),
        optimizer=tfk.optimizers.Adam(),
        metrics=['accuracy'],
    )

    # Return the model
    return model

In [None]:
# Instantiate the basic 1D CNN for the prepared input shape and class count, then print its summary.
model2 = build_1DCNN_classifier(input_shape,classes)
model2.summary()

In [None]:
batch_size = 64

# Build validation inputs whose length matches the model input. Samples are
# repeated along the time axis when the model expects a multiple of their
# length; one repetition leaves them unchanged. Doing this here keeps the cell
# runnable top to bottom, without relying on arrays defined in later cells.
model2_steps = model2.input_shape[1]
repeats, leftover = divmod(model2_steps, x_val.shape[1])
if repeats < 1 or leftover != 0:
    raise ValueError(
        "model2 expects %d time steps, which is not a multiple of the %d steps of x_val"
        % (model2_steps, x_val.shape[1])
    )
x_val_model2 = np.tile(x_val, (1, repeats, 1))

history2 = model2.fit(
    x=X_train,
    y=Y_train_categorical,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val_model2, y_val_categorical),
    callbacks=[
        tfk.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', patience=50, restore_best_weights=True),
        tfk.callbacks.ReduceLROnPlateau(monitor='val_accuracy', mode='max', patience=30, factor=0.5, min_lr=1e-5)
    ]
).history

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix and classification metrics of the second model on the validation split.

# Validation inputs repeated along the time axis to the length model2 expects
# (one repetition leaves them unchanged).
model2_steps = model2.input_shape[1]
repeats, leftover = divmod(model2_steps, x_val.shape[1])
if repeats < 1 or leftover != 0:
    raise ValueError(
        "model2 expects %d time steps, which is not a multiple of the %d steps of x_val"
        % (model2_steps, x_val.shape[1])
    )

Y_pred = model2.predict(np.tile(x_val, (1, repeats, 1)))
y_pred = np.argmax(Y_pred, axis=1)
y_true = np.argmax(y_val_categorical, axis=-1)

# Fix the class order explicitly so the matrix always has one row/column per
# class and the tick labels are the class names, not the keys of the label dict.
class_ids = sorted(labels)
cm = confusion_matrix(y_val, y_pred, labels=class_ids)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[labels[c] for c in class_ids])
disp.plot(cmap=plt.cm.Blues, xticks_rotation='vertical')

# Compute the classification metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
f1 = f1_score(y_true, y_pred, average='macro')
# round() works whether the score comes back as a NumPy scalar or a plain float.
print('Accuracy:', round(accuracy, 4))
print('Precision:', round(precision, 4))
print('Recall:', round(recall, 4))
print('F1:', round(f1, 4))



In [None]:
import shutil 

# Save the second model under 'conv_1D_72_samples', then zip the './conv_1D_72_samples' path.
# NOTE(review): the name mentions 72 samples while window_size is set to 36 above - confirm the intended name.
model2.save('conv_1D_72_samples')
shutil.make_archive('conv_1D_72_samples', 'zip', './conv_1D_72_samples')

In [None]:
# Methods to extend x_val/x_test samples when window_size differs from 36. Since 72 was the only other size we tried (at the very end of
# the project), the methods below only work for reshaping x_val/x_test samples to (72,6). They could easily be parametrized for any
# given size (by adding "given size - 36" values each time); we did not do so because it was at the very end of the challenge.


# Completing a sample to twice its length with its per-feature mean values (returns only the part to add)
def padding(array):
    """Return a block with as many rows as ``array``, each row holding the column means.

    ``array`` must be 2-D, indexed as (time step, feature).
    """
    n_rows, n_features = array.shape[0], array.shape[1]
    column_means = [np.mean(array[:, feature]) for feature in range(n_features)]
    return np.array([column_means for _ in range(n_rows)])
        

# Completing a sample to twice its length by "mirroring" it (returns only the part to add)
def reverse(array):
    """Return a new array holding the rows of ``array`` in reverse order."""
    n_rows = array.shape[0]
    return np.array([array[row] for row in range(n_rows - 1, -1, -1)])


# Small worked example: extend a (2,2) array to (4,2) with each of the three methods.
array = np.array([[1, 2], [3, 4]])
padded = np.concatenate((array, padding(array)))
duplicated = np.concatenate((array, array))
mirrored = np.concatenate((array, reverse(array)))

print("(2,2)-->(4,2) : ")
print()
print(" transfo0 (padding) : ", padded)
print()
print(" transfo1 (duplicate) : ", duplicated)
print()
print(" transfo2 (warp) : ", mirrored)



In [None]:
# CREATING DIFFERENT RESHAPED VALIDATION DATASETS WHICH CAN ALL BE USED FOR THE TRAINING OF THE MODELS. THEN WE JUST HAVE
# TO SELECT THE MOST PROMISING METHOD BETWEEN COPYING, REVERSE AND PADDING.

In [None]:

# Validation set extended with the per-feature mean of each sample (padding).
sh = x_val.shape
print(sh, y_val.shape)

x_val1 = np.array([
    np.concatenate((sample, padding(sample))) for sample in x_val
])
x_val1.shape

In [None]:
# Validation set extended by appending a copy of each sample to itself (duplicate).
sh = x_val.shape
print(sh, y_val.shape)

x_val2 = np.array([
    np.concatenate((sample, sample)) for sample in x_val
])
x_val2.shape
    



In [None]:
# First time step of the first sample vs. time step 36, which is the start of the appended copy
# when validation samples have 36 steps (TODO confirm); the two rows are then expected to match.
print(x_val2[0][0], x_val2[0][36])

In [None]:
# Validation set extended by appending the time-reversed sample (mirror).
sh = x_val.shape
print(sh, y_val.shape)

x_val3 = np.array([
    np.concatenate((sample, reverse(sample))) for sample in x_val
])
x_val3.shape
    

In [None]:
# Time steps 35 and 36 of the first sample: with 36-step validation samples (TODO confirm) these are
# the last original step and the first mirrored step, which are then expected to match.
print(x_val3[0][35], x_val3[0][36])

In [None]:
# Testing the model on a given validation dataset (which must be a reshaped one if the chosen window size differs from 36).

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix and classification metrics of the second model.
# Use the reshaped validation set only when its length is what the model
# expects; otherwise fall back to the original validation samples.
x_eval = x_val2 if x_val2.shape[1] == model2.input_shape[1] else x_val

Y_pred = model2.predict(x_eval)
y_pred = np.argmax(Y_pred, axis=1)
y_true = np.argmax(y_val_categorical, axis=-1)

# Fix the class order explicitly so the matrix always has one row/column per
# class and the tick labels are the class names, not the keys of the label dict.
class_ids = sorted(labels)
cm = confusion_matrix(y_val, y_pred, labels=class_ids)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[labels[c] for c in class_ids])
disp.plot(cmap=plt.cm.Blues, xticks_rotation='vertical')

# Compute the classification metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
f1 = f1_score(y_true, y_pred, average='macro')
# round() works whether the score comes back as a NumPy scalar or a plain float.
print('Accuracy:', round(accuracy, 4))
print('Precision:', round(precision, 4))
print('Recall:', round(recall, 4))
print('F1:', round(f1, 4))

