In [1]:
from os.path import join

from pandas import read_csv

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

from sklearn.metrics import f1_score

In [2]:
from numpy import zeros, unique

data_dir = "/home/benoit/data/dreem_challenge/"
labels_path = "/home/benoit/data/dreem_challenge/y_train.csv"

labels = read_csv(labels_path)

In [3]:
import h5py
from numpy import unique

d = h5py.File(join(data_dir, 'X_train', 'X_train.h5'), 'r')

subject = d['index'][()]

In [None]:
from tools.processing import load_features, merge_features

features_1 = load_features(join(data_dir, "computed", 'base_1_train.csv'))
features_2 = load_features(join(data_dir, "computed", 'base_2_train.csv'))
features_3 = load_features(join(data_dir, "computed", 'base_3_train.csv'))
features_4 = load_features(join(data_dir, "computed", 'base_4_train.csv'))
features_5 = load_features(join(data_dir, "computed", 'base_5_train.csv'))
features_6 = load_features(join(data_dir, "computed", 'base_6_train.csv'))
features_7 = load_features(join(data_dir, "computed", 'base_7_train.csv'))
features_8 = load_features(join(data_dir, "computed", 'base_8_train.csv'))
features_9 = load_features(join(data_dir, "computed", 'base_9_train.csv'))

f_train = merge_features([features_1, features_2, features_3, features_4, features_5, features_6, features_7])
y = labels['sleep_stage']
f_train = f_train.drop(['abs_index'], axis=1) 

In [5]:
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(f_train, y, test_size = 0.3)

# Features preprocessing (noramlization etc...)

scaler = StandardScaler().fit(f_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
from numpy import array, invert

features_names = array(f_train.keys())

In [7]:
# Feature Selection

X_train_bis = X_train

# Removing features with low variance
from sklearn.feature_selection import VarianceThreshold

v = 0

slct_low_variance = VarianceThreshold(threshold=v)
X_train_bis = slct_low_variance.fit_transform(X_train_bis)

low_var = slct_low_variance.get_support()

print(features_names[invert(low_var)])
features_names = features_names[low_var]

['eeg_1_band_gamma' 'eeg_2_band_gamma' 'eeg_3_band_gamma'
 'eeg_4_band_gamma' 'eeg_5_band_gamma' 'eeg_6_band_gamma'
 'eeg_7_band_gamma']


In [8]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer

scorer_f1 = make_scorer(f1_score, average="weighted", verbose=1)

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False)

rec_slct = RFECV(lsvc, scoring = scorer_f1, verbose=2)

X_train_bis = rec_slct.fit_transform(X_train_bis, y_train)

l1_rec = rec_slct.get_support()

print(features_names[l1_rec])

In [19]:
X_train_bis = rec_slct.transform(X_train_bis)

In [9]:
X_train = X_train_bis
X_test = slct_low_variance.transform(X_test)
X_test = rec_slct.transform(X_test)

In [34]:
# Model initialization

from sklearn.ensemble import AdaBoostClassifier

# clf = RandomForestClassifier(n_estimators=100, 
#                             criterion='gini', 
#                             max_depth=None, 
#                             min_samples_leaf=0.0001, 
#                             class_weight="balanced",
#                             bootstrap = False)

from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=1)

# clf = KNeighborsClassifier(n_neighbors = 15)

# clf= SVC(kernel = 'linear', C = 0.01, class_weight=None)

# clf = GaussianNB()

In [35]:
# Hyperparameter tuning (k-fold etc...)
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

scorer_f1 = make_scorer(f1_score, average="weighted")

param_grid_svm = [
  {"C": [0.001, 0.01, 0.1], "kernel":["linear", "poly", "rbf", "sigmoid"], "class_weight":[None, "balanced"]}
 ]

tuner = GridSearchCV(clf, param_grid, scorer_f1,  verbose=2)

res = tuner.fit(X_train, y_train)
print(res)

In [31]:
res.best_params_
clf = res.best_estimator_

In [None]:
# Model training

clf.fit(X_train, y_train)

In [37]:
# Model verifications

predictions_train = clf.predict(X_train)
print(f"Training score - {f1_score(predictions_train, y_train, average='weighted')}")

predictions_test = clf.predict(X_test)
print(f"Testing score - {f1_score(predictions_test, y_test, average='weighted')}")

Training score - 0.7722010076447845
Testing score - 0.6712892631167978


In [None]:
# Analyse des résultats 
from sklearn.metrics import confusion_matrix
from seaborn import heatmap
from matplotlib.pyplot import show

cm = confusion_matrix(y_test, predictions_test, normalize="true")
ax = heatmap(cm, annot=True)
show()

In [51]:
from collections import Counter

print(Counter(y_test))
print(Counter(predictions_test))

Counter({2: 2833, 3: 1552, 4: 1481, 0: 1062, 1: 479})
Counter({2: 3286, 3: 1540, 4: 1325, 0: 1137, 1: 119})


In [52]:
# Loading evaluation data

f_1_e = load_features(join(data_dir, "computed", 'base_1_eval.csv'))
f_2_e = load_features(join(data_dir, "computed", 'base_2_eval.csv'))
f_3_e = load_features(join(data_dir, "computed", 'base_3_eval.csv'))
f_4_e = load_features(join(data_dir, "computed", 'base_4_eval.csv'))
f_5_e = load_features(join(data_dir, "computed", 'base_5_eval.csv'))
f_6_e = load_features(join(data_dir, "computed", 'base_6_eval.csv'))
f_7_e = load_features(join(data_dir, "computed", 'base_7_eval.csv'))
f_8_e = load_features(join(data_dir, "computed", 'base_8_eval.csv'))
f_9_e = load_features(join(data_dir, "computed", 'base_9_eval.csv'))

f_eval = merge_features([f_1_e, f_2_e, f_3_e, f_4_e, f_5_e, f_6_e, f_7_e])

f_eval = f_eval.drop(['abs_index'], axis=1)

# Features preprocessing & Feature selection (both must be applied as for training features)

X_eval = scaler.transform(f_eval)

X_eval = slct_low_variance.transform(X_eval)
X_eval = rec_slct.transform(X_eval)

In [53]:
# Making prediction on eval data

eval_pred = clf.predict(X_eval)

sub = read_csv(join(data_dir, 'sample_submission.csv'))
sub['sleep_stage'] = eval_pred

sub.to_csv("output/submission.csv", index=False)