Importing Required Libraries

In [1]:
import os
import pandas as pd
import numpy as np
import csv
import warnings
import lightgbm as lgb
from datetime import datetime
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from tsfresh import extract_features
from tsfresh.feature_extraction.settings import EfficientFCParameters

_RANDOM_SEED = 1

Reading the DataSet

In [2]:
directory_activity = "archive\\activity_data"
directory_hrv = "archive\\hrv_data"


patient_info = pd.read_csv("archive\\patient_info.csv", sep=';')
cpt_II = pd.read_csv("archive\\CPT_II_ConnersContinuousPerformanceTest.csv", sep=';')
patient_info['ID'] = patient_info['ID'].astype(str)
cpt_II['ID'] = cpt_II['ID'].astype(str)
patient_info.set_index('ID', inplace=True)
cpt_II.set_index('ID', inplace=True)

hyperaktiv = patient_info.join(cpt_II, how='left')
hyperaktiv.dropna(subset=['ACC_TIME', 'HRV_TIME'], inplace=True, how='all')

Splitting

In [3]:
temp = hyperaktiv.copy()
y = temp['ADHD']
temp.drop(columns=['ADHD'], inplace=True)
x = temp.copy()

# x = x.values
# y = y.values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=_RANDOM_SEED)

Feature Engineering

In [4]:
def read_activity_file(filepath, patient_id):
    data = []
    with open(filepath) as f:
        csv_reader = csv.reader(f, delimiter=";")
        next(csv_reader)
        for line in csv_reader:
            data.append([datetime.strptime(line[0], "%m-%d-%Y %H:%M").timestamp(),
                         int(line[1].split(" ")[0])])
    data = pd.DataFrame(data, columns=["TIME", "ACC"])
    data["ID"] = patient_id
    return data

def read_hrv_file(filepath, patient_id):
    data = []
    with open(filepath) as f:
        csv_reader = csv.reader(f, delimiter=";")
        next(csv_reader)
        for line in csv_reader:
            data.append([datetime.strptime(line[0], "%Y-%m-%d %H:%M:%S.%f").timestamp(),
                         float(line[1].split(" ")[0])])

    data = pd.DataFrame(data, columns=["TIME", "HRV"])
    data["ID"] = patient_id
    return data

def create_feature_ACC(row):
    patient_id = int(row.name)
    print("activity number ", patient_id)
    filepath = f"archive\\activity_data\\patient_activity_{patient_id:02d}.csv"

    if not os.path.exists(filepath):
        filepath = "archive\\activity_data\\patient_activity_26.csv"
        activity_data = read_activity_file(filepath, patient_id)
        print(f"patient_activity_{patient_id}.csv is missing")
        features = extract_features(activity_data, column_id="ID", column_value="ACC", column_sort="TIME",
                                n_jobs=0, show_warnings=False,
                                default_fc_parameters=EfficientFCParameters())
        # print(f"inserted NaN instead of patient_activity_{patient_id}.csv")
        features[:] = np.nan
        return features

    activity_data = read_activity_file(filepath, patient_id)
    features = extract_features(activity_data, column_id="ID", column_value="ACC", column_sort="TIME",
                                n_jobs=0, show_warnings=False,
                                default_fc_parameters=EfficientFCParameters())
    # features.to_csv(feature_filepath, index=False, sep=";")
    return features

def create_feature_HRV(row):
    patient_id = int(row.name)
    print("hrv number ", patient_id)
    filepath = f"archive\\hrv_data\\patient_hr_{patient_id}.csv"

    if not os.path.exists(filepath):
        filepath = "archive\\hrv_data\\patient_hr_24.csv"
        hrv_data = read_hrv_file(filepath, patient_id)

        print(f"patient_hr_{patient_id}.csv is missing")
        features = extract_features(hrv_data, column_id="ID", column_value="HRV", column_sort="TIME",
                                n_jobs=0, show_warnings=False,
                                default_fc_parameters=EfficientFCParameters())
        # print(f"inserted NaN instead of patient_hr_{patient_id}.csv")

        features[:] = np.nan
        return features

    hrv_data = read_hrv_file(filepath, patient_id)
    features = extract_features(hrv_data, column_id="ID", column_value="HRV", column_sort="TIME",
                                n_jobs=0, show_warnings=False,
                                default_fc_parameters=EfficientFCParameters())
    # features.to_csv(feature_filepath, index=False, sep=";")
    return features

def create_feature(row):
    features_ACC = create_feature_ACC(row)
    features_HRV = create_feature_HRV(row)

    features = pd.concat([features_ACC, features_HRV], axis=1)
    # features = features_ACC
    for feature_name in features.columns:
        row[feature_name] = features[feature_name].values[0]
    return row



directory_train = f"archive_edited\\seed_{_RANDOM_SEED}\\x_train"
directory_test = f"archive_edited\\seed_{_RANDOM_SEED}\\x_test"

if not (os.path.exists(directory_train) and os.path.exists(directory_test)):
    # if not os.path.exists(directory_train):
    #   os.makedirs(directory_train)

    # if not os.path.exists(directory_test):
    #   os.makedirs(directory_test)

    x_train = x_train.apply(create_feature, axis=1)
    x_test = x_test.apply(create_feature, axis=1)

    x_train.to_csv(os.path.join(directory_train, "x_train.csv"))
    x_test.to_csv(os.path.join(directory_test, "x_test.csv"))
else:
    x_train = pd.read_csv(os.path.join(directory_train, "x_train.csv"))
    x_test  = pd.read_csv(os.path.join(directory_test, "x_test.csv"))

PreProcessing

In [5]:
features_to_drop = ["ACC__friedrich_coefficients__coeff_0__m_3__r_30",
                    "ACC__friedrich_coefficients__coeff_1__m_3__r_30",
                    "ACC__friedrich_coefficients__coeff_2__m_3__r_30",
                    "ACC__friedrich_coefficients__coeff_3__m_3__r_30",
                    "ACC__max_langevin_fixed_point__m_3__r_30",
                    "ACC__query_similarity_count__query_None__threshold_0.0",
                    "HRV__friedrich_coefficients__coeff_0__m_3__r_30",
                    "HRV__friedrich_coefficients__coeff_1__m_3__r_30",
                    "HRV__friedrich_coefficients__coeff_2__m_3__r_30",
                    "HRV__friedrich_coefficients__coeff_3__m_3__r_30",
                    "HRV__max_langevin_fixed_point__m_3__r_30",
                    "HRV__query_similarity_count__query_None__threshold_0.0",
                    'ACC_TIME',
                    'HRV_TIME',
                    'CPT_II', ]
x_train.drop(columns=features_to_drop, inplace=True)
x_test.drop(columns=features_to_drop, inplace=True)
prep_pipeLine = Pipeline([
    ("imputing", KNNImputer(n_neighbors=40)),
    ("feature_selecting", SelectKBest(k=5)),
    ("scaling", MinMaxScaler())
])

Results

In [6]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=_RANDOM_SEED)
for weight in [1.22]:

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    roc_scores = []

    for train_index, val_index in skf.split(x_train, y_train):
        X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        weights = np.where(y_train_fold == 0, 1.0, weight)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            warnings.simplefilter("ignore", category=RuntimeWarning)
            prep_pipeLine.fit(X_train_fold, y_train_fold)
        X_train_fold = prep_pipeLine.transform(X_train_fold)
        X_val_fold = prep_pipeLine.transform(X_val_fold)
        x_test_fold = prep_pipeLine.transform(x_test)

        train_data = lgb.Dataset(X_train_fold, label=y_train_fold, weight=weights)
        val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)
        test_data = lgb.Dataset(x_test_fold, label=y_test, reference=train_data)

        params = {
                    'verbose': -3,
                    'max_bin': 4096,
                    'extra_trees': True,
                    'min_data_in_leaf': 6,
                    'feature_fraction': 0.6,
                    'learning_rate': 0.03,
                    'num_leaves': 128,
                 }
        num_round = 100
        lgb_model = lgb.train(params, train_data, num_boost_round=num_round)

        y_pred_proba = lgb_model.predict(x_test_fold)
        y_pred = [1 if p > 0.5 else 0 for p in y_pred_proba]

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        roc_auc = auc(fpr, tpr)

        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        roc_scores.append(roc_auc)

    average_accuracy = np.mean(accuracy_scores)
    average_precision = np.mean(precision_scores)
    average_recall = np.mean(recall_scores)
    average_f1 = np.mean(f1_scores)
    average_roc = np.mean(roc_scores)

    print(f'Average Accuracy across 10 folds for weight:{weight}: {average_accuracy:.8f}')
    print(f'Average Precision across 10 folds for weight:{weight}: {average_precision:.8f}')
    print(f'Average Recall across 10 folds for weight:{weight}: {average_recall:.8f}')
    print(f'Average F1-score across 10 folds for weight:{weight}: {average_f1:.8f}')
    print(f'Average ROC-score across 10 folds for weight:{weight}: {average_roc:.8f}')
    print("###########################################################################################\n")

Average Accuracy across 10 folds for weight:1.22: 0.94000000
Average Precision across 10 folds for weight:1.22: 0.90000000
Average Recall across 10 folds for weight:1.22: 0.99000000
Average F1-score across 10 folds for weight:1.22: 0.94285714
Average ROC-score across 10 folds for weight:1.22: 0.94000000
###########################################################################################

