In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy.random import RandomState
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils import shuffle

In [None]:
# Location of the combined feature CSV. The default is the original
# hard-coded path; set the PLAY_FEATURES_CSV environment variable to run the
# notebook on another machine without editing this cell.
FEATURES_CSV = os.environ.get(
    'PLAY_FEATURES_CSV',
    '/Users/andrei-macpro/Documents/Data/tracking/features/play/combined_features.csv',
)

# The first CSV column is used as the DataFrame index.
df = pd.read_csv(FEATURES_CSV, index_col=0)

In [None]:

# Columns removed before modelling; every other column of df is kept.
excluded_columns = ['Age', 'DAI', 'Rinab', 'IQ_T2', 'duration_meal', 'duration_play', 'Gender']
df = df.drop(excluded_columns, axis=1)

# Display the remaining columns.
df


## Classification without a group column: samples from the same participant are averaged instead



In [None]:
# Encode the string class labels as integers: 'no_rad' -> 0, 'rad' -> 1.
# Series.map yields NaN for any value that is not a key of the mapping.
label_to_code = {'no_rad': 0, 'rad': 1}
df['label'] = df['label'].map(label_to_code)

In [None]:
# Display the frame to inspect it after the label column was mapped to 0/1.
df

In [None]:

# Move the current index into a regular column. The next step reads it as
# df['s_id'], so the index is expected to be named 's_id' at this point.
df = df.reset_index()

# Group key = the part of s_id before the first underscore.
# NOTE(review): presumably this prefix identifies the participant - confirm.
df['group'] = df['s_id'].str.split('_').str[0]

# Per-group mean of the numeric columns. numeric_only=True skips the string
# 's_id' column; without it pandas >= 2.0 raises a TypeError instead of
# silently dropping non-numeric columns as older versions did.
df_grouped = df.groupby('group').mean(numeric_only=True)

# This only renames the (integer) index to 's_id'; it does not move the
# 's_id' column back into the index.
df.index.name = 's_id'

In [None]:
# Collapse df to one row per 'group' value by averaging the numeric columns.
# numeric_only=True keeps the string 's_id' column out of the aggregation;
# pandas >= 2.0 raises a TypeError on it otherwise.
df = df.groupby('group').mean(numeric_only=True)

In [None]:
# Display the frame after the per-group averaging (index = 'group' values).
df

In [None]:
# Turn the numeric label back into its class name: 0 -> 'no_rad', 1 -> 'rad'.
# Any other value (for example a fractional mean) is not a key and maps to NaN.
code_to_label = {0: 'no_rad', 1: 'rad'}
df['label'] = df['label'].map(code_to_label)

In [None]:
# Label the index 's_id'; only the name changes, the index values stay as they are.
df.index = df.index.rename('s_id')

In [None]:
# Grid search on df with plain K-fold cross-validation, repeated for 10 seeds.
# Writes one row per (seed, classifier) to results_play.csv.

# Features are every column except the target.
X = df.drop('label', axis=1)
y = df['label']

# (pipeline step name, estimator, parameter grid). Grid keys carry the step
# name as prefix so GridSearchCV routes them to the right pipeline step.
classifiers = [
    ('dummy', DummyClassifier(strategy='most_frequent'), {}),
    ('lr', LogisticRegression(), {'lr__C': [0.01, 0.1, 1, 10, 100], 'lr__penalty': ['l1', 'l2'], 'lr__solver': ['liblinear', 'saga']}),
    ('svc_linear', SVC(kernel='linear'), {'svc_linear__C': [0.01, 0.1, 1, 10, 100]}),
    ('svc_rbf', SVC(kernel='rbf'), {'svc_rbf__C': [0.01, 0.1, 1, 10, 100], 'svc_rbf__gamma': [0.01, 0.1, 1, 10, 100]}),
    ('rf', RandomForestClassifier(), {'rf__n_estimators': [10, 50, 100, 200], 'rf__max_depth': [None, 5, 10, 15], 'rf__min_samples_split': [2, 5, 10]})
]

# Macro-averaged metrics reported for the selected model of every run.
cv_metrics = ['f1_macro', 'precision_macro', 'recall_macro']

results = []

# Repeat the whole search with 10 different seeds.
for rs in range(10):
    # Shuffled 5-fold split; the seed fixes the fold assignment for this run.
    gkf = KFold(n_splits=5, shuffle=True, random_state=rs)

    for name, classifier, params in classifiers:
        # Seed the estimator too when it exposes a random_state parameter.
        if 'random_state' in classifier.get_params():
            classifier.set_params(random_state=rs)

        # Scaling sits inside the pipeline so it is re-fitted on each training fold.
        pipeline = Pipeline([('scaler', StandardScaler()), (name, classifier)])
        grid_search = GridSearchCV(pipeline, params, cv=gkf)
        grid_search.fit(X, y)

        # Score the selected configuration once for all three metrics instead
        # of refitting the same folds separately per metric.
        # NOTE(review): the estimator is re-scored on the same folds that were
        # used to pick its hyperparameters, so these means are likely
        # optimistic; nested cross-validation would avoid that.
        cv_scores = cross_validate(grid_search.best_estimator_, X, y, cv=gkf, scoring=cv_metrics)
        f1_scores = cv_scores['test_f1_macro']
        precision_scores = cv_scores['test_precision_macro']
        recall_scores = cv_scores['test_recall_macro']

        # best_score uses GridSearchCV's default scorer (the estimator's own score method).
        results.append({
            'random_state': rs,
            'classifier': name,
            'best_params': grid_search.best_params_,
            'best_score': grid_search.best_score_,
            'f1_score': f1_scores.mean(),
            'precision': precision_scores.mean(),
            'recall': recall_scores.mean()
        })

# One row per (seed, classifier); persisted for later inspection.
results_df = pd.DataFrame(results)
results_df.to_csv('results_play.csv', index=False)

In [None]:
# Average the numeric result columns per classifier across all seeds.
# numeric_only=True leaves out 'best_params' (a column of dicts), which
# pandas >= 2.0 would otherwise try to average and fail on.
results_df.groupby('classifier').mean(numeric_only=True)

In [None]:
## End of the participant-averaged analysis; from here on the classification uses the group column

In [None]:
# Expose the 's_id' index as a column so its text can be parsed.
df = df.reset_index()

# Group id = first run of digits found in s_id, stored as int.
# The pattern is a raw string so '\d' is not treated as a string escape, and
# expand=False makes extract return a Series rather than a one-column
# DataFrame. An s_id without any digit yields NaN and makes astype(int) fail.
df['group'] = df['s_id'].str.extract(r'(\d+)', expand=False).astype(int)

# Put s_id back as the index.
df = df.set_index('s_id')

In [None]:
# Grid search with group-aware folds: rows sharing a 'group' value never end
# up on both sides of a train/test split.

# 'group' is only the fold key. It is dropped from the features together with
# the target so the identifier cannot be used as a predictor.
X = df.drop(['label', 'group'], axis=1)
y = df['label']
groups = df['group']

gkf = GroupKFold(n_splits=5)

# Shuffle rows, labels and group ids with the same permutation.
X_shuffled, y_shuffled, groups_shuffled = shuffle(X, y, groups, random_state=0)

# (pipeline step name, estimator, parameter grid); grid keys are prefixed
# with the step name so GridSearchCV can route them.
classifiers = [
    ('lr', LogisticRegression(), {'lr__C': [0.01, 0.1, 1, 10, 100], 'lr__penalty': ['l1', 'l2'], 'lr__solver': ['liblinear', 'saga']}),
    ('svc_linear', SVC(kernel='linear'), {'svc_linear__C': [0.01, 0.1, 1, 10, 100]}),
    ('svc_rbf', SVC(kernel='rbf'), {'svc_rbf__C': [0.01, 0.1, 1, 10, 100], 'svc_rbf__gamma': [0.01, 0.1, 1, 10, 100]}),
    ('rf', RandomForestClassifier(), {'rf__n_estimators': [10, 50, 100, 200], 'rf__max_depth': [None, 5, 10, 15], 'rf__min_samples_split': [2, 5, 10]})
]

# Macro-averaged metrics reported for each selected model.
cv_metrics = ['f1_macro', 'precision_macro', 'recall_macro']

for name, classifier, params in classifiers:
    # Scaling sits inside the pipeline so it is re-fitted on each training fold.
    pipeline = Pipeline([('scaler', StandardScaler()), (name, classifier)])
    grid_search = GridSearchCV(pipeline, params, cv=gkf)
    grid_search.fit(X_shuffled, y_shuffled, groups=groups_shuffled)

    print(f'Best parameters for {name}: {grid_search.best_params_}')
    print(f'Best score for {name}: {grid_search.best_score_}')

    # One cross-validation run yields all three metrics from the same fitted
    # models, instead of refitting the folds separately per metric.
    cv_scores = cross_validate(grid_search.best_estimator_, X_shuffled, y_shuffled, cv=gkf, scoring=cv_metrics, groups=groups_shuffled)
    f1_scores = cv_scores['test_f1_macro']
    precision_scores = cv_scores['test_precision_macro']
    recall_scores = cv_scores['test_recall_macro']

    print(f'Cross-validated F1 score for {name}: {f1_scores.mean()}')
    print(f'Cross-validated precision for {name}: {precision_scores.mean()}')
    print(f'Cross-validated recall for {name}: {recall_scores.mean()}')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
# Rank the features by the magnitude of their logistic-regression coefficient.
# X, y and groups defined here are also the inputs of the comparison cells below.
X = df.drop(['label', 'group'], axis=1)
y = df['label']
groups = df['group']

# Hold out 20% of the rows; only the training part is used for the ranking.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the training features first: coefficient magnitudes are only
# comparable across features that share a common scale.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# coef_ has one row for a two-class problem; take it as a flat vector.
coefficients = model.coef_[0]

# The sign only gives the direction of the effect, so rank on absolute value.
abs_coefficients = np.abs(coefficients)

# One row per feature, strongest coefficient first.
feature_importances = pd.DataFrame({'feature': X.columns, 'importance': abs_coefficients})
feature_importances = feature_importances.sort_values('importance', ascending=False)

print(feature_importances)

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GroupKFold
from numpy.random import RandomState

# Compare a stratified-random baseline with a fixed RBF SVC over 10 shuffles.

# Baseline drawing predictions from the training class distribution; seeded
# so the printed accuracies are reproducible.
dummy = DummyClassifier(strategy='stratified', random_state=123)

# RBF SVC with fixed hyperparameters.
svc_rbf = SVC(kernel='rbf', C=100, gamma=0.1)

# The original cell read an `X_scaled` variable that no cell defines, which
# fails on a fresh kernel. Scaling is done inside a pipeline instead, so the
# scaler is fitted on the training folds only.
svc_scaled = Pipeline([('scaler', StandardScaler()), ('svc', svc_rbf)])

# One RandomState shared by all iterations, so every shuffle is a new permutation.
rs = RandomState(123)

gkf = GroupKFold(n_splits=5)

# NOTE(review): GroupKFold appears to assign folds from the group ids alone,
# so re-ordering the rows may not change the folds at all - confirm whether
# the 10 shuffles really produce different splits.
for i in range(10):
    X_shuffled, y_shuffled, groups_shuffled = shuffle(X, y, groups, random_state=rs)

    # The dummy ignores feature values, so it is given the raw features.
    accuracy_dummy = cross_val_score(dummy, X_shuffled, y_shuffled, groups=groups_shuffled, cv=gkf, scoring='accuracy').mean()

    print(f'Cross-validated accuracy of dummy classifier for shuffle {i+1}: {accuracy_dummy}')

    accuracy_svc = cross_val_score(svc_scaled, X_shuffled, y_shuffled, groups=groups_shuffled, cv=gkf, scoring='accuracy').mean()

    print(f'Cross-validated accuracy of SVC classifier for shuffle {i+1}: {accuracy_svc}')

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score
from sklearn.model_selection import GroupKFold, cross_validate
from numpy.random import RandomState

# Multi-metric comparison of a majority-class baseline and a fixed RBF SVC.
# Both estimators receive X as it is (no scaling step in this cell).

# Baseline that always predicts the most frequent training class.
dummy = DummyClassifier(strategy='most_frequent')

# RBF SVC with fixed hyperparameters.
svc_rbf = SVC(kernel='rbf', C=1, gamma=0.1)

# Group-aware 5-fold splitter.
gkf = GroupKFold(n_splits=5)

# Shared random state: each call to shuffle below draws a fresh permutation.
rs = RandomState(123)

# Accuracy plus macro-averaged precision, recall and F1.
scoring = {'accuracy': make_scorer(accuracy_score)}
for metric_name, metric_fn in (('precision', precision_score),
                               ('recall', recall_score),
                               ('f1', f1_score)):
    scoring[metric_name] = make_scorer(metric_fn, average='macro')

# 10 shuffles, both estimators evaluated on each.
for i in range(10):
    X_shuffled, y_shuffled, groups_shuffled = shuffle(X, y, groups, random_state=rs)

    # Baseline first. cross_validate returns timing entries next to the
    # test_* scores, and all of them are printed as fold means.
    scores_dummy = cross_validate(
        dummy, X_shuffled, y_shuffled,
        cv=gkf, groups=groups_shuffled, scoring=scoring,
    )
    print(f'Cross-validated scores of dummy classifier for shuffle {i+1}:')
    for metric, scores in scores_dummy.items():
        print(f'{metric}: {scores.mean()}')

    # Then the SVC on the same shuffled data.
    scores_svc = cross_validate(
        svc_rbf, X_shuffled, y_shuffled,
        cv=gkf, groups=groups_shuffled, scoring=scoring,
    )
    print(f'Cross-validated scores of SVC classifier for shuffle {i+1}:')
    for metric, scores in scores_svc.items():
        print(f'{metric}: {scores.mean()}')

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Same multi-metric comparison, but each estimator is wrapped in a pipeline
# that standardises the features before the classifier sees them.

# Majority-class baseline and fixed-hyperparameter RBF SVC.
dummy = DummyClassifier(strategy='most_frequent')
svc_rbf = SVC(kernel='rbf', C=1, gamma=0.1)

# Random state reused across iterations, so the permutations differ per shuffle.
rs = RandomState(123)

# Group-aware 5-fold splitter.
gkf = GroupKFold(n_splits=5)

# Accuracy, then the three macro-averaged metrics.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    **{
        metric_name: make_scorer(metric_fn, average='macro')
        for metric_name, metric_fn in [
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        ]
    },
}

# The scaler is the first pipeline step, so it is fitted per training fold.
pipeline_dummy = Pipeline(steps=[('scaler', StandardScaler()), ('dummy', dummy)])
pipeline_svc = Pipeline(steps=[('scaler', StandardScaler()), ('svc', svc_rbf)])

# 10 shuffles, both pipelines evaluated on each.
for i in range(10):
    X_shuffled, y_shuffled, groups_shuffled = shuffle(X, y, groups, random_state=rs)

    # Baseline pipeline: print the fold mean of every entry cross_validate returns.
    scores_dummy = cross_validate(
        pipeline_dummy, X_shuffled, y_shuffled,
        scoring=scoring, cv=gkf, groups=groups_shuffled,
    )
    print(f'Cross-validated scores of dummy classifier for shuffle {i+1}:')
    for metric, scores in scores_dummy.items():
        print(f'{metric}: {scores.mean()}')

    # SVC pipeline on the same shuffled data.
    scores_svc = cross_validate(
        pipeline_svc, X_shuffled, y_shuffled,
        scoring=scoring, cv=gkf, groups=groups_shuffled,
    )
    print(f'Cross-validated scores of SVC classifier for shuffle {i+1}:')
    for metric, scores in scores_svc.items():
        print(f'{metric}: {scores.mean()}')