In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    GroupKFold,
    StratifiedGroupKFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils import shuffle


In [None]:
# Directory holding the feature table. It can be overridden with the
# TRACKING_FEATURES_DIR environment variable so the notebook is not tied to a
# single machine; the default is the location this notebook was written against.
FEATURES_DIR = os.environ.get(
    'TRACKING_FEATURES_DIR',
    '/Users/andrei-macpro/Documents/Data/tracking/features/play',
)

# The first CSV column becomes the index (a later cell reads it back as 's_id').
df = pd.read_csv(os.path.join(FEATURES_DIR, 'combined_features.csv'), index_col=0)

In [None]:

# Discard the columns listed below from df, then show the resulting frame
# as the cell output.
columns_to_drop = [
    'Age',
    'DAI',
    'Rinab',
    'IQ_T2',
    'duration_meal',
    'duration_play',
    'Gender',
]
df = df.drop(labels=columns_to_drop, axis=1)
df


In [None]:
# Expose the 's_id' index as a regular column so its text can be parsed.
df = df.reset_index()

# The group id is the first run of digits found in s_id.
# - raw string: '\d' in a normal string literal is an invalid escape sequence
# - expand=False: return a Series rather than a one-column DataFrame
# astype(int) raises if an s_id contains no digits, which surfaces bad ids early.
df['group'] = df['s_id'].str.extract(r'(\d+)', expand=False).astype(int)

# Restore s_id as the index.
df = df.set_index('s_id')

In [None]:
# Baseline: standardised logistic regression scored with group-aware 5-fold CV.
#
# 'group' is an identifier parsed from s_id and serves only as the fold key.
# It is dropped from the feature matrix together with the target so the model
# cannot fit on the identifier itself.
X = df.drop(columns=['label', 'group'])
y = df['label']
groups = df['group']

# All rows sharing a group value land in the same fold.
gkf = GroupKFold(n_splits=5)

# The scaler sits inside the pipeline, so it is re-fitted on each training fold.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

# One accuracy value per fold.
scores = cross_val_score(pipeline, X, y, groups=groups, cv=gkf, scoring='accuracy')

print(f'Average Accuracy: {np.mean(scores)}')

In [None]:
# Baseline: standardised SVC (default settings) scored with group-aware 5-fold CV.
#
# 'group' is an identifier parsed from s_id and serves only as the fold key.
# It is dropped from the feature matrix together with the target so the model
# cannot fit on the identifier itself. Later cells reuse X, y and groups.
X = df.drop(columns=['label', 'group'])
y = df['label']
groups = df['group']

# All rows sharing a group value land in the same fold.
gkf = GroupKFold(n_splits=5)

# The scaler sits inside the pipeline, so it is re-fitted on each training fold.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC())
])

# One accuracy value per fold.
scores = cross_val_score(pipeline, X, y, groups=groups, cv=gkf, scoring='accuracy')

print(f'Average Accuracy: {np.mean(scores)}')

In [None]:
# Exploratory feature ranking: recursive feature elimination (RFE) around a
# logistic regression, fitted once on every row. This is a description of the
# data set, not a cross-validated performance estimate.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=model, n_features_to_select=10)
fit = rfe.fit(X_scaled, y)

# X_scaled is a bare array, so the RFE results are mapped back onto X's
# column names; a raw boolean mask / rank array cannot be read on its own.
selected_by_rfe = list(X.columns[fit.support_])
rfe_ranking = pd.Series(fit.ranking_, index=X.columns, name='rfe_rank').sort_values(kind='stable')

print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % selected_by_rfe)

# Rank 1 marks the selected features; larger ranks were eliminated earlier.
rfe_ranking

In [None]:
# Group-aware 5-fold accuracy of logistic regression restricted to the 15
# features chosen by RFE.
#
# Scaling and RFE are pipeline steps, so every fold fits them on its training
# rows only. RFE uses the labels; fitting it on all rows before splitting lets
# the held-out rows influence which features are kept and inflates the score.
groups = df['group']

gkf = GroupKFold(n_splits=5)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rfe', RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=15)),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# cross_val_score slices X and y by position, so y's s_id index is never
# looked up with integer fold indices.
scores = cross_val_score(pipeline, X, y, groups=groups, cv=gkf, scoring='accuracy')

print(f'Average Accuracy: {np.mean(scores)}')

In [None]:
# Group-aware 5-fold accuracy of a linear SVC restricted to the 10 features
# chosen by RFE.
#
# Scaling and RFE are pipeline steps, so every fold fits them on its training
# rows only. RFE uses the labels; fitting it on all rows before splitting lets
# the held-out rows influence which features are kept and inflates the score.
groups = df['group']

gkf = GroupKFold(n_splits=5)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rfe', RFE(estimator=SVC(kernel='linear'), n_features_to_select=10)),
    ('classifier', SVC(kernel='linear')),
])

# cross_val_score slices X and y by position, so y's s_id index is never
# looked up with integer fold indices.
scores = cross_val_score(pipeline, X, y, groups=groups, cv=gkf, scoring='accuracy')

print(f'Average Accuracy: {np.mean(scores)}')

In [None]:
# Small hyper-parameter search for three classifier families.
# Each entry: (pipeline step name, estimator, parameter grid keyed by step name).
classifiers = [
    ('lr', LogisticRegression(), {'lr__C': [0.1, 1]}),
    ('svc', SVC(), {'svc__C': [0.1, 1], 'svc__kernel': ['linear']}),
    # fixed seed so repeated runs of this cell give the same forest
    ('rf', RandomForestClassifier(random_state=0), {'rf__n_estimators': [10, 50], 'rf__max_depth': [None, 5]})
]

groups = df['group']

# Three folds as before, but split by group: a plain cv=3 puts rows of the
# same group into both the training and the validation part.
group_cv = GroupKFold(n_splits=3)

for name, classifier, params in classifiers:
    # Scaler inside the pipeline -> fitted on the training part of each fold only.
    pipeline = Pipeline([('scaler', StandardScaler()), (name, classifier)])
    grid_search = GridSearchCV(pipeline, params, cv=group_cv)
    grid_search.fit(X, y, groups=groups)

    print(f'Best parameters for {name}: {grid_search.best_params_}')
    print(f'Best score for {name}: {grid_search.best_score_}')

In [None]:
# Group-aware grid search over four classifier configurations, on all features.
# Each entry: (pipeline step name, estimator, parameter grid keyed by step name).
classifiers = [
    ('lr', LogisticRegression(), {'lr__C': [0.1, 1]}),
    ('svc_linear', SVC(kernel='linear'), {'svc_linear__C': [0.1, 1]}),
    ('svc_rbf', SVC(kernel='rbf'), {'svc_rbf__C': [0.1, 1], 'svc_rbf__gamma': [0.1, 1]}),
    # fixed seed so repeated runs of this cell give the same forest
    ('rf', RandomForestClassifier(random_state=0), {'rf__n_estimators': [10, 50], 'rf__max_depth': [None, 5]})
]

groups = df['group']

# GroupKFold assigns whole groups to folds regardless of row order; the
# shuffled copies are kept because later cells read X_shuffled / y_shuffled.
# X is shuffled unscaled: scaling happens inside the pipeline below.
X_shuffled, y_shuffled, groups_shuffled = shuffle(X, y, groups, random_state=0)

gkf = GroupKFold(n_splits=5)

for name, classifier, params in classifiers:
    # Scaler inside the pipeline -> fitted on the training part of each fold only.
    pipeline = Pipeline([('scaler', StandardScaler()), (name, classifier)])
    grid_search = GridSearchCV(pipeline, params, cv=gkf)
    grid_search.fit(X_shuffled, y_shuffled, groups=groups_shuffled)

    print(f'Best parameters for {name}: {grid_search.best_params_}')
    print(f'Best score for {name}: {grid_search.best_score_}')

In [None]:
# Feature names must come from X, the matrix the selector is fitted on.
# df still carries the 'label' column, so pairing df.columns with the
# selector mask would attach names to the wrong columns.
features = list(X.columns)

# Keep the 15 features with the highest ANOVA F-score (f_classif).
selector = SelectKBest(f_classif, k=15)
X_selected = selector.fit_transform(X_shuffled, y_shuffled)

support_mask = selector.get_support()
assert len(features) == len(support_mask), 'feature names and selector mask are out of sync'

# Names of the columns the selector kept, in their original column order.
selected_features = [feature for feature, keep in zip(features, support_mask) if keep]

print('Selected features:', selected_features)

In [None]:
# Rank every feature by its univariate SelectKBest score.
# Names are read from X (the matrix behind X_shuffled) so that they line up
# one-to-one with selector.scores_.
feature_names = list(X.columns)

selector.fit(X_shuffled, y_shuffled)
scores = selector.scores_
assert len(feature_names) == len(scores), 'feature names and scores are out of sync'

# (feature name, score) pairs in column order.
feature_scores = list(zip(feature_names, scores))

# Highest score first. A NaN score cannot be ordered against numbers, so
# those features are placed explicitly at the end.
sorted_feature_scores = sorted(
    feature_scores,
    key=lambda pair: (1, 0.0) if np.isnan(pair[1]) else (0, -pair[1]),
)

sorted_features = [feature for feature, _ in sorted_feature_scores]

print('Features sorted by importance:', sorted_features)

In [None]:
# Wider group-aware grid search on all features, followed by macro
# F1 / precision / recall of each winning configuration.
# Each entry: (pipeline step name, estimator, parameter grid keyed by step name).
classifiers = [
    # max_iter raised from the default so the solvers have room to converge
    ('lr', LogisticRegression(max_iter=1000), {'lr__C': [0.01, 0.1, 1, 10, 100], 'lr__penalty': ['l1', 'l2'], 'lr__solver': ['liblinear', 'saga']}),
    ('svc_linear', SVC(kernel='linear'), {'svc_linear__C': [0.01, 0.1, 1, 10, 100]}),
    ('svc_rbf', SVC(kernel='rbf'), {'svc_rbf__C': [0.01, 0.1, 1, 10, 100], 'svc_rbf__gamma': [0.01, 0.1, 1, 10, 100]}),
    # fixed seed so the search and the follow-up scoring see the same forest
    ('rf', RandomForestClassifier(random_state=0), {'rf__n_estimators': [10, 50, 100, 200], 'rf__max_depth': [None, 5, 10, 15], 'rf__min_samples_split': [2, 5, 10]})
]

groups = df['group']

# GroupKFold assigns whole groups to folds regardless of row order.
# X is shuffled unscaled: scaling happens inside the pipeline below.
X_shuffled, y_shuffled, groups_shuffled = shuffle(X, y, groups, random_state=0)

gkf = GroupKFold(n_splits=5)

# scorer name -> wording used in the printed report
metric_labels = {
    'f1_macro': 'F1 score',
    'precision_macro': 'precision',
    'recall_macro': 'recall',
}

for name, classifier, params in classifiers:
    # Scaler inside the pipeline -> fitted on the training part of each fold only.
    pipeline = Pipeline([('scaler', StandardScaler()), (name, classifier)])
    grid_search = GridSearchCV(pipeline, params, cv=gkf)
    grid_search.fit(X_shuffled, y_shuffled, groups=groups_shuffled)

    print(f'Best parameters for {name}: {grid_search.best_params_}')
    print(f'Best score for {name}: {grid_search.best_score_}')

    # All three metrics come from one set of fits, so they describe the same
    # fitted models.
    # NOTE: these folds are the ones the parameters were tuned on, so the
    # numbers are optimistic; an unbiased estimate needs nested CV or a
    # held-out set of groups.
    cv_results = cross_validate(
        grid_search.best_estimator_,
        X_shuffled,
        y_shuffled,
        groups=groups_shuffled,
        cv=gkf,
        scoring=list(metric_labels),
    )

    for metric, label in metric_labels.items():
        mean_score = cv_results['test_' + metric].mean()
        print(f'Cross-validated {label} for {name}: {mean_score}')
    

In [None]:
## this time using just the 15 best features

# Each entry: (pipeline step name, estimator, parameter grid keyed by step name).
classifiers = [
    # max_iter raised from the default so the solvers have room to converge
    ('lr', LogisticRegression(max_iter=1000), {'lr__C': [0.01, 0.1, 1, 10, 100], 'lr__penalty': ['l1', 'l2'], 'lr__solver': ['liblinear', 'saga']}),
    ('svc_linear', SVC(kernel='linear'), {'svc_linear__C': [0.01, 0.1, 1, 10, 100]}),
    ('svc_rbf', SVC(kernel='rbf'), {'svc_rbf__C': [0.01, 0.1, 1, 10, 100], 'svc_rbf__gamma': [0.01, 0.1, 1, 10, 100]}),
    # fixed seed so the search and the follow-up scoring see the same forest
    ('rf', RandomForestClassifier(random_state=0), {'rf__n_estimators': [10, 50, 100, 200], 'rf__max_depth': [None, 5, 10, 15], 'rf__min_samples_split': [2, 5, 10]})
]

groups = df['group']

# GroupKFold assigns whole groups to folds regardless of row order; the
# shuffled copies are kept because a later cell reads X_shuffled / y_shuffled.
# X is shuffled unscaled: scaling happens inside the pipeline below.
X_shuffled, y_shuffled, groups_shuffled = shuffle(X, y, groups, random_state=0)

# Not used in this cell: later cells read the globally standardised X_scaled.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

gkf = GroupKFold(n_splits=5)

# scorer name -> wording used in the printed report
metric_labels = {
    'f1_macro': 'F1 score',
    'precision_macro': 'precision',
    'recall_macro': 'recall',
}

for name, classifier, params in classifiers:
    # Scaling and the label-based top-15 selection are pipeline steps, so each
    # fold chooses its features from its training rows only. Selecting on all
    # rows first would let the validation rows steer the choice.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('select', SelectKBest(f_classif, k=15)),
        (name, classifier),
    ])
    grid_search = GridSearchCV(pipeline, params, cv=gkf)
    grid_search.fit(X_shuffled, y_shuffled, groups=groups_shuffled)

    print(f'Best parameters for {name}: {grid_search.best_params_}')
    print(f'Best score for {name}: {grid_search.best_score_}')

    # All three metrics come from one set of fits.
    # NOTE: these folds are the ones the parameters were tuned on, so the
    # numbers are optimistic; an unbiased estimate needs nested CV or a
    # held-out set of groups.
    cv_results = cross_validate(
        grid_search.best_estimator_,
        X_shuffled,
        y_shuffled,
        groups=groups_shuffled,
        cv=gkf,
        scoring=list(metric_labels),
    )

    for metric, label in metric_labels.items():
        mean_score = cv_results['test_' + metric].mean()
        print(f'Cross-validated {label} for {name}: {mean_score}')

In [None]:
# Class balance per label: number of rows, and number of distinct groups
# (numeric ids parsed from s_id) that contribute rows to that label.
# The index of the result lists the unique label values.
label_summary = df.groupby('label')['group'].agg(n_rows='size', n_groups='nunique')
label_summary

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

# Baseline that always predicts the most frequent label. It is fitted and
# scored on the same rows, so the number is a training-set accuracy.
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_shuffled, y_shuffled)

# Predictions for the very rows the baseline was fitted on
y_pred_dummy = dummy.predict(X_shuffled)

# Share of rows whose label equals the predicted label
accuracy_dummy = accuracy_score(y_shuffled, y_pred_dummy)

print(f'Accuracy of dummy classifier: {accuracy_dummy}')

In [None]:
# Repeated group-aware CV: random-guess baseline vs. the RBF SVC.
N_REPEATS = 10
BASE_SEED = 123

groups = df['group']

# Baseline that draws predictions from the training label distribution.
# Seeded so the printed numbers are reproducible.
dummy = DummyClassifier(strategy='stratified', random_state=BASE_SEED)

# SVC with fixed hyper-parameters; the scaler is a pipeline step so it is
# fitted on the training part of each fold only.
svc_rbf = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC(kernel='rbf', C=100, gamma=0.1)),
])

for i in range(N_REPEATS):
    # GroupKFold builds its folds from group sizes, not row order, so
    # shuffling the rows yields the same folds every time. A splitter that
    # shuffles the groups themselves gives a different partition per repeat
    # while still keeping each group inside a single fold.
    cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=BASE_SEED + i)

    # Integer seed -> both models below are scored on identical folds.
    accuracy_dummy = cross_val_score(dummy, X, y, groups=groups, cv=cv, scoring='accuracy').mean()

    print(f'Cross-validated accuracy of dummy classifier for shuffle {i+1}: {accuracy_dummy}')

    accuracy_svc = cross_val_score(svc_rbf, X, y, groups=groups, cv=cv, scoring='accuracy').mean()

    print(f'Cross-validated accuracy of SVC classifier for shuffle {i+1}: {accuracy_svc}')

In [None]:
# Repeated group-aware CV with several metrics: majority-class baseline vs.
# the RBF SVC.
N_REPEATS = 10
BASE_SEED = 123

groups = df['group']

# Baseline that always predicts the most frequent training label.
dummy = DummyClassifier(strategy='most_frequent')

# SVC with fixed hyper-parameters; the scaler is a pipeline step so it is
# fitted on the training part of each fold only.
svc_rbf = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC(kernel='rbf', C=100, gamma=0.1)),
])

# Macro-averaged metrics. The baseline never predicts the minority label(s),
# so precision/F1 are undefined for them; zero_division=0 scores those as 0
# explicitly instead of emitting a warning on every fold.
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, average='macro', zero_division=0),
           'recall': make_scorer(recall_score, average='macro'),
           'f1': make_scorer(f1_score, average='macro', zero_division=0)}

for i in range(N_REPEATS):
    # GroupKFold builds its folds from group sizes, not row order, so
    # shuffling the rows yields the same folds every time. A splitter that
    # shuffles the groups themselves gives a different partition per repeat
    # while still keeping each group inside a single fold.
    cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=BASE_SEED + i)

    # Integer seed -> both models are scored on identical folds.
    for model_name, estimator in (('dummy', dummy), ('SVC', svc_rbf)):
        cv_results = cross_validate(estimator, X, y, groups=groups, cv=cv, scoring=scoring)

        print(f'Cross-validated scores of {model_name} classifier for shuffle {i+1}:')
        for key, fold_values in cv_results.items():
            # cross_validate also returns fit_time / score_time; only the
            # 'test_<metric>' entries are scores.
            if key.startswith('test_'):
                print(f'{key}: {fold_values.mean()}')