In [74]:
! pip install pandas numpy scikit-learn

import pandas as pd
import numpy as np

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Load data

In [75]:
import pandas as pd
import numpy as np
BASE_DIR = "narpsdata/extended"
common_cols = [0, 1, 2]

def load_data(filename):
    df = pd.read_csv(filename, delimiter=" ", header=None)
    return df

def merge_and_fillna(reference_df, target_df):
    merged_df = pd.merge(reference_df, target_df, on=common_cols, how='left')
    merged_df = merged_df.fillna(0)
    return merged_df

def clean_data(gain_range, loss_range, gain_indifference, loss_indifference):
    ind_xyz = set(zip(gain_indifference[0].tolist(), gain_indifference[1].tolist(), gain_indifference[2].tolist()))
    rng_xyz = set(zip(gain_range[0].tolist(), gain_range[1].tolist(), gain_range[2].tolist()))
    xyz = ind_xyz | rng_xyz
    all_coords = pd.DataFrame(list(xyz), columns=common_cols)
    
    new_gain_range = merge_and_fillna(all_coords, gain_range)
    new_loss_range = merge_and_fillna(all_coords, loss_range)
    new_gain_indifference = merge_and_fillna(all_coords, gain_indifference)
    new_loss_indifference = merge_and_fillna(all_coords, loss_indifference)
    return new_gain_range, new_loss_range, new_gain_indifference, new_loss_indifference

def reset_data_frame(df):
    df = df.transpose() # row = subject
    df = df.drop(common_cols, axis='index') # drop rows with x,y,z
    df = df.reset_index(drop=True)
    return df

def load_all_data():
    gain_range = load_data(f"{BASE_DIR}/gain_range_vmpfc.csv")
    loss_range = load_data(f"{BASE_DIR}/loss_range_vmpfc.csv")
    gain_indifference = load_data(f"{BASE_DIR}/gain_ind_vmpfc.csv")
    loss_indifference = load_data(f"{BASE_DIR}/loss_ind_vmpfc.csv")
    
    gain_range, loss_range, gain_indifference, loss_indifference = clean_data(gain_range, loss_range, gain_indifference, loss_indifference)
    for d in [gain_range, loss_range, gain_indifference, loss_indifference]:
        reset_data_frame(d)

    gain_range["gain_or_loss"] = 1
    loss_range["gain_or_loss"] = 0
    gain_indifference["gain_or_loss"] = 1
    loss_indifference["gain_or_loss"] = 0
    
    df = pd.concat([gain_range, loss_range, gain_indifference, loss_indifference]).reset_index(drop=True)
    X = df.drop("gain_or_loss", axis='columns').fillna(0).to_numpy()
    y = df["gain_or_loss"].to_numpy()
    return X, y


In [76]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

In [77]:
X, y = load_all_data()

In [78]:
X[3777]

array([ 5.100000e+01,  8.200000e+01,  2.800000e+01,  5.286700e-01,
       -6.887000e-01,  8.765780e-01,  4.309100e-01, -5.748650e-01,
       -4.175370e-01,  5.740410e-01, -4.542860e-01,  4.550730e-01,
        1.465250e-01, -5.445900e-02, -1.782050e-01, -5.623060e-01,
       -8.397970e-01,  2.997080e-01,  3.142230e-01,  4.403300e-02,
       -9.304300e-02,  1.362660e-01,  9.473000e-03,  2.489510e-01,
       -4.098000e-01, -5.182590e-01, -6.476260e-01, -3.244400e-01,
        6.271720e-01, -5.325970e-01, -8.117370e-01, -2.345210e-01,
       -2.776790e-01,  5.519120e-01, -3.958970e-01, -2.549020e-01,
       -5.483600e-01,  1.361590e-01,  4.145630e-01, -2.167000e-03,
        3.812090e-01, -7.588290e-01, -5.588510e-01,  9.834300e-02,
        1.024916e+00, -9.912600e-02, -6.494320e-01,  6.697200e-02,
       -1.081290e-01,  2.864800e-01,  2.474340e-01,  8.025980e-01,
        0.000000e+00,  0.000000e+00,  0.000000e+00])

In [79]:
X.shape

(15092, 55)

In [84]:
from sklearn.model_selection import cross_val_score

def crossval(estimator, X=X, y=y, k=10, scoring=None):
    print(f"Performing {k}-fold crossvalidation of {estimator}")
    cv_score = cross_val_score(estimator, X, y, cv=k, scoring=scoring)
    print(f"Scores: {cv_score}")
    print(f"Average: {np.mean(cv_score)}. Stddev: {np.std(cv_score)}")
    return cv_score


## Linear

In [85]:
crossval(SVC(kernel='linear', C=1.0, random_state=42))

Performing 10-fold crossvalidation of SVC(kernel='linear', random_state=42)
Scores: [0.78013245 0.77019868 0.77070908 0.78528827 0.77733598 0.81643472
 0.82239894 0.79787939 0.81444665 0.83631544]
Average: 0.7971139608266515. Stddev: 0.022564183155926915


array([0.78013245, 0.77019868, 0.77070908, 0.78528827, 0.77733598,
       0.81643472, 0.82239894, 0.79787939, 0.81444665, 0.83631544])

In [86]:
crossval(SVC(kernel='linear', C=0.001, random_state=42))

Performing 10-fold crossvalidation of SVC(C=0.001, kernel='linear', random_state=42)
Scores: [0.77218543 0.77417219 0.77402253 0.78793903 0.77402253 0.82637508
 0.81577203 0.80251822 0.81974818 0.83432737]
Average: 0.7981082599326778. Stddev: 0.02327910321581979


array([0.77218543, 0.77417219, 0.77402253, 0.78793903, 0.77402253,
       0.82637508, 0.81577203, 0.80251822, 0.81974818, 0.83432737])

In [87]:
crossval(SVC(kernel='linear', C=100, random_state=42))

Performing 10-fold crossvalidation of SVC(C=100, kernel='linear', random_state=42)
Scores: [0.78476821 0.77218543 0.77335984 0.78727634 0.77998675 0.81908549
 0.82107356 0.7972167  0.81047051 0.83167661]
Average: 0.7977099434299282. Stddev: 0.020382518001826614


array([0.78476821, 0.77218543, 0.77335984, 0.78727634, 0.77998675,
       0.81908549, 0.82107356, 0.7972167 , 0.81047051, 0.83167661])

## RBF

In [88]:
crossval(SVC(kernel='rbf', C=1.0, random_state=42))

Performing 10-fold crossvalidation of SVC(random_state=42)
Scores: [0.75960265 0.75827815 0.76275679 0.77799867 0.76474486 0.82372432
 0.82372432 0.80980782 0.82173625 0.83830351]
Average: 0.7940677348711265. Stddev: 0.030489808980850016


array([0.75960265, 0.75827815, 0.76275679, 0.77799867, 0.76474486,
       0.82372432, 0.82372432, 0.80980782, 0.82173625, 0.83830351])

In [89]:
crossval(SVC(kernel='rbf', C=0.001, random_state=42))

Performing 10-fold crossvalidation of SVC(C=0.001, random_state=42)
Scores: [0.7384106  0.7192053  0.49966865 0.49966865 0.49966865 0.49966865
 0.49966865 0.49966865 0.49966865 0.49966865]
Average: 0.5454965131945634. Stddev: 0.0917562674500282


array([0.7384106 , 0.7192053 , 0.49966865, 0.49966865, 0.49966865,
       0.49966865, 0.49966865, 0.49966865, 0.49966865, 0.49966865])

In [90]:
crossval(SVC(kernel='rbf', C=100, random_state=42))

Performing 10-fold crossvalidation of SVC(C=100, random_state=42)
Scores: [0.78940397 0.77748344 0.78197482 0.78926441 0.78263751 0.82173625
 0.82703777 0.80516899 0.81643472 0.83830351]
Average: 0.802944540263935. Stddev: 0.020595308717217174


array([0.78940397, 0.77748344, 0.78197482, 0.78926441, 0.78263751,
       0.82173625, 0.82703777, 0.80516899, 0.81643472, 0.83830351])

## Random Forest

In [91]:
from sklearn.ensemble import RandomForestClassifier

crossval(RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42))


Performing 10-fold crossvalidation of RandomForestClassifier(max_depth=3, random_state=42)
Scores: [0.70993377 0.69072848 0.70311465 0.71040424 0.68853545 0.78860172
 0.77932406 0.77269715 0.77203446 0.78131213]
Average: 0.7396686108514475. Stddev: 0.03990753247111766


array([0.70993377, 0.69072848, 0.70311465, 0.71040424, 0.68853545,
       0.78860172, 0.77932406, 0.77269715, 0.77203446, 0.78131213])

In [93]:
crossval(RandomForestClassifier(n_estimators=200, max_depth=4, random_state=42))


Performing 10-fold crossvalidation of RandomForestClassifier(max_depth=4, n_estimators=200, random_state=42)
Scores: [0.72781457 0.7013245  0.71371769 0.71901922 0.69648774 0.80185553
 0.79191518 0.78528827 0.78396289 0.79456594]
Average: 0.7515951531429526. Stddev: 0.04099581647568119


array([0.72781457, 0.7013245 , 0.71371769, 0.71901922, 0.69648774,
       0.80185553, 0.79191518, 0.78528827, 0.78396289, 0.79456594])

## Neural Network

In [92]:
from sklearn.neural_network import MLPClassifier
crossval(MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(50, 20), random_state=42, verbose=False))


Performing 10-fold crossvalidation of MLPClassifier(alpha=1e-05, hidden_layer_sizes=(50, 20), random_state=42,
              solver='lbfgs')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Scores: [0.78211921 0.77086093 0.77269715 0.78396289 0.77667329 0.8250497
 0.82637508 0.81047051 0.81047051 0.83366468]
Average: 0.7992343949547748. Stddev: 0.02320651402238716


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


array([0.78211921, 0.77086093, 0.77269715, 0.78396289, 0.77667329,
       0.8250497 , 0.82637508, 0.81047051, 0.81047051, 0.83366468])