In [1]:
! pip install pandas numpy scikit-learn

import pandas as pd
import numpy as np



## Load data

In [20]:
import pandas as pd
import numpy as np
BASE_DIR = "narpsdata/extended"
common_cols = [0, 1, 2]

def load_data(filename):
    df = pd.read_csv(filename, delimiter=" ", header=None)
    return df

def merge_and_fillna(reference_df, target_df):
    merged_df = pd.merge(reference_df, target_df, on=common_cols, how='left')
    merged_df = merged_df.fillna(0)
    return merged_df

def clean_data(gain_range, loss_range, gain_indifference, loss_indifference):
    ind_xyz = set(zip(gain_indifference[0].tolist(), gain_indifference[1].tolist(), gain_indifference[2].tolist()))
    rng_xyz = set(zip(gain_range[0].tolist(), gain_range[1].tolist(), gain_range[2].tolist()))
    xyz = ind_xyz | rng_xyz
    all_coords = pd.DataFrame(list(xyz), columns=common_cols)
    
    new_gain_range = merge_and_fillna(all_coords, gain_range)
    new_loss_range = merge_and_fillna(all_coords, loss_range)
    new_gain_indifference = merge_and_fillna(all_coords, gain_indifference)
    new_loss_indifference = merge_and_fillna(all_coords, loss_indifference)
    return new_gain_range, new_loss_range, new_gain_indifference, new_loss_indifference

def reset_data_frame(df):
    df = df.transpose() # row = subject
    df = df.drop(common_cols, axis='index') # drop rows with x,y,z
    df = df.reset_index(drop=True)
    return df

def load_all_data():
    gain_range = load_data(f"{BASE_DIR}/gain_range_vmpfc.csv")
    loss_range = load_data(f"{BASE_DIR}/loss_range_vmpfc.csv")
    gain_indifference = load_data(f"{BASE_DIR}/gain_ind_vmpfc.csv")
    loss_indifference = load_data(f"{BASE_DIR}/loss_ind_vmpfc.csv")
    
    gain_range, loss_range, gain_indifference, loss_indifference = clean_data(gain_range, loss_range, gain_indifference, loss_indifference)

    gain_range = reset_data_frame(gain_range)
    loss_range = reset_data_frame(loss_range)
    gain_indifference = reset_data_frame(gain_indifference)
    loss_indifference = reset_data_frame(loss_indifference)

    gain_range["gain_or_loss"] = 1
    loss_range["gain_or_loss"] = 0
    gain_indifference["gain_or_loss"] = 1
    loss_indifference["gain_or_loss"] = 0
    
    df = pd.concat([gain_range, loss_range, gain_indifference, loss_indifference]).reset_index(drop=True)
    X = df.drop("gain_or_loss", axis='columns').fillna(0).to_numpy()
    y = df["gain_or_loss"].to_numpy()
    return X, y


In [21]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

In [22]:
X, y = load_all_data()

In [23]:
X.shape

(202, 3773)

In [24]:
X.shape

(202, 3773)

## Prepare training and test data

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [28]:
(X_train.shape, y_train.shape)

((161, 3773), (161,))

In [29]:
(X_test.shape, y_test.shape)

((41, 3773), (41,))

## Crossval

In [34]:
from sklearn.model_selection import cross_val_score

def crossval(estimator, X=X_train, y=y_train, k=10, scoring=None):
    print(f"Performing {k}-fold crossvalidation of {estimator}")
    cv_score = cross_val_score(estimator, X, y, cv=k, scoring=scoring)
    print(f"Scores: {cv_score}")
    print(f"Average: {np.mean(cv_score)}. Stddev: {np.std(cv_score)}")

    estimator.fit(X, y)

    return estimator


In [40]:
def acc(model):
    preds = model.predict(X_test)
    print("Final model - accuracy:", np.sum(preds == y_test) / preds.shape[0])

## Linear

In [41]:
model = crossval(SVC(kernel='linear', C=1.0, random_state=42))
acc(model)

Performing 10-fold crossvalidation of SVC(kernel='linear', random_state=42)
Scores: [0.70588235 0.75       0.6875     0.375      0.6875     0.625
 0.6875     0.6875     0.75       0.4375    ]
Average: 0.6393382352941177. Stddev: 0.12207266487785307
Final model - accuracy: 0.5365853658536586


In [42]:
model = crossval(SVC(kernel='linear', C=0.001, random_state=42))
acc(model)

Performing 10-fold crossvalidation of SVC(C=0.001, kernel='linear', random_state=42)
Scores: [0.64705882 0.6875     0.8125     0.375      0.8125     0.5
 0.75       0.6875     0.75       0.5625    ]
Average: 0.6584558823529412. Stddev: 0.13442465586913857
Final model - accuracy: 0.5365853658536586


In [43]:
model = crossval(SVC(kernel='linear', C=100, random_state=42))
acc(model)

Performing 10-fold crossvalidation of SVC(C=100, kernel='linear', random_state=42)
Scores: [0.70588235 0.75       0.6875     0.375      0.6875     0.625
 0.6875     0.6875     0.75       0.4375    ]
Average: 0.6393382352941177. Stddev: 0.12207266487785307
Final model - accuracy: 0.5365853658536586


## RBF

In [45]:
model = crossval(SVC(kernel='rbf', C=1.0, random_state=42))
acc(model)

Performing 10-fold crossvalidation of SVC(random_state=42)
Scores: [0.58823529 0.625      0.8125     0.5        0.6875     0.375
 0.75       0.6875     0.6875     0.625     ]
Average: 0.6338235294117647. Stddev: 0.11882680542950445
Final model - accuracy: 0.5365853658536586


In [46]:
model = crossval(SVC(kernel='rbf', C=0.001, random_state=42))
acc(model)

Performing 10-fold crossvalidation of SVC(C=0.001, random_state=42)
Scores: [0.52941176 0.5625     0.5625     0.5        0.5        0.5
 0.5        0.5        0.5        0.5       ]
Average: 0.5154411764705882. Stddev: 0.025086356042988204
Final model - accuracy: 0.43902439024390244


In [47]:
model = crossval(SVC(kernel='rbf', C=100, random_state=42))
acc(model)

Performing 10-fold crossvalidation of SVC(C=100, random_state=42)
Scores: [0.58823529 0.625      0.75       0.5        0.6875     0.375
 0.6875     0.625      0.6875     0.625     ]
Average: 0.6150735294117646. Stddev: 0.10287616119161179
Final model - accuracy: 0.5121951219512195


## Random Forest

In [48]:
from sklearn.ensemble import RandomForestClassifier

model = crossval(RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42))
acc(model)

Performing 10-fold crossvalidation of RandomForestClassifier(max_depth=3, random_state=42)
Scores: [0.52941176 0.75       0.625      0.4375     0.625      0.5625
 0.5625     0.625      0.625      0.6875    ]
Average: 0.6029411764705882. Stddev: 0.08171364254004732
Final model - accuracy: 0.5365853658536586


In [49]:
model = crossval(RandomForestClassifier(n_estimators=200, max_depth=4, random_state=42))
acc(model)

Performing 10-fold crossvalidation of RandomForestClassifier(max_depth=4, n_estimators=200, random_state=42)
Scores: [0.58823529 0.5        0.6875     0.5        0.625      0.375
 0.75       0.6875     0.625      0.6875    ]
Average: 0.6025735294117647. Stddev: 0.1083586303753264
Final model - accuracy: 0.4878048780487805


In [50]:
model = crossval(RandomForestClassifier(n_estimators=100, max_depth=2, random_state=42))
acc(model)

Performing 10-fold crossvalidation of RandomForestClassifier(max_depth=2, random_state=42)
Scores: [0.41176471 0.8125     0.6875     0.5        0.625      0.375
 0.6875     0.625      0.5        0.625     ]
Average: 0.5849264705882353. Stddev: 0.12889110250994198
Final model - accuracy: 0.5853658536585366


In [51]:
model = crossval(RandomForestClassifier(n_estimators=100, max_depth=1, random_state=42))
acc(model)

Performing 10-fold crossvalidation of RandomForestClassifier(max_depth=1, random_state=42)
Scores: [0.47058824 0.6875     0.625      0.375      0.75       0.5
 0.75       0.6875     0.6875     0.6875    ]
Average: 0.6220588235294118. Stddev: 0.12196576892278971
Final model - accuracy: 0.6097560975609756


## Neural Network

In [56]:
from sklearn.neural_network import MLPClassifier

model = crossval(MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(200, 100, 50, 20), random_state=42, verbose=False))
acc(model)

Performing 10-fold crossvalidation of MLPClassifier(alpha=1e-05, hidden_layer_sizes=(200, 100, 50, 20),
              random_state=42, solver='lbfgs')
Scores: [0.58823529 0.625      0.6875     0.4375     0.625      0.4375
 0.75       0.5625     0.75       0.5625    ]
Average: 0.6025735294117647. Stddev: 0.1046916557172376
Final model - accuracy: 0.6097560975609756
