In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Load data

In [14]:
# Directory that holds the gain_<task>.csv / loss_<task>.csv files.
BASE_DIR = "narpsdata/agh"

# Task names accepted by load_task_data.
TASK_TYPES = [
    "indifference",
    "range",
]

def load_csv(filename):
    """Read a space-delimited, header-less CSV and return it transposed.

    After transposing, the rows that originated from the file's first three
    columns (the x, y, z coordinates) are removed and the remaining rows are
    renumbered from 0. Per the original note, each remaining row is a subject.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The transposed frame without the three coordinate rows.
    """
    raw = pd.read_csv(filename, sep=" ", header=None)
    coordinate_rows = [0, 1, 2]  # x, y, z end up as rows 0-2 after the transpose
    return raw.T.drop(index=coordinate_rows).reset_index(drop=True)

def train_test_split(data, test_size=0.2, random_state=None):
    """Randomly partition a labelled frame into train and test arrays.

    The hold-out set contains exactly ``round(len(data) * test_size)`` rows
    instead of a per-row coin flip, so a fractional split of two or more rows
    can never leave either side empty.

    Parameters
    ----------
    data : pandas.DataFrame
        One sample per row; must contain a "label" column.
    test_size : float, default 0.2
        Fraction of rows placed in the test partition.
    random_state : int or None, default None
        Seed for a private generator. With ``None`` the split draws from
        NumPy's global generator, so ``np.random.seed(...)`` still controls it.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, y_train, X_test, y_test)``; X arrays hold every column
        except "label", y arrays hold the "label" column.
    """
    n_rows = len(data)
    n_test = min(max(int(round(n_rows * test_size)), 0), n_rows)
    if n_rows > 1 and 0 < test_size < 1:
        # A fractional split of >= 2 rows must populate both partitions.
        n_test = min(max(n_test, 1), n_rows - 1)

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    shuffled = rng.permutation(n_rows)
    # Sorting keeps the original row order inside each partition.
    test_rows = np.sort(shuffled[:n_test])
    train_rows = np.sort(shuffled[n_test:])

    train = data.iloc[train_rows]
    test = data.iloc[test_rows]

    return (
        train.drop("label", axis='columns').to_numpy(),
        train["label"].to_numpy(),
        test.drop("label", axis='columns').to_numpy(),
        test["label"].to_numpy(),
    )

def load_task_data(task, test_size=0.2):
    """Load the gain and loss files of one task and split them for training.

    Reads ``{BASE_DIR}/gain_{task}.csv`` (labelled 1) and
    ``{BASE_DIR}/loss_{task}.csv`` (labelled 0), stacks them and hands the
    result to ``train_test_split``.

    Parameters
    ----------
    task : str
        One of the names listed in ``TASK_TYPES``.
    test_size : float, default 0.2
        Fraction of rows held out for testing.

    Returns
    -------
    tuple
        Whatever ``train_test_split`` returns:
        ``(X_train, y_train, X_test, y_test)``.

    Raises
    ------
    ValueError
        If ``task`` is not in ``TASK_TYPES``.
    """
    if task not in TASK_TYPES:
        # ValueError is an Exception subclass, so broad handlers keep working.
        raise ValueError(f"Available tasks: {TASK_TYPES}")

    frames = []
    for condition, label in (("gain", 1), ("loss", 0)):
        frame = load_csv(f"{BASE_DIR}/{condition}_{task}.csv")
        frame["label"] = label
        frames.append(frame)

    # ignore_index renumbers the stacked rows 0..n-1 in one step.
    combined = pd.concat(frames, ignore_index=True)
    return train_test_split(combined, test_size=test_size)
# Split the "range" task with the default hold-out fraction.
X_train, y_train, X_test, y_test = load_task_data(task="range")

# Prepare model

In [15]:
# train_test_split draws from NumPy's global generator, so without a fixed
# seed the partition -- and the accuracy printed below -- changes on every run.
SEED = 42
np.random.seed(SEED)

X_train, y_train, X_test, y_test = load_task_data("range")

# Linear-kernel SVM separating gain (label 1) from loss (label 0) rows.
clf = SVC(kernel='linear', C=1.0, random_state=SEED)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8888888888888888


In [12]:
import pandas as pd
import numpy as np
# Column labels of the x, y, z coordinates in every raw CSV; used as merge keys.
common_cols = list(range(3))

# Directory that holds the gain/loss CSV files of both tasks.
BASE_DIR = "narpsdata/agh"

def load_data(filename):
    """Read a space-delimited CSV that has no header row.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The file contents, with columns labelled by position (0, 1, 2, ...).
    """
    return pd.read_csv(filename, sep=" ", header=None)

def merge_and_fillna(reference_df, target_df):
    """Left-join ``target_df`` onto ``reference_df`` and zero-fill the gaps.

    The join keys are the coordinate columns listed in ``common_cols``. Every
    row of ``reference_df`` is kept; cells with no match in ``target_df``
    come out of the merge as NaN and are replaced by 0.

    Returns
    -------
    pandas.DataFrame
        The merged frame without missing values.
    """
    aligned = reference_df.merge(target_df, how='left', on=common_cols)
    return aligned.fillna(0)

def clean_data(gain_range, loss_range, gain_indifference, loss_indifference):
    """Align four coordinate-indexed frames onto one shared integer grid.

    For each coordinate column in ``common_cols`` the minimum and maximum over
    all four frames are found. Every integer (x, y, z) combination inside
    those bounds becomes one row of a reference grid, and each input frame is
    left-merged onto that grid with missing cells filled by 0.

    Returns
    -------
    tuple of pandas.DataFrame
        The aligned frames, in the same order as the arguments.
    """
    frames = (gain_range, loss_range, gain_indifference, loss_indifference)

    # (min, max) of every coordinate column across all inputs
    bounds = {}
    for col in common_cols:
        stacked = pd.concat([frame[col] for frame in frames])
        bounds[col] = (stacked.min(), stacked.max())

    # Inclusive integer ranges for the three axes
    x_axis = range(int(bounds[0][0]), int(bounds[0][1]) + 1)
    y_axis = range(int(bounds[1][0]), int(bounds[1][1]) + 1)
    z_axis = range(int(bounds[2][0]), int(bounds[2][1]) + 1)

    grid_rows = [(x, y, z) for x in x_axis for y in y_axis for z in z_axis]
    grid = pd.DataFrame(grid_rows, columns=common_cols)

    return tuple(merge_and_fillna(grid, frame) for frame in frames)

def reset_data_frame(df):
    """Transpose ``df`` and discard the coordinate rows.

    After the transpose, the rows labelled by ``common_cols`` (the x, y, z
    coordinates) are dropped and the remaining rows are renumbered from 0.
    Per the original note, each remaining row is a subject.

    Returns
    -------
    pandas.DataFrame
        The transposed frame without the coordinate rows.
    """
    return df.T.drop(index=common_cols).reset_index(drop=True)

def train_test_split(data, test_size=0.2, random_state=None):
    """Randomly partition a labelled frame into train and test arrays.

    The hold-out set contains exactly ``round(len(data) * test_size)`` rows
    instead of a per-row coin flip, so a fractional split of two or more rows
    can never leave either side empty.

    Parameters
    ----------
    data : pandas.DataFrame
        One sample per row; must contain a "gain_or_loss" column.
    test_size : float, default 0.2
        Fraction of rows placed in the test partition.
    random_state : int or None, default None
        Seed for a private generator. With ``None`` the split draws from
        NumPy's global generator, so ``np.random.seed(...)`` still controls it.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, y_train, X_test, y_test)``; X arrays hold every column
        except "gain_or_loss", y arrays hold the "gain_or_loss" column.
    """
    n_rows = len(data)
    n_test = min(max(int(round(n_rows * test_size)), 0), n_rows)
    if n_rows > 1 and 0 < test_size < 1:
        # A fractional split of >= 2 rows must populate both partitions.
        n_test = min(max(n_test, 1), n_rows - 1)

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    shuffled = rng.permutation(n_rows)
    # Sorting keeps the original row order inside each partition.
    test_rows = np.sort(shuffled[:n_test])
    train_rows = np.sort(shuffled[n_test:])

    train = data.iloc[train_rows]
    test = data.iloc[test_rows]

    return (
        train.drop("gain_or_loss", axis='columns').to_numpy(),
        train["gain_or_loss"].to_numpy(),
        test.drop("gain_or_loss", axis='columns').to_numpy(),
        test["gain_or_loss"].to_numpy(),
    )

def load_task_data(test_size=0.2):
    """Load all four gain/loss files, align them and split for training.

    Steps, in order:

    1. read ``gain_range``, ``loss_range``, ``gain_indifference`` and
       ``loss_indifference`` from ``BASE_DIR``;
    2. align them onto a shared coordinate grid with ``clean_data``;
    3. transpose each with ``reset_data_frame``;
    4. attach a "gain_or_loss" column (1 for gain files, 0 for loss files);
    5. stack everything and pass it to ``train_test_split``.

    Only the gain/loss label is attached; the task type (range vs.
    indifference) is not encoded as a column.

    Parameters
    ----------
    test_size : float, default 0.2
        Fraction of rows held out for testing.

    Returns
    -------
    tuple
        Whatever ``train_test_split`` returns:
        ``(X_train, y_train, X_test, y_test)``.
    """
    # (file stem, gain_or_loss label), in the order the frames are stacked
    sources = [
        ("gain_range", 1),
        ("loss_range", 0),
        ("gain_indifference", 1),
        ("loss_indifference", 0),
    ]

    raw_frames = [load_data(f"{BASE_DIR}/{stem}.csv") for stem, _ in sources]
    aligned_frames = clean_data(*raw_frames)

    labelled = []
    for (_, label), frame in zip(sources, aligned_frames):
        per_subject = reset_data_frame(frame)
        per_subject["gain_or_loss"] = label
        labelled.append(per_subject)

    stacked = pd.concat(labelled).reset_index(drop=True)
    return train_test_split(stacked, test_size=test_size)


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# train_test_split draws from NumPy's global generator, so without a fixed
# seed the partition -- and the accuracy printed below -- changes on every run.
SEED = 42
np.random.seed(SEED)

X_train, y_train, X_test, y_test = load_task_data(test_size=0.2)

# Linear-kernel SVM separating gain (1) from loss (0) rows of both tasks.
clf = SVC(kernel='linear', C=1.0, random_state=SEED)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9183673469387755
