In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score

In [27]:
def model(ratio,solver,shrinkage):
    """Fit a Linear Discriminant Analysis classifier and predict on a hold-out split.

    The labelled data is read from 'X_train.csv' and 'y_train.csv' in the
    current working directory, split into a training part and a validation
    part, and the classifier is fitted on the training part only.

    Parameters
    ----------
    ratio : float
        Fraction of the labelled data used for training; the remaining
        ``1 - ratio`` is held out for validation.
    solver : str
        Passed through to ``LinearDiscriminantAnalysis(solver=...)``.
    shrinkage : str, float or None
        Passed through to ``LinearDiscriminantAnalysis(shrinkage=...)``.
        Per the scikit-learn documentation shrinkage is only supported by
        the 'lsqr' and 'eigen' solvers.

    Returns
    -------
    tuple
        ``(classification, predicted_y, actual_y)``: the fitted classifier,
        its predictions on the validation split, and the true labels of the
        validation split.
    """
    # Load the labelled data; skiprows=1 skips the first (header) line of each file.
    # Each row of X_train is one observation (feature count not visible here -- TODO confirm).
    X_train = np.loadtxt('X_train.csv', delimiter=',', skiprows=1)
    # Keep only column 1 as the label; column 0 is presumably an Id column -- TODO confirm.
    y_train = np.loadtxt('y_train.csv', delimiter=',', skiprows=1)[:,1]

    # Set up the model, honouring the caller's solver/shrinkage choice
    # (previously these arguments were ignored in favour of hard-coded values).
    print("---")
    classification = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)

    # Partition the labelled data; random_state=0 keeps the split reproducible.
    train_x, validation_x, train_y, validation_y = train_test_split(
        X_train, y_train, test_size=(1-ratio), random_state=0)

    # Train on the training part only.
    classification.fit(train_x, train_y)

    # Predict labels for the held-out validation part.
    predicted_y = classification.predict(validation_x)

    return (classification, predicted_y, validation_y)

def evaluate(classification, predicted_y_values, actual_y_values):
    """Print and return accuracy, sensitivity (recall) and ROC AUC of predictions.

    Parameters
    ----------
    classification
        Accepted for interface compatibility; not used by this function.
    predicted_y_values
        Labels predicted by the classifier.
    actual_y_values
        Ground-truth labels for the same observations.

    Returns
    -------
    tuple
        ``(accuracy, sensitivity, roc_auc_score_value)``.

    Notes
    -----
    The ROC AUC is computed from the predicted labels passed in, not from
    continuous decision scores.
    """
    # Compute every metric first so nothing is printed if any of them fails.
    metrics = (
        accuracy_score(actual_y_values, predicted_y_values),
        recall_score(actual_y_values, predicted_y_values),
        roc_auc_score(actual_y_values, predicted_y_values),
    )
    captions = ("Accuracy Rate = ", "Sensitivity Rate = ", "ROC AUC Score= ")
    for caption, value in zip(captions, metrics):
        print(caption + str(value))
    return metrics


# Experiment configuration: train on 76% of the labelled data, validate on the rest.
ratio, solver, shrinkage = 0.76, 'lsqr', 'auto'

# Fit the classifier and obtain predictions/labels for the validation split.
classification, test_y, cross_validation_actual_y_values = model(
    ratio=ratio, solver=solver, shrinkage=shrinkage)

# Predict labels for the unlabelled test set (used for the submission file).
X_test = np.loadtxt('X_test.csv', skiprows=1, delimiter=',')
y_pred = classification.predict(X_test)

# Report validation metrics.
accuracy, sensitivity, roc_auc_score_value = evaluate(
    classification,
    predicted_y_values=test_y,
    actual_y_values=cross_validation_actual_y_values,
)

---
Accuracy Rate = 0.8541666666666666
Sensitivity Rate = 0.92
ROC AUC Score= 0.8513043478260869


In [28]:
# Build the two-column submission table. The first column (header "Id")
# enumerates the test points from 0 to n-1, where n is the number of test
# points; the second column (header "EpiOrStroma") holds the predictions.
test_header = "Id,EpiOrStroma"
n_points = X_test.shape[0]

# Both columns are overwritten below, so the initial contents do not matter.
y_pred_pp = np.empty((n_points, 2))
y_pred_pp[:, 0] = np.arange(n_points)
y_pred_pp[:, 1] = y_pred

# comments="" stops savetxt from prefixing the header line with '# '.
np.savetxt(
    'my_submission.csv',
    y_pred_pp,
    delimiter=",",
    fmt='%d',
    header=test_header,
    comments="",
)

# Note: fmt='%d' denotes that all values should be formatted as integers which
# is appropriate for classification. For regression, where the second column
# should be floating point, use fmt='%d,%f'.
    