In [87]:
#!/usr/bin/env python
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import math
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split 

In [96]:
def model(ratio):
    """Fit a linear regression on part of the training data and score it on the rest.

    The training CSVs are read from the working directory on every call.
    A fraction ``ratio`` of the rows is used for fitting; the remaining
    ``1 - ratio`` is held out and used to evaluate the fitted model. The
    split is seeded (``random_state=0``), so repeated calls with the same
    ``ratio`` produce the same partition.

    Prints the fitted coefficients, the hold-out RMSE and the hold-out R^2.

    Args:
        ratio: Fraction of the training rows used for fitting.

    Returns:
        A tuple ``(rmse, coeff_det, regression)``: the root mean square
        error on the held-out rows, the coefficient of determination on the
        held-out rows, and the fitted ``LinearRegression`` estimator.
    """
    # The first row of each file is skipped as a header.
    # Each row of X_train.csv is one observation with 6 features.
    features = np.loadtxt('X_train.csv', delimiter=',', skiprows=1)
    # Only the second column of y_train.csv is used as the target value.
    targets = np.loadtxt('y_train.csv', delimiter=',', skiprows=1)[:, 1]

    # Hold out (1 - ratio) of the rows for validation.
    fit_x, holdout_x, fit_y, holdout_y = train_test_split(
        features,
        targets,
        test_size=(1 - ratio),
        random_state=0,
    )

    # Fit on the training share and report the learned weights.
    regression = linear_model.LinearRegression()
    regression.fit(fit_x, fit_y)
    print(regression.coef_)

    # Evaluate on the held-out share.
    holdout_pred = regression.predict(holdout_x)
    rmse = math.sqrt(mean_squared_error(holdout_y, holdout_pred))
    print(f"Root mean square error = {rmse!s}")

    coeff_det = regression.score(holdout_x, holdout_y)
    print(f"Coefficient of determination R^2 = {coeff_det!s}")

    return (rmse, coeff_det, regression)


# Fraction(s) of the training rows used for fitting; model() holds out the
# remainder for validation. To sweep over split sizes instead of using a
# single value, set this to np.linspace(0.01, 0.99, 99).
ratios = [0.79]

# The test features do not depend on the split ratio, so read them once
# instead of on every loop iteration.
X_test = np.loadtxt('X_test.csv', delimiter=',', skiprows=1)

y_pred_pp = []
for ratio in ratios:
    print(ratio)
    rmse, coeff, regression = model(ratio)
    # Test-set predictions from the model fitted at this ratio. When several
    # ratios are swept, the values from the last one are what remain.
    y_pred_pp = regression.predict(X_test)
    # Diagnostic: how many predictions came out negative (not printed).
    negatives = np.sum(np.asarray(y_pred_pp) < 0)

0.79
[ 0.04675011  0.00810348  0.00774909  0.62797244 -2.07535857  1.42633647]
Root mean square error = 75.69841108955181
Coefficient of determination R^2 = 0.8772425202312105


In [95]:
# Arrange the answer in two columns. The first column (header "Id") is an
# enumeration from 0 to n-1, where n is the number of test points. The second
# column (header "PRP") holds the predictions.
test_header = "Id,PRP"
n_points = X_test.shape[0]

# Take the predictions straight from the fitted model so this cell does not
# depend on a name left over from an earlier kernel session, and gives the
# same result when it is re-run.
y_pred = regression.predict(X_test)

# Built under its own name so the raw prediction vector is not overwritten.
submission = np.column_stack((np.arange(n_points), y_pred))

# One format per column: integer Id, floating-point prediction. A single
# '%d' would truncate the regression outputs to integers. With a
# multi-format string np.savetxt ignores `delimiter`, so the comma is part
# of the format itself.
np.savetxt('my_submission.csv', submission, fmt='%d,%f',
           header=test_header, comments="")

# Note: for a classification task, where the second column should be an
# integer label, use fmt='%d' together with delimiter="," instead.
    