In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import scipy.optimize as opt
%matplotlib inline

# Location of the match-results CSV. NOTE(review): this is an absolute path on
# one specific Windows machine; point it at your own copy before running.
DATA_PATH = 'C:\\Users\\Owner\\Napa\\results_model_data_8.csv'
data = pd.read_csv(DATA_PATH)

In [2]:
def result_assign(win_margin):
    """Map a win margin to a binary win/loss label.

    Returns 1 when ``win_margin`` is strictly greater than zero and 0 in
    every other case (zero or negative margin).
    """
    return 1 if win_margin > 0 else 0

In [3]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise.

    Evaluated as exp(-log(1 + exp(-z))) through ``np.logaddexp`` so that
    large negative inputs do not overflow inside ``exp``. The direct
    formula gives the right limit (0) there, but it emits an overflow
    RuntimeWarning and raises under ``np.errstate(over='raise')``.

    Parameters:
        z: scalar, ndarray or np.matrix of real values.

    Returns:
        Values in [0, 1] with the same shape as ``z``.
    """
    # logaddexp(0, -z) == log(exp(0) + exp(-z)) == log(1 + exp(-z))
    return np.exp(-np.logaddexp(0, -z))

In [4]:
def cost(theta, X, y):
    """Mean logistic-regression loss (average negative log-likelihood).

    Parameters:
        theta: parameter vector with one entry per column of X
               (1-D sequence, or a 1 x k row such as an np.matrix).
        X:     2-D design matrix, one row per sample.
        y:     0/1 labels, either 1-D or a single column (n x 1).

    Returns:
        Scalar: sum of the per-sample losses divided by len(X).
    """
    # Work on plain arrays and flatten theta / y, so a 1-D label vector
    # cannot broadcast against an n x 1 prediction into an n x n matrix.
    theta = np.asarray(theta, dtype=float).ravel()
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()

    z = X.dot(theta)
    # With s = sigmoid(z):  -y*log(s) - (1-y)*log(1-s) == log(1 + exp(z)) - y*z.
    # This form never evaluates log(0), so saturated predictions produce a
    # finite loss instead of nan/inf.
    losses = np.logaddexp(0, z) - y * z
    return np.sum(losses) / len(X)

In [5]:
def gradient(theta, X, y):
    """Gradient of the mean logistic-regression loss with respect to theta.

    Parameters:
        theta: parameter vector with one entry per column of X
               (1-D sequence, or a 1 x k row such as an np.matrix).
        X:     2-D design matrix, one row per sample.
        y:     0/1 labels, either 1-D or a single column (n x 1).

    Returns:
        1-D ndarray with one partial derivative per parameter.
    """
    # Flatten theta / y so the residual below is a plain length-n vector,
    # whether the labels arrive as a column or as a 1-D array.
    theta = np.asarray(theta, dtype=float).ravel()
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()

    error = sigmoid(X.dot(theta)) - y

    # X.T.dot(error)[i] == sum(error * X[:, i]); one matrix product replaces
    # the per-parameter Python loop.
    return X.T.dot(error) / len(X)

In [6]:
def predict(theta, X):
    """Classify each row of X with the fitted parameters.

    Parameters:
        theta: fitted parameter vector (1-D sequence, or a 1 x k row such
               as an np.matrix).
        X:     2-D design matrix, one row per sample.

    Returns:
        List of ints: 1 where the predicted probability is >= 0.5, else 0.
    """
    # Explicit dot product on plain arrays: the result no longer depends on
    # theta being an np.matrix (with ndarrays, `X * theta.T` is element-wise).
    theta = np.asarray(theta, dtype=float).ravel()
    X = np.asarray(X, dtype=float)
    probability = sigmoid(X.dot(theta))
    return [1 if p >= 0.5 else 0 for p in probability]

In [7]:
def get_accuracy(predictions, y):
    """Percentage of predictions that agree with the true labels.

    A pair counts as correct only when both values equal 1 or both equal 0.
    Pairs are formed with zip(), so iteration stops at the shorter input.
    Returns a float between 0 and 100.
    """
    hits = 0.0
    total = 0
    for guess, actual in zip(predictions, y):
        total += 1
        if (guess == 1 and actual == 1) or (guess == 0 and actual == 0):
            hits += 1.0
    return hits / float(total) * 100

In [8]:
# Add a binary 'Result' column: 1 where 'Win Margin' is positive, 0 otherwise.
# Series.apply calls result_assign once per value, avoiding the slower
# row-wise DataFrame.apply(axis=1) with a lambda.
data['Result'] = data['Win Margin'].apply(result_assign)
# Select only the quantitative parameters used by the model, plus the label.
# .copy() gives model_data its own storage, so later in-place edits to it
# are independent of `data`.
model_data = data[['Race Margin', 'Win % Margin', 'Skill Margin', 'Game Margin', 'AvgPPM Margin', 'Result']].copy()
model_data.head()

Unnamed: 0,Race Margin,Win % Margin,Skill Margin,Game Margin,AvgPPM Margin,Result
0,0.0,5.769231,-3,-64,-0.02,0
1,-1.0,5.769231,-3,-64,-0.02,1
2,0.0,-1.388889,6,-69,-0.73,1
3,0.0,-1.388889,6,-69,-0.73,0
4,0.0,2.040816,1,51,-0.42,0


In [9]:
# Add a constant 'Ones' column as the first feature so that the first
# coefficient acts as the intercept in the matrix product.
# Guarded: DataFrame.insert raises if the column already exists, which made
# this cell fail when it was run a second time.
if 'Ones' not in model_data.columns:
    model_data.insert(0, 'Ones', 1)

# set X (features: every column except the last) and y (target: last column).
# The slice cols-1:cols keeps y two-dimensional (n rows x 1 column).
cols = model_data.shape[1]
X = model_data.iloc[:,0:cols-1]
y = model_data.iloc[:,cols-1:cols]
# Split the data into training and validation sets with an 80/20 ratio;
# random_state is fixed so the split is reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state = 0)

# Convert to numpy arrays and initialize the parameter array theta with one
# zero per feature column (cols-1, i.e. including 'Ones', excluding the target).
X_train = np.array(train_X.values)
y_train = np.array(train_y.values)
X_val = np.array(val_X.values)
y_val = np.array(val_y.values)
theta = np.zeros(cols-1)

In [10]:
# Fit the model: minimize cost() over theta with SciPy's truncated Newton
# (TNC) routine, using gradient() as the analytic derivative.
result = opt.fmin_tnc(
    func=cost,
    x0=theta,
    fprime=gradient,
    args=(X_train, y_train),
)
# result[0] holds the optimized parameters; show the training cost they reach.
cost(result[0], X_train, y_train)

0.6182440340024586

In [11]:
# Wrap the optimized parameters (result[0]) in a 1 x k matrix, then compute
# the training and validation accuracies.
theta_min = np.matrix(result[0])
train_predictions = predict(theta_min, X_train)
val_predictions = predict(theta_min, X_val)
train_accuracy = get_accuracy(train_predictions, y_train)
val_accuracy = get_accuracy(val_predictions, y_val)
# print(...) with a single argument prints the same text on Python 2 and 3;
# the bare print statement is a SyntaxError on Python 3.
print('Train accuracy = {0}%'.format(train_accuracy))
print('Validation accuracy = {0}%'.format(val_accuracy))

Train accuracy = 64.8042948998%
Validation accuracy = 63.620474407%


In [12]:
# Display the fitted parameters. Entries follow the column order of X:
# the 'Ones' (intercept) term first, then the feature columns in the order
# they were selected into model_data.
theta_min

matrix([[ 4.54210216e-02, -3.07343917e-01,  4.71893190e-02,
          3.10746481e-02, -1.93209569e-04, -6.66449063e-02]])