In [7]:
# Import packages: 

import feather
import pandas as pd
import numpy as np

# sklearn :
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import random
%matplotlib inline


### Read Data from Feather

In [57]:
dat = feather.read_dataframe('../data/sub_data.feather')
dat.head(5)

Unnamed: 0,racenum,pos,hnum,odds,date,name,driver,trainer,seconds,temp,cond,winner
0,1,2,6,2.6,2015-11-23,Ryder,Asher,Quincy,116.2,24.0,FT,0.0
1,1,5,1,4.85,2015-11-23,Ashlee,Zane,Carol,117.2,24.0,FT,0.0
2,1,6,3,4.0,2015-11-23,Carmen,Theresa,Brian,117.4,24.0,FT,0.0
3,2,4,4,28.6,2015-11-23,Rowland,Taryn,Quincy,117.0,24.0,FT,0.0
4,2,5,5,0.3,2015-11-23,Noe,Theresa,Braylon,118.0,24.0,FT,0.0


In [9]:
le = LabelEncoder()

In [25]:
enc_cols = ['date', 'name', 'driver', 'trainer', 'cond']
exp_cols = ['date', 'name', 'driver', 'trainer', 'cond', 'temp', 'hnum']

In [40]:
X = dat[exp_cols]    # these are the features
y = dat['winner']    # these are the labels (1: win 0: not win)
y_multi = dat['pos'] # these are the labels for multinomial logistic regression
X[enc_cols] = X[enc_cols].apply(le.fit_transform) # encode the string value columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


### Fit Logistic Regression Model:

In [86]:
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2)

model = LogisticRegression()

model.fit(X_train, y_train)
#multi_class=multinomial

print("Logistic Regression training error: %f" % (1.0-model.score(X_train,y_train)))
print("Logistic Regression test error: %f" % (1.0-model.score(X_validation,y_validation)))

Logistic Regression training error: 0.175367
Logistic Regression test error: 0.198697


In [87]:
y_pred = model.predict(X_train)

In [88]:
print("Predicted values are all 0 as the following ratio is", len(y_pred[y_pred == 0])/len(y_pred))

Predicted values are all 0 as the following ratio is 1.0


In [90]:
print("Not winner percentage in training set", round(len(y_train[y_train == 0])/len(y_train),2))

Not winner percentage in training set 0.82


### Fit Multiclass Logistic Regression Model:

In [65]:
X_train, X_validation, y_train, y_validation = train_test_split(X, y_multi, test_size=0.3)

model_multi = LogisticRegression(multi_class='multinomial', solver = 'lbfgs')

model_multi.fit(X_train, y_train)
#multi_class=multinomial

print("Multinomial Logistic Regression training error: %f" % (1.0-model_multi.score(X_train,y_train)))
print("Multinomial Logistic Regression test error: %f" % (1.0-model_multi.score(X_validation,y_validation)))

Multinomial Logistic Regression training error: 0.798695
Multinomial Logistic Regression test error: 0.843478


In [76]:
y_pred_multi = model_multi.predict(X_train)

In [83]:
print("Predicted winner as a percentage:", round(len(y_pred_multi[y_pred_multi == 1])/len(y_pred_multi),2))
print("Winner percentage in training set:", round(len(y_train[y_train == 1])/len(y_train),2))

Predicted winner as a percentage: 0.24
Winner percentage in training set: 0.17
