In [106]:
import scipy.special as sps
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
from itertools import combinations, permutations
from numpy.linalg import inv
from collections import defaultdict
import math
import time
import keras

from keras.utils import np_utils
import keras.callbacks as cb
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import RMSprop
from keras.datasets import mnist
from keras.utils import to_categorical

np.seterr(all='print')

{'divide': 'print', 'invalid': 'print', 'over': 'print', 'under': 'print'}

In [137]:
dummy_feature_List = ['country_group_CAN','country_group_EURO', 'country_group_USA', 'Position_C',
       'Position_D', 'Position_L', 'Position_R'] 
def standardize(s_df):
    for col in s_df.columns.values:
        col_mean = s_df[col].mean()
        col_std = s_df[col].std()

        #         mean of interaction terms of two discrete variable are zero. those columns are
        #         filtered while standizing as it cause singular matrix
        if (col_mean != 0 or col_std != 0) or col not in dummy_feature_List :
            s_df[col] = s_df[col].apply(lambda x: (x - col_mean) / float(col_std))

    return s_df


def preprocessing(x_df):
    x_df = pd.get_dummies(x_df, prefix=['country_group', 'Position'], columns=['country_group', 'Position'])
    x_df = x_df.apply(pd.to_numeric, args=('coerce',))

#     # # adding interaction terms
#     for col_1, col_2 in combinations(x_df.columns, 2):
#         cond1 = col_1 not in dummy_feature_List
#         cond2 = col_2 not in dummy_feature_List
#         if cond1 or cond2:
#             x_df['{}*{}'.format(col_1, col_2)] = np.multiply(x_df[col_1], x_df[col_2])
#     # x_df.to_csv('ex.csv', sep='\t')

# #     standardization
#     x_df = standardize(x_df)
# #     y_df = standardize(y_df)

    #     deleteing column = 0
    x_df = x_df.loc[:, (x_df != 0).any(axis=0)]
#     x_df.insert(loc=0, column='x0', value=1)

    return x_df

def calculate_negative_log_likelihood(target, weights, features):
    
    z = np.dot(weights,features)
    y = sps.expit(z)
    mask = np.isinf(y)
    y[mask] = -z[mask]
    
    likelihood = np.mean(np.sum((1-target)*z + np.log(y)))
    
    return likelihood


def sigmoid(x):
    
    return 1 / (1 + np.exp(-x))


In [138]:
def load_dataset():
    input = os.path.join("../MLAssign2/Least-Square-Regression/",'Model_Trees_Full_Dataset', 'preprocessed_datasets.csv')
    data = pd.read_csv(input)

    # random shuffle
    data = data.iloc[np.random.permutation(len(data))]
    # data[u'GP_greater_than_0'] = data[u'GP_greater_than_0'].map({'yes': 1, 'no': 0})

    training_df = data[data[u'DraftYear'].isin([2004, 2005, 2006])]
    testing_df = data[data[u'DraftYear'] == 2007]

    drop_class = [u'id', u'Country', u'Overall', u'PlayerName', u'sum_7yr_TOI', u'DraftYear',"GP_greater_than_0"]
    training_df.drop(drop_class, inplace=True, axis=1)
    testing_df.drop(drop_class, inplace=True, axis=1)

    y_train_df = training_df.filter([u'sum_7yr_GP'])
    x_train_df = training_df.drop([u'sum_7yr_GP'], axis=1)

    y_test_df = testing_df.filter([u'sum_7yr_GP'])
    x_test_df = testing_df.drop([u'sum_7yr_GP'], axis=1)

    x_train_df_processed = preprocessing(x_train_df)
    x_test_df_processed = preprocessing(x_test_df)

    y_train = y_train_df.values
    x_train = x_train_df_processed.values
    y_test = y_test_df.values
    x_test = x_test_df_processed.values
    
    return (x_train,y_train), (x_test, y_test)

In [139]:
tol = 0.00001

# Step size for gradient descent.
etas = [0.5, 0.3, 0.1, 0.05, 0.01]
# etas = [0.1, 0.05, 0.01]

(x_train,y_train), (x_test, y_test) = load_dataset()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [140]:
x_train

array([[ 20,  73, 205, ...,   1,   0,   0],
       [ 19,  74, 201, ...,   0,   1,   0],
       [ 18,  71, 194, ...,   0,   0,   0],
       ..., 
       [ 20,  73, 207, ...,   0,   0,   0],
       [ 19,  71, 183, ...,   0,   0,   0],
       [ 19,  75, 203, ...,   0,   0,   0]])

In [141]:
print x_train.shape
print y_train.shape
print x_test.shape
print y_test.shape

(637, 22)
(637, 1)
(191, 22)
(191, 1)


In [147]:
start_time = time.time()
model = Sequential()
model.add(Dense(units=200, input_dim=22, activation='relu' ))
model.add(Dense(units=100, activation='relu' ))
model.add(Dense(1, activation='relu'))
# model.compile(loss="mean_squared_error", optimizer ='adam', metrics=['accuracy'])

# rms = RMSprop()
sgd = keras.optimizers.SGD(lr=1)
model.compile(loss='mean_absolute_error', optimizer=sgd, metrics=['accuracy'])
print 'Model compield in {0} seconds'.format(time.time() - start_time)

Model compield in 0.0866219997406 seconds


In [148]:
model.fit(x_train, y_train, epochs=10, batch_size=64)
# model.fit(x_test, y_test, epochs=150,batch_size=20)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f011a14fc50>

In [149]:
# show the accuracy on the testing set
print("[INFO] evaluating on testing set...")
(loss, accuracy) = model.evaluate(x_test, y_test,
	batch_size=128, verbose=1)
print("[INFO] loss={:.4f}, accuracy: {:.4f}%".format(loss,
	accuracy * 100))

[INFO] evaluating on testing set...


In [24]:
print y_test

[[  9]
 [  0]
 [203]
 [  0]
 [  0]
 [  0]
 [  0]
 [415]
 [136]
 [  0]
 [  0]
 [  0]
 [  0]
 [  0]
 [  0]
 [  0]
 [515]
 [324]
 [189]
 [  1]
 [  0]
 [282]
 [  0]
 [  0]
 [ 35]
 [  0]
 [108]
 [  0]
 [  1]
 [ 18]
 [ 25]
 [  0]
 [203]
 [  0]
 [124]
 [  0]
 [ 87]
 [297]
 [ 11]
 [  0]
 [ 25]
 [136]
 [  0]
 [  0]
 [246]
 [  3]
 [ 79]
 [  0]
 [  0]
 [  0]
 [  0]
 [449]
 [  0]
 [  0]
 [ 58]
 [  0]
 [ 10]
 [ 13]
 [  0]
 [  0]
 [  0]
 [  0]
 [ 87]
 [132]
 [  0]
 [  0]
 [  0]
 [  0]
 [176]
 [ 29]
 [  0]
 [ 63]
 [ 75]
 [  0]
 [  0]
 [  0]
 [119]
 [  0]
 [  0]
 [157]
 [  0]
 [  0]
 [ 81]
 [283]
 [ 67]
 [ 61]
 [154]
 [  0]
 [481]
 [  0]
 [  0]
 [  0]
 [  7]
 [  0]
 [  0]
 [  0]
 [ 72]
 [102]
 [110]
 [  0]
 [  0]
 [  0]
 [  1]
 [  0]
 [  0]
 [  0]
 [  5]
 [316]
 [  0]
 [  2]
 [ 54]
 [  0]
 [297]
 [217]
 [164]
 [284]
 [246]
 [ 28]
 [344]
 [ 40]
 [  0]
 [319]
 [286]
 [ 72]
 [ 16]
 [113]
 [  0]
 [  0]
 [  0]
 [  0]
 [  0]
 [  0]
 [  0]
 [  0]
 [ 55]
 [  0]
 [106]
 [  0]
 [132]
 [  0]
 [  0]
 [ 25]
 [  0]