In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import pandas as pd
import tensorflow as tf

# Import our input dataset
all_start_df = pd.read_csv("../Data/Clean_Data/cleaner_start_nhl.csv", sep=",", encoding='latin-1')
all_start_df.head()

Unnamed: 0,Player,sport,position,start_year,start_age,years_played
0,A.J. Greer\greeraj01,Hockey,Left Wing,2017,20,1
1,Aaron Ekblad\ekblaaa01,Hockey,Defense,2015,18,3
2,Aaron Gagnon\gagnoaa01,Hockey,Center,2010,23,4
3,Aaron MacKenzie\mackeaa01,Hockey,Defense,2009,27,1
4,Andreas Karlsson\karlsan01,Hockey,Center,2007,31,2


In [2]:
del all_start_df['Player']
# del all_start_df['start_year']
# del all_start_df['start_age']
all_start_df.head()

Unnamed: 0,sport,position,start_year,start_age,years_played
0,Hockey,Left Wing,2017,20,1
1,Hockey,Defense,2015,18,3
2,Hockey,Center,2010,23,4
3,Hockey,Defense,2009,27,1
4,Hockey,Center,2007,31,2


In [3]:
# Generate our categorical variable list
start_cat = all_start_df.dtypes[all_start_df.dtypes == "object"].index.tolist()


# Check the number of unique values in each column
all_start_df[start_cat].nunique()

sport       1
position    4
dtype: int64

In [4]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(all_start_df[start_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(start_cat)
encode_df.head()

Unnamed: 0,sport_Hockey,position_Center,position_Defense,position_Left Wing,position_Right Wing
0,1.0,0.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0
4,1.0,1.0,0.0,0.0,0.0


In [5]:
# Merge one-hot encoded features and drop the originals
all_start_df = all_start_df.merge(encode_df,left_index=True, right_index=True)
all_start_df = all_start_df.drop(start_cat,1)
all_start_df.head()

Unnamed: 0,start_year,start_age,years_played,sport_Hockey,position_Center,position_Defense,position_Left Wing,position_Right Wing
0,2017,20,1,1.0,0.0,0.0,1.0,0.0
1,2015,18,3,1.0,0.0,1.0,0.0,0.0
2,2010,23,4,1.0,1.0,0.0,0.0,0.0
3,2009,27,1,1.0,0.0,1.0,0.0,0.0
4,2007,31,2,1.0,1.0,0.0,0.0,0.0


In [6]:
# Remove years_played target from features data
y = all_start_df.years_played.values
X = all_start_df.drop(columns=["years_played"]).values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [7]:
# Create the SVM model
# svm = SVC(kernel='linear')

# Train the model
# svm.fit(X_train, y_train)

# Evaluate the model
# y_pred = svm.predict(X_test)
# print(f" SVM model accuracy: {accuracy_score(y_test,y_pred):.3f}")

In [8]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  125
hidden_nodes_layer2 = 25

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
nn.output_shape

(None, 1)

In [9]:
X_test_scaled[5]

array([ 0.64776276, -0.48387494,  0.        ,  1.46236253, -0.71179276,
       -0.48255757, -0.43028647])

In [10]:
y_train

array([2, 1, 8, ..., 4, 3, 3], dtype=int64)

In [11]:
# Train the model 
fit_model = nn.fit(X_train_scaled, y_train, epochs=20) 
# Evaluate the model using the test data 
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
12/12 - 0s - loss: 0.0000e+00 - accuracy: 0.3413
Loss: 0.0, Accuracy: 0.341269850730896


In [12]:
nn.predict(X_test_scaled,verbose=2)

12/12 - 0s


array([[1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],