In [178]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import os
import pandas
from sklearn import metrics
from sklearn.model_selection import train_test_split
from matplotlib import pyplot
import numpy

In [179]:
path = os.path.dirname(os.path.dirname(os.path.abspath(__name__)))
data_path = os.path.join(path, 'data')

def explorer(name):
    name = name + '.csv'
    path = os.path.join(data_path, name)
    return path if os.path.exists(path) else None

In [180]:
data = pandas.read_csv(explorer('combined'))
columns = ['height', 'weight', 'spike', 'block', 'position_number']
players = pandas.DataFrame(data=data, columns=columns)

In [181]:
def classify_players(position):
    # Separate players between
    # attacking, defensive
    # and mixed
    if position == 6:
        return 1
    else:
        return 2
players['classification'] = players['position_number'].apply(classify_players)
players

Unnamed: 0,height,weight,spike,block,position_number,classification
0,173,69,268,260,3,2
1,180,70,285,275,3,2
2,182,69,283,273,3,2
3,184,74,294,282,2,2
4,181,60,290,272,2,2
...,...,...,...,...,...,...
399,190,66,296,287,4,2
400,176,54,280,279,6,1
401,194,82,309,255,3,2
402,181,73,287,280,1,2


In [193]:
players.groupby(['classification'])['height'].count()

classification
1     45
2    359
Name: height, dtype: int64

In [197]:
players[players['classification'] == 1].describe()

Unnamed: 0,height,weight,spike,block,position_number,classification
count,45.0,45.0,45.0,45.0,45.0,45.0
mean,170.266667,61.111111,272.622222,261.688889,6.0,1.0
std,6.228235,6.912512,18.700132,20.691261,0.0,0.0
min,150.0,51.0,198.0,190.0,6.0,1.0
25%,166.0,56.0,268.0,255.0,6.0,1.0
50%,170.0,60.0,274.0,265.0,6.0,1.0
75%,175.0,65.0,285.0,274.0,6.0,1.0
max,180.0,79.0,302.0,291.0,6.0,1.0


In [182]:
X = players[['height']]
y = players['classification']

In [183]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [184]:
model = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=50)
model.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=50, splitter='best')

In [185]:
predictions = model.predict(X_test)
predictions[:5]

array([2, 2, 2, 2, 2], dtype=int64)

In [186]:
# number of correct predictions / total number of predictions
score = model.score(X_train, y_train)
print('The percentage of correct predictions we got right is %.3f' % score)

The percentage of correct predictions we got right is 0.921


In [187]:
metrics.accuracy_score(y_test, predictions)

0.8811881188118812

In [188]:
# Understand what errors the model is making by
# displaying true positives and false positives
confusion_matrix = metrics.confusion_matrix(y_test, predictions)
print('Confusion matrix: %s' % confusion_matrix)
# true positives / true positives + false positives
precision_score = metrics.precision_score(y_test, predictions, average='weighted')
print('Precision score: %s' % precision_score)
# true positives / true positives + false negatives
recall_score = metrics.recall_score(y_test, predictions, average='weighted')
print('Recall score: %s' % recall_score)
# Combines both precision_score and recall_score
# in an average
f1score = metrics.f1_score(y_test, predictions, average='weighted')
print('F1 score is %s' % f1score)

fbeta = metrics.fbeta_score(y_test, predictions, 1)
print('FBeta score: %s' % fbeta)
# metrics.matthews_corrcoef(y_test, predictions)

Confusion matrix: [[ 0 12]
 [ 0 89]]
Precision score: 0.776492500735222
Recall score: 0.8811881188118812
F1 score is 0.8255341323606045
FBeta score: 0.0


In [189]:
observation = [[205]]
prediction = model.predict(observation)
if prediction[0] == 1:
    player_type = 'defensive'
else:
    player_type = 'attacking'
print(f'The player is a(n) {player_type} player.')

The player is a(n) attacking player.


In [134]:
model.decision_path(X)

<404x47 sparse matrix of type '<class 'numpy.int64'>'
	with 2361 stored elements in Compressed Sparse Row format>

In [190]:
export_graphviz(model, out_file='tree')