In [1]:
import pandas as pd

In [7]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [8]:
## code out the model
## first import our csv of our data of the best 500 hitters.

In [9]:
df = pd.read_csv('500hits.csv', encoding = 'latin-1')

In [12]:
## lets take a look at our data
df.head()

Unnamed: 0,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA,HOF
0,Ty Cobb,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,178,0.366,1
1,Stan Musial,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,31,0.331,1
2,Tris Speaker,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,129,0.345,1
3,Derek Jeter,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,97,0.31,1
4,Honus Wagner,21,2792,10430,1736,3430,640,252,101,0,963,327,722,15,0.329,1


In [13]:
## drop columns that aren't needed with our data - players and SB (stealing bases) because we just want numbers
df = df.drop(columns = ['PLAYER', 'CS'])

In [22]:
## display they're in fact dropped
df.head()

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,BA,HOF
0,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,0.366,1
1,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,0.331,1
2,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,0.345,1
3,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,0.31,1
4,21,2792,10430,1736,3430,640,252,101,0,963,327,722,0.329,1


In [23]:
## what causes someone to be in HOF (hall of fame), thus lets make our data X (YRS, G, AB, R, H, 28, ... there are 13)
X = df.iloc[:, 0:13]

In [24]:
## instead of 0-13, just 13 because we're grabbing the HOF colm only
y = df.iloc[:,13]

In [25]:
## first train_test_split - split our data. this is going to randomized our data.
## i want to have 20% for the testing and 80% for the training.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 11, test_size=0.2)

In [26]:
## add in our scaler
scaler = MinMaxScaler(feature_range=(0,1))

In [27]:
## fit our data to the scaler based on the MinMaxScaler which has values from 0 and 1
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [28]:
## add the KNN
knn = KNeighborsClassifier(n_neighbors=8)

In [29]:
## fit our data to this model, all we have to do is scan it
knn.fit(X_train, y_train)

In [31]:
## make some predictions
y_pred = knn.predict(X_test)

In [32]:
## print out predications. hof vs. non hof
print(y_pred)

[0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 1 1 0 1 0
 0 1 0 0 1 0 0 0 1 0 0 1 0 1 1 0 1 1 1 0 0 1 0 1 0 1 1 0 0 1 1 0 1 0 0 0 0
 1 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1]


In [33]:
## more accurate
knn.score(X_test, y_test)

0.8279569892473119

In [37]:
## give you lots more insight on the model
cm = confusion_matrix(y_test, y_pred)

In [38]:
## 55 - true positive predicated hof and is a hof. 22 - true negative predicated not to be hof and was not a hof
## 12 - false positive predicated to be hof and is not a hof. 4 - false negative
print(cm)

[[55 12]
 [ 4 22]]


In [39]:
cr = classification_report(y_test, y_pred)

In [43]:
## precision - 55 / 59 (false negative) = 0.93
## recall - true positive / (true positive + false positive) - 55 + 12 = 67 then divide that by 59. = 0.8
## f1-score - 2 * precision * recall  = 1.5252 and then divide that by (precision(0.93) + recall (0.82) = 1.75 finally, 1.5252 / 1.75
## support - 55 + 12 = 67, 4 + 22 = 26.
print(cr)

              precision    recall  f1-score   support

           0       0.93      0.82      0.87        67
           1       0.65      0.85      0.73        26

    accuracy                           0.83        93
   macro avg       0.79      0.83      0.80        93
weighted avg       0.85      0.83      0.83        93



In [46]:
print(knn.n_samples_fit_)

372
