In [1]:
#Libraries
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
from pandas import pivot_table
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [2]:
#Datsets - 5 years of historical data
#Data Source - Retrosheet: http://www.retrosheet.org/gamelogs/index.html
df2013 = pd.read_csv('2013.csv')
df2014 = pd.read_csv('2014.csv')
df2015 = pd.read_csv('2015.csv')
df2016 = pd.read_csv('2016.csv')
df2017 = pd.read_csv('2017.csv')

frames = [df2013,df2014,df2015,df2016,df2017]
dfOriginal = pd.concat(frames)

dfOriginal.head(5)

Unnamed: 0,Date,Number of Game,Day of Week,Visiting Team,League,Visiting team game number,Home Team,League.1,Home Team Game Number,Visiting Score,...,Home starting players Name (7),Home Player Defensive Position (7),Home starting players ID (8),Home starting players Name (8),Home Player Defensive Position (8),Home starting players ID (9),Home starting players Name (9),Home Player Defensive Position (9),Additional Info,Acquisition Info
0,20130331,0,Sun,TEX,AL,1,HOU,AL,1,2,...,Matt Dominguez,5,barnb002,Brandon Barnes,9,ceder002,Ronny Cedeno,6,,Y
1,20130401,0,Mon,KCA,AL,1,CHA,AL,1,0,...,Alexei Ramirez,6,flowt001,Tyler Flowers,2,beckg001,Gordon Beckham,4,,Y
2,20130401,0,Mon,DET,AL,1,MIN,AL,1,4,...,Chris Parmelee,9,dozib001,Brian Dozier,4,florp001,Pedro Florimon,6,,Y
3,20130401,0,Mon,BOS,AL,1,NYA,AL,1,8,...,Ichiro Suzuki,9,nix-j001,Jayson Nix,5,cervf001,Francisco Cervelli,2,,Y
4,20130401,0,Mon,SEA,AL,1,OAK,AL,1,2,...,Josh Donaldson,5,smits002,Seth Smith,10,sogae001,Eric Sogard,4,,Y


In [3]:
#Adding Year
dfOriginal['Date'] = dfOriginal.Date.apply(str)
dfOriginal['Year'] = dfOriginal['Date'].str[:4]#.apply(str.slice(0,5))

#dfOriginal.head(5)

In [4]:
#Functions to calculate wins and losses
def visitingWinIndicator(row):
    if row['Visiting Score'] < row['Home Score']:
        val = 0
    else:
        val = 1
    return val

def homeWinIndicator(row):
    if row['Home Score'] < row['Visiting Score']:
        val = 0
    else:
        val = 1
    return val

dfOriginal['HomeTeamWin'] = dfOriginal.apply(homeWinIndicator, axis=1)
dfOriginal['VisitingTeamWin'] = dfOriginal.apply(visitingWinIndicator, axis=1)

In [5]:
#Creating a subset of our data

dfSummary = dfOriginal[[
                        'Year','Visiting Team','Visiting team game number', 'Home Team', 'Home Team Game Number', 
                        'Visiting Score', 'Home Score', 'HomeTeamWin','VisitingTeamWin', 'Home Hits', 'Visiting Hits',
                       'Home pitchers used', 'Visiting pitchers used', 'Home put outs', 'Visiting put outs',
                        'Home team earned runs', 'Visiting team earned runs', 'Home Walks', 'Visiting Walks',
                        'Home Doubles', 'Home Triples', 'Home Homeruns',
                        'Visiting Doubles', 'Visiting Triples', 'Visiting Homeruns',
                        'Home At-Bats', 'Visiting At-Bats'
                       ]]
dfSummary = dfSummary[(dfSummary['Home Team Game Number']  <=25) | (dfSummary['Visiting team game number']  <=25)]

In [9]:
mlbPivot = dfSummary.pivot_table(values=
                                 ('HomeTeamWin',
                                 'VisitingTeamWin',
                                 'Home Score',
                                 'Visiting Score',
                                 'Home Hits',
                                 'Visiting Hits',
                                 'Home pitchers used',
                                 'Visiting pitchers used',
                                  'Home put outs',
                                  'Visiting put outs',
                                  'Home team earned runs',
                                  'Visiting team earned runs',
                                  'Home Walks',
                                  'Visiting Walks',
                                  'Home Doubles', 
                                  'Home Triples', 
                                  'Home Homeruns',
                                  'Visiting Doubles', 
                                  'Visiting Triples', 
                                  'Visiting Homeruns',
                                  'Home At-Bats',
                                  'Visiting At-Bats'
                                 ), index=('Year','Home Team','Visiting Team'),
                                 aggfunc={
                                     'HomeTeamWin': 'sum',
                                     'VisitingTeamWin': 'sum',
                                     'Home Score': 'sum',
                                     'Visiting Score': 'sum',
                                     'Home Hits': 'sum',
                                     'Visiting Hits': 'sum',
                                     'Home pitchers used': 'mean',
                                     'Visiting pitchers used': 'mean',
                                     'Home put outs': 'sum',
                                     'Visiting put outs': 'sum',
                                     'Home team earned runs': 'sum',
                                     'Visiting team earned runs':'sum',
                                     'Home Walks': 'sum',
                                     'Visiting Walks': 'sum',
                                      'Home Doubles': 'sum', 
                                      'Home Triples': 'sum', 
                                      'Home Homeruns': 'sum',
                                      'Visiting Doubles': 'sum', 
                                      'Visiting Triples': 'sum', 
                                      'Visiting Homeruns': 'sum',
                                      'Home At-Bats': 'sum',
                                      'Visiting At-Bats': 'sum'
                                 })
mlbStage = mlbPivot.reset_index()

In [11]:
wins = pd.DataFrame(mlbStage.groupby(['Year','Home Team'])['HomeTeamWin'].sum() 
                    + mlbStage.groupby(['Year','Visiting Team'])['VisitingTeamWin'].sum()).reset_index()
wins.columns = ['Year','Team','Wins']

losses = pd.DataFrame(mlbStage.groupby(['Year','Home Team'])['VisitingTeamWin'].sum() 
                    + mlbStage.groupby(['Year','Visiting Team'])['HomeTeamWin'].sum()).reset_index()
losses.columns = ['Year','Team','Losses']

runsFor = pd.DataFrame(mlbStage.groupby(['Year','Home Team'])['Home Score'].sum() 
                    + mlbStage.groupby(['Year','Visiting Team'])['Visiting Score'].sum()).reset_index()
runsFor.columns = ['Year','Team','runsFor']

runsAgainst = pd.DataFrame(mlbStage.groupby(['Year','Home Team'])['Visiting Score'].sum() 
                    + mlbStage.groupby(['Year','Visiting Team'])['Home Score'].sum()).reset_index()
runsAgainst.columns = ['Year','Team','runsAgainst']


hitsFor = pd.DataFrame(mlbStage.groupby(['Year','Home Team'])['Home Hits'].sum() 
                    + mlbStage.groupby(['Year','Visiting Team'])['Visiting Hits'].sum()).reset_index()
hitsFor.columns = ['Year','Team','hitsFor']

hitsAgainst = pd.DataFrame(mlbStage.groupby(['Year','Home Team'])['Visiting Hits'].sum() 
                    + mlbStage.groupby(['Year','Visiting Team'])['Home Hits'].sum()).reset_index()
hitsAgainst.columns = ['Year','Team','hitsAgainst']


pitchersUsed = pd.DataFrame(((mlbStage.groupby(['Year','Home Team'])['Home pitchers used'].sum() 
                    + mlbStage.groupby(['Year','Visiting Team'])['Visiting pitchers used'].sum())
                    /(mlbStage.groupby(['Year','Home Team'])['Home pitchers used'].count()
                    + mlbStage.groupby(['Year','Visiting Team'])['Visiting pitchers used'].count())
                    )).reset_index()
pitchersUsed.columns = ['Year','Team','avgPitchersUsed']
pitchersUsed = pitchersUsed.round(decimals=2)



totalOuts = pd.DataFrame(mlbStage.groupby(['Year','Home Team'])['Home put outs'].sum() 
                    + mlbStage.groupby(['Year','Visiting Team'])['Visiting put outs'].sum()).reset_index()
totalOuts.columns = ['Year','Team','totalOuts']
#inningsPitched = pitchersUsed.round(decimals=2)
    
earnedRunsAllowed = pd.DataFrame(mlbStage.groupby(['Year','Home Team'])['Visiting team earned runs'].sum() 
                    + mlbStage.groupby(['Year','Visiting Team'])['Home team earned runs'].sum()).reset_index()
earnedRunsAllowed.columns = ['Year','Team','earnedRunsAllowed']


totalWalksAllowed = pd.DataFrame(mlbStage.groupby(['Year','Home Team'])['Visiting Walks'].sum() 
                    + mlbStage.groupby(['Year','Visiting Team'])['Home Walks'].sum()).reset_index()
totalWalksAllowed.columns = ['Year','Team','totalWalksAllowed']

totalWalksEarned = pd.DataFrame(mlbStage.groupby(['Year','Home Team'])['Home Walks'].sum() 
                    + mlbStage.groupby(['Year','Visiting Team'])['Visiting Walks'].sum()).reset_index()
totalWalksEarned.columns = ['Year','Team','totalWalksEarned']

totalExtraBaseHits = pd.DataFrame(
                    mlbStage.groupby(['Year','Home Team'])['Home Doubles'].sum() 
                    + mlbStage.groupby(['Year','Home Team'])['Home Triples'].sum()
                    + mlbStage.groupby(['Year','Home Team'])['Home Homeruns'].sum()
                    + mlbStage.groupby(['Year','Visiting Team'])['Visiting Doubles'].sum()
                    + mlbStage.groupby(['Year','Visiting Team'])['Visiting Triples'].sum() 
                    + mlbStage.groupby(['Year','Visiting Team'])['Visiting Homeruns'].sum() 
                                 ).reset_index()
totalExtraBaseHits.columns = ['Year','Team','totalExtraBaseHits']

atBats =  pd.DataFrame(mlbStage.groupby(['Year','Home Team'])['Home At-Bats'].sum() 
                    + mlbStage.groupby(['Year','Visiting Team'])['Visiting At-Bats'].sum()).reset_index()
atBats.columns = ['Year','Team', 'atBats']

master = pd.merge(wins, losses, on=['Year','Team'])
master = pd.merge(master, runsFor, on=['Year','Team'])
master = pd.merge(master, runsAgainst, on=['Year','Team'])
master = pd.merge(master, hitsFor, on=['Year','Team'])
master = pd.merge(master, hitsAgainst, on=['Year','Team'])
master = pd.merge(master, pitchersUsed, on=['Year','Team'])
master = pd.merge(master, earnedRunsAllowed, on=['Year','Team'])
master = pd.merge(master, totalOuts, on=['Year','Team'])
master['teamERA'] = np.round(9*master['earnedRunsAllowed']  / (master['totalOuts']/3),decimals=2)
master = pd.merge(master, totalWalksAllowed, on=['Year','Team'])
master['WHIP'] = np.round((master['totalWalksAllowed']+master['hitsAgainst'])/(master['totalOuts']/3),decimals=2)
#master['teamBattingAverage'] = np.round(master['hitsFor']/master['totalOuts'])
master = pd.merge(master, totalExtraBaseHits, on=['Year','Team'])
master = pd.merge(master, atBats, on=['Year','Team'])
master = pd.merge(master, totalWalksEarned, on=['Year','Team'])
master['battingAverage'] = np.round(master['hitsFor'] / (master['atBats']-master['totalWalksEarned']),decimals=3)
master.head(5)


Unnamed: 0,Year,Team,Wins,Losses,runsFor,runsAgainst,hitsFor,hitsAgainst,avgPitchersUsed,earnedRunsAllowed,totalOuts,teamERA,totalWalksAllowed,WHIP,totalExtraBaseHits,atBats,totalWalksEarned,battingAverage
0,2013,ANA,9,16,105,129,237,248,4.61,90,708,3.43,101,1.48,73,902,72,0.286
1,2013,ARI,15,10,110,91,220,222,4.25,95,709,3.62,72,1.24,79,880,78,0.274
2,2013,ATL,16,9,106,82,202,209,3.83,102,667,4.13,64,1.23,75,830,85,0.271
3,2013,BAL,15,10,129,104,229,191,3.93,117,672,4.7,92,1.26,81,860,79,0.293
4,2013,BOS,18,7,128,88,230,182,3.86,117,669,4.72,94,1.24,89,846,92,0.305


In [14]:
#Bring in playoffs and world series dataset.
postSeasonHist = pd.read_csv('Playoffs&WS.csv')

In [15]:
postSeasonHist

Unnamed: 0,Year,Team,Playoff Indicator,World Series Indicator
0,2017,ANA,0,0
1,2017,ARI,1,0
2,2017,ATL,0,0
3,2017,BAL,0,0
4,2017,BOS,1,0
5,2017,CHA,0,0
6,2017,CHN,1,0
7,2017,CIN,0,0
8,2017,CLE,1,0
9,2017,COL,1,0


In [16]:
train = master[['Year','Team','Wins','Losses','runsFor','runsAgainst','hitsFor','hitsAgainst','avgPitchersUsed',
                      'teamERA','WHIP','totalExtraBaseHits','battingAverage']]
#train.head(5)

In [17]:
train['Year'] = train['Year'].astype(str).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [18]:
train = pd.merge(train, postSeasonHist, on=['Year','Team'])
train.head(5)

Unnamed: 0,Year,Team,Wins,Losses,runsFor,runsAgainst,hitsFor,hitsAgainst,avgPitchersUsed,teamERA,WHIP,totalExtraBaseHits,battingAverage,Playoff Indicator,World Series Indicator
0,2013,ANA,9,16,105,129,237,248,4.61,3.43,1.48,73,0.286,0,0
1,2013,ARI,15,10,110,91,220,222,4.25,3.62,1.24,79,0.274,0,0
2,2013,ATL,16,9,106,82,202,209,3.83,4.13,1.23,75,0.271,1,0
3,2013,BAL,15,10,129,104,229,191,3.93,4.7,1.26,81,0.293,0,0
4,2013,BOS,18,7,128,88,230,182,3.86,4.72,1.24,89,0.305,1,1


In [19]:
#testData - Beginning of season through April, 2018.
#testData source: 
#http://mlb.mlb.com/stats
#http://mlb.mlb.com/stats/sortable.jsp#elem=%5Bobject+Object%5D&tab_level=child&click_text=Sortable+Player+hitting&game_type='R'&season=2018&season_type=ANY&league_code='MLB'&sectionType=sp&statType=hitting&page=1&ts=1525201729915&playerType=ALL&timeframe=&last_x_days=&split=4
testData = pd.read_csv('testDataFinal.csv')
testData['Playoff Indicator'] = 0
testData['World Series Indicator'] = 0
#testData.head(5)
testDataPlayoffTarget = testData['Playoff Indicator']
testDataWorldSeriesTarget = testData['World Series Indicator']
testDataModel = testData.iloc[:,3:13]
#testDataModel.head(5)

In [20]:
testData.head(5)
#testDataModel.head(5)

Unnamed: 0,Year,Team,Team Name,Wins,Losses,runsFor,runsAgainst,hitsFor,hitsAgainst,teamERA,WHIP,totalExtraBaseHits,battingAverage,Playoff Indicator,World Series Indicator
0,2018,ARI,Arizona Diamondbacks,18,7,114,78,194,178,2.88,1.09,89,0.232,0,0
1,2018,ATL,Atlanta Braves,14,10,124,101,223,187,3.67,1.4,88,0.261,0,0
2,2018,BAL,Baltimore Orioles,7,19,93,144,207,259,5.04,1.5,77,0.232,0,0
3,2018,BOS,Boston Red Sox,19,6,151,88,248,202,3.4,1.18,97,0.278,0,0
4,2018,CHN,Chicago Cubs,14,9,114,85,200,168,3.43,1.3,71,0.257,0,0


In [21]:
#Dropping avgPitchersUsed - unable to find that data point for current year & specific time frame.
train = train.drop(['avgPitchersUsed'], axis=1)

In [22]:
mlbLabelMap = {
            'ANA':1,
            'ARI':2,
            'ATL':3,
            'BAL':4,
            'BOS':5,
            'CHA':6,
            'CHN':7,
            'CIN':8,
            'CLE':9,
            'COL':10,
            'DET':11,
            'HOU':12,
            'KCA':13,
            'LAN':14,
            'MIA':15,
            'MIL':16,
            'MIN':17,
            'NYA':18,
            'NYN':19,
            'OAK':20,
            'PHI':21,
            'PIT':22,
            'SDN':23,
            'SEA':24,
            'SFN':25,
            'SLN':26,
            'TBA':27,
            'TEX':28,
            'TOR':29,
            'WAS':30
                }
train['teamNumber'] = train['Team'].map(mlbLabelMap)

In [27]:
#Playoff Model
#Decision Tree

X = train.iloc[:,2:12]
yPlayoff = np.array(train.iloc[:,12:13])

#np.array

yPlayoff = np.reshape(yPlayoff,150,)
yWorldSeries = train.iloc[:,13:14]

X_train, X_test, y_train, y_test = train_test_split(X,yPlayoff, random_state=0)
                      
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=2)
tree.fit(X_train,y_train)
pred_tree = tree.predict(X_test)
unique, counts = np.unique(pred_tree, return_counts=True)
print (dict(zip(unique, counts)))
round(tree.score(X_test,y_test),2)

{0: 13, 1: 25}


0.57999999999999996

In [28]:
#Logistic Regression

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=0.4)
logreg.fit(X_train,y_train)
pred_logreg = logreg.predict(X_test)
logreg.score(X_test,y_test)

0.63157894736842102

In [29]:
#KNN

#Import KNN Classifier
from sklearn.neighbors import KNeighborsClassifier

# Import classification_report and confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Instantiate KNN classifier
knn = KNeighborsClassifier(n_neighbors=2)

# Fit the classifier to the training data
knn.fit(X_train,y_train)

# Predict the labels of the test data: y_pred
y_pred = knn.predict(X_test)

# Generate the confusion matrix and classification report
print ("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print ("Classification Report:")
print(classification_report(y_test, y_pred))

print ("Where 0 is predicted to not get into the playoffs, and 1 means predicted to get into the playoffs")
print(knn.score(X_test, y_test))


Confusion Matrix:
[[18  3]
 [14  3]]
Classification Report:
             precision    recall  f1-score   support

          0       0.56      0.86      0.68        21
          1       0.50      0.18      0.26        17

avg / total       0.53      0.55      0.49        38

Where 0 is predicted to not get into the playoffs, and 1 means predicted to get into the playoffs
0.552631578947


In [30]:
#GaussianNB

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
# make predictions
y_pred = gnb.predict(X_test)
from sklearn.metrics import accuracy_score
print("Gaussian Naive Bayes accuracy:", round((accuracy_score(y_test, y_pred)*100),2))

Gaussian Naive Bayes accuracy: 71.05


In [31]:
playoffPrediction = gnb.fit(X_train, y_train).predict(testDataModel)

In [34]:
#testDataModel

In [35]:
testDataPredictions = testDataModel
testDataPredictions['Team Name'] = testData['Team Name']
testDataPredictions['Playoff Predictor'] = playoffPrediction

In [36]:
#testDataPredictions
testDataPredictions.loc[testDataPredictions['Playoff Predictor'] == 1]

Unnamed: 0,Wins,Losses,runsFor,runsAgainst,hitsFor,hitsAgainst,teamERA,WHIP,totalExtraBaseHits,battingAverage,Team Name,Playoff Predictor
0,18,7,114,78,194,178,2.88,1.09,89,0.232,Arizona Diamondbacks,1
1,14,10,124,101,223,187,3.67,1.4,88,0.261,Atlanta Braves,1
3,19,6,151,88,248,202,3.4,1.18,97,0.278,Boston Red Sox,1
4,14,9,114,85,200,168,3.43,1.3,71,0.257,Chicago Cubs,1
7,14,11,95,96,199,191,3.58,1.09,74,0.231,Cleveland Indians,1
10,18,9,135,73,231,175,2.47,0.99,75,0.256,Houston Astros,1
18,16,9,151,110,222,198,3.76,1.25,93,0.258,New York Yankees,1
20,15,10,118,90,192,191,3.38,1.19,65,0.234,Philadelphia Phillies,1
21,16,12,129,112,240,230,3.83,1.26,81,0.255,Pittsburgh Pirates,1
24,15,10,121,123,219,221,5.01,1.35,89,0.256,Seattle Mariners,1


In [38]:
#GaussianNB for World Series

X_trainWS, X_testWS, y_trainWS, y_testWS = train_test_split(X,yWorldSeries, random_state=0)


gnb = GaussianNB()
gnb.fit(X_trainWS, y_trainWS)
# make predictions
y_predWS = gnb.predict(X_testWS)
from sklearn.metrics import accuracy_score
print("Gaussian Naive Bayes accuracy:", round((accuracy_score(y_testWS, y_predWS)*100),2))

Gaussian Naive Bayes accuracy: 92.11


  y = column_or_1d(y, warn=True)


In [43]:
WSPrediction = gnb.fit(X_trainWS, y_trainWS).predict(testDataModel.iloc[:,0:10])

  y = column_or_1d(y, warn=True)


In [44]:
WSPrediction

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [45]:
#Logistic Regression WS

logreg = LogisticRegression(C=0.4)
logreg.fit(X_trainWS,y_trainWS)
pred_logreg = logreg.predict(X_testWS)
logreg.score(X_testWS,y_testWS)

  y = column_or_1d(y, warn=True)


0.97368421052631582

In [46]:
WSPredictionLogReg = logreg.fit(X_trainWS, y_trainWS).predict(testDataModel.iloc[:,0:10])

  y = column_or_1d(y, warn=True)


In [47]:
WSPredictionLogReg

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [48]:
testDataPredictions['WS Predictor'] = WSPredictionLogReg

In [49]:
#testDataPredictions
testDataPredictions.loc[testDataPredictions['WS Predictor'] == 1]

Unnamed: 0,Wins,Losses,runsFor,runsAgainst,hitsFor,hitsAgainst,teamERA,WHIP,totalExtraBaseHits,battingAverage,Team Name,Playoff Predictor,WS Predictor
10,18,9,135,73,231,175,2.47,0.99,75,0.256,Houston Astros,1,1


In [None]:
#######################################################################


#The remaining cells are further testing of the World Series Model and are not in my presentation.
#I was able to see different results in accuracy but wasn't able to predict
#the winner of the world series with any model besides Logistic Regression.


#######################################################################

In [53]:
#World Series Model
#Decision Tree

X = train.iloc[:,2:12]
#yPlayoff = np.array(train.iloc[:,12:13])
yWorldSeries = train.iloc[:,13:14]

yWorldSeries = np.reshape(yWorldSeries,150,)

#yPlayoff = np.reshape(yPlayoff,150,)


X_train, X_test, y_train, y_test = train_test_split(X,yWorldSeries, random_state=0)
                      
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=2)
tree.fit(X_train,y_train)
pred_tree = tree.predict(X_test)
unique, counts = np.unique(pred_tree, return_counts=True)
print (dict(zip(unique, counts)))
round(tree.score(X_test,y_test),2)

{0: 32, 1: 6}


0.81999999999999995

In [54]:
WSPredictionTree = tree.fit(X_train, y_train).predict(testDataModel.iloc[:,0:10])

In [55]:
WSPredictionTree

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [56]:
#KNN World Series

#Import KNN Classifier
from sklearn.neighbors import KNeighborsClassifier

# Import classification_report and confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Instantiate KNN classifier
knn = KNeighborsClassifier(n_neighbors=2)

# Fit the classifier to the training data
knn.fit(X_train,y_train)

# Predict the labels of the test data: y_pred
y_pred = knn.predict(X_test)

# Generate the confusion matrix and classification report
print ("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print ("Classification Report:")
print(classification_report(y_test, y_pred))

print ("Where 0 is predicted not to win the World Series, and 1 means winning the world series")
print(knn.score(X_test, y_test))


Confusion Matrix:
[[37  0]
 [ 1  0]]
Classification Report:
             precision    recall  f1-score   support

          0       0.97      1.00      0.99        37
          1       0.00      0.00      0.00         1

avg / total       0.95      0.97      0.96        38

Where 0 is predicted not to win the World Series, and 1 means winning the world series
0.973684210526


  'precision', 'predicted', average, warn_for)


In [57]:
WSPredictionKNN = knn.fit(X_train, y_train).predict(testDataModel.iloc[:,0:10])

  if __name__ == '__main__':


In [58]:
WSPredictionKNN

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0], dtype=int64)