In [1]:
import pandas as pd
import os
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns



In [2]:
# Loads the player data into a pandas dataframe
player_data = os.path.join('data','edited_stats.csv')
df = pd.read_csv(player_data)
print(len(df))

17100


In [3]:
df.columns

Index(['Year', 'Player', 'Age', 'G', 'MP', 'PER', 'TS%', 'WS', 'VORP', 'FG%',
       'FT%', 'TRB', 'AST', 'STL', 'BLK', 'PTS', 'PPG', 'RPG', 'APG', 'SPG',
       'BPG', 'MVP'],
      dtype='object')

In [4]:
#Prints the number of MVPs and non MVPs
df['MVP'].value_counts()

0    17067
1       33
Name: MVP, dtype: int64

In [7]:
# Removes all the NaN rows and columns
df.replace('', np.nan, inplace=True)
df.dropna(inplace=True)

#displays data only after 2000
df = df[df['Year'] >= 2000] 
y = df['MVP'] #Stores MVP list in this dataframe

# the x dataframe only contains the statistics to create the model
x = df.drop(['Player'], axis=1)
x = x.drop(['MVP'], axis=1)
x = x.drop(['Year'], axis=1)

print(len(x))
x.head(10)

9758


Unnamed: 0,Age,G,MP,PER,TS%,WS,VORP,FG%,FT%,TRB,AST,STL,BLK,PTS,PPG,RPG,APG,SPG,BPG
6896,25,61,1578,13.6,0.477,2.2,0.2,0.424,0.756,291,98,59,28,697,11.42623,4.770492,1.606557,0.967213,0.459016
6897,25,46,1205,14.4,0.484,2.0,0.4,0.433,0.762,239,72,53,16,563,12.23913,5.195652,1.565217,1.152174,0.347826
6898,25,15,373,10.8,0.448,0.2,-0.2,0.389,0.738,52,26,6,12,134,8.933333,3.466667,1.733333,0.4,0.8
6899,23,82,3223,20.2,0.547,8.8,3.5,0.465,0.809,825,271,89,87,1663,20.280488,10.060976,3.304878,1.085366,1.060976
6900,26,29,329,8.8,0.381,-0.1,-0.1,0.286,0.773,42,58,24,2,82,2.827586,1.448276,2.0,0.827586,0.068966
6901,24,82,3070,20.6,0.57,10.1,3.9,0.455,0.887,359,308,110,19,1809,22.060976,4.378049,3.756098,1.341463,0.231707
6902,23,27,361,4.3,0.31,-0.7,-0.7,0.284,0.75,23,70,12,0,60,2.222222,0.851852,2.592593,0.444444,0.0
6903,29,80,1684,13.2,0.505,2.4,-0.7,0.437,0.766,266,95,35,37,836,10.45,3.325,1.1875,0.4375,0.4625
6904,25,64,2201,16.9,0.542,3.3,1.0,0.438,0.877,258,220,90,11,1080,16.875,4.03125,3.4375,1.40625,0.171875
6905,29,82,2593,17.4,0.524,7.3,2.1,0.44,0.775,225,420,139,8,1149,14.012195,2.743902,5.121951,1.695122,0.097561


In [8]:
# Sets the testing and training set for the model
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0, stratify=y)

In [9]:
# Will be using a Logisitic Regression model
model = LogisticRegression ()

In [10]:
model.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
from sklearn.metrics import accuracy_score
predictions = model.predict(x_test)
print(accuracy_score(y_test, predictions)) # prints the accuracy of the model

0.9987704918032787


In [44]:
# loads the current data to make the prediction for the year 2018
current_data = os.path.join('data','final_2018_stats.csv')
current_stats_df = pd.read_csv(current_data)

In [48]:
# modifies the data so that it matches up with the training set format
current_stats_df.replace('', np.nan, inplace=True)
current_stats_df.dropna(inplace=True)

players = current_stats_df['Player']

current_stats_df = current_stats_df.drop(['Player'], axis=1)
final_df = current_stats_df.drop(['Year'], axis=1)

print(len(final_df))
final_df.head(20)

606


Unnamed: 0,Age,G,MP,PER,TS%,WS,VORP,FG%,FT%,TRB,AST,STL,BLK,PTS,PPG,RPG,APG,SPG,BPG
0,24,75,1132.5,9.0,0.567,2.2,-0.1,0.395,0.848,112.5,30.0,37.5,7.5,352.5,4.7,1.5,0.4,0.5,0.1
1,27,70,1358.0,8.2,0.525,1.0,-0.1,0.356,0.817,259.0,56.0,35.0,28.0,413.0,5.9,3.7,0.8,0.5,0.4
2,24,76,2485.2,20.6,0.63,9.7,3.3,0.629,0.557,684.0,91.2,91.2,76.0,1056.4,13.9,9.0,1.2,1.2,1.0
3,20,69,1366.2,15.7,0.57,4.2,0.8,0.512,0.721,379.5,103.5,34.5,41.4,476.1,6.9,5.5,1.5,0.5,0.6
4,32,53,683.7,5.8,0.516,0.1,-0.7,0.401,0.846,63.6,31.8,5.3,10.6,180.2,3.4,1.2,0.6,0.1,0.2
5,29,21,48.3,6.0,0.34,0.0,-0.1,0.333,0.333,14.7,2.1,2.1,0.0,12.6,0.6,0.7,0.1,0.1,0.0
6,32,75,2512.5,25.0,0.57,10.9,3.3,0.51,0.837,637.5,150.0,45.0,90.0,1732.5,23.1,8.5,2.0,0.6,1.2
7,19,72,1440.0,17.5,0.636,4.2,0.8,0.589,0.776,388.8,50.4,28.8,86.4,590.4,8.2,5.4,0.7,0.4,1.2
8,25,18,106.2,2.6,0.366,-0.1,-0.1,0.273,0.778,10.8,12.6,3.6,1.8,19.8,1.1,0.6,0.7,0.2,0.1
9,36,22,272.8,8.7,0.514,0.1,-0.2,0.484,0.524,46.2,8.8,11.0,2.2,103.4,4.7,2.1,0.4,0.5,0.1


In [72]:
#current predictions are exported to a csv file only containing the row number and prediction (0 or 1) for the player
current_prediction = model.predict(final_df)
prediction = pd.DataFrame(current_prediction, columns=['MVP']).to_csv('prediction.csv')
prediction_df = pd.read_csv('prediction.csv')

In [84]:
# this function takes the row number from the prediction file and locates the MVP in the current inputted dataset
def find_MVP(df):
    location_MVP = df.loc[df['MVP']==1].index[0]
    player_name = players.iloc[location_MVP]
    print("The MVP from the inputted dataset is:", player_name.split('\\')[0])

In [85]:
# calls the method to print the predicted MVP
find_MVP(prediction_df)

The MVP from the inputted dataset is: James Harden
