In [58]:
# This is an example of a machine learning project
# The goal is to predict the state someone was born based on their career passing yards
# There is probably no correlation here whatsoever, so the model will likely not be 
# very accurate. But this goes to show you can build a model for whatever you want and
# see if it works

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [59]:
# Read into dataframes "Basic_Stats.csv" and "Career_Stats_Passing.csv"
basic_stats = pd.read_csv("./data/Basic_Stats.csv")
passing_stats = pd.read_csv("./data/Career_Stats_Passing.csv")

# Merge the two dataframes into one dataframe
merged = pd.merge(basic_stats, passing_stats, on="Player Id")

In [60]:
# Split "Birth Place" column into "City" and "State" columns
merged[['BirthCity', 'BirthState']] = merged['Birth Place'].str.split(' , ', expand=True)

# Drop "Birth Place" column
merged.drop('Birth Place', axis=1, inplace=True)

# List of US States
states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware',
            'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
            'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
            'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
            'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
            'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
            'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']
states_abbrev = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
            'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
            'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
            'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
            'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']

# For BirthState col, replace state name with state abbreviation
# This is because sometimes lists New York, sometimes NY, so make it consisten
merged['BirthState'] = merged['BirthState'].replace(states, states_abbrev)

In [61]:
# Some data cleanup and preprocessing

# For passing yards, replace "--" with 0
merged['Passing Yards'] = merged['Passing Yards'].replace('--', 0)

# Remove non numeric chars from passing yards
merged['Passing Yards'] = merged['Passing Yards'].str.replace(',', '')
merged['Passing Yards'] = merged['Passing Yards'].str.replace('"', '')

merged['Passing Yards'] = merged['Passing Yards'].fillna(0)

# Convert passing yards col to int
merged['Passing Yards'] = merged['Passing Yards'].astype(int)

In [62]:
# Aggreate total number of passing yards per player id
merged.groupby(['Player Id'], as_index=False).agg({'Passing Yards': 'sum', 'BirthState': 'first'})

print(merged.keys())
print(merged['BirthState'])
print(merged['Passing Yards'])

# Drop any remaining NA cols from birthstate and passing yards
merged = merged.dropna(subset=['BirthState', 'Passing Yards'])

# Dictionary assigning US state abbrev to int
state_dict = {'AL': 1, 'AK': 2, 'AZ': 3, 'AR': 4, 'CA': 5, 'CO': 6, 'CT': 7, 'DE': 8, 'FL': 9, 'GA': 10,
            'HI': 11, 'ID': 12, 'IL': 13, 'IN': 14, 'IA': 15, 'KS': 16, 'KY': 17, 'LA': 18, 'ME': 19, 'MD': 20,
            'MA': 21, 'MI': 22, 'MN': 23, 'MS': 24, 'MO': 25, 'MT': 26, 'NE': 27, 'NV': 28, 'NH': 29, 'NJ': 30,
            'NM': 31, 'NY': 32, 'NC': 33, 'ND': 34, 'OH': 35, 'OK': 36, 'OR': 37, 'PA': 38, 'RI': 39, 'SC': 40,
            'SD': 41, 'TN': 42, 'TX': 43, 'UT': 44, 'VT': 45, 'VA': 46, 'WA': 47, 'WV': 48, 'WI': 49, 'WY': 50}

# Add column to merged df with int value of state
# ML models only work on numerical data, so we need to convert state to int
# There are lots of ways to do this, like Word2Vec, but for simplicity we will use a dictionary
merged['StateInt'] = merged['BirthState'].map(state_dict)
#dropna from stateint
merged = merged.dropna(subset=['StateInt'])

Index(['Age', 'Birthday', 'College', 'Current Status', 'Current Team',
       'Experience', 'Height (inches)', 'High School', 'High School Location',
       'Name_x', 'Number', 'Player Id', 'Position_x', 'Weight (lbs)',
       'Years Played', 'Name_y', 'Position_y', 'Year', 'Team', 'Games Played',
       'Passes Attempted', 'Passes Completed', 'Completion Percentage',
       'Pass Attempts Per Game', 'Passing Yards', 'Passing Yards Per Attempt',
       'Passing Yards Per Game', 'TD Passes', 'Percentage of TDs per Attempts',
       'Ints', 'Int Rate', 'Longest Pass', 'Passes Longer than 20 Yards',
       'Passes Longer than 40 Yards', 'Sacks', 'Sacked Yards Lost',
       'Passer Rating', 'BirthCity', 'BirthState'],
      dtype='object')
0       WY
1       WY
2       WY
3       IA
4       IA
        ..
8520    NC
8521    NC
8522    NC
8523    NC
8524    NC
Name: BirthState, Length: 8525, dtype: object
0         0
1         0
2       108
3         0
4       159
       ... 
8520      0
852

In [63]:
# Given number of passing yards, ml model to predict their birth state
# X = passing yards, y = birth state
X = merged["Passing Yards"].values.reshape(-1, 1)
y = merged["StateInt"].values.reshape(-1, 1)

# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Create linear regression model
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# Predicting what state someone is from is a multi-class classification task
# Popular ML models for this are:
# k-Nearest Neighbors.
# Decision Trees.
# Naive Bayes.
# Random Forest.
# Gradient Boosting.

# Perform 5 fold cross validation on each model
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Ignore warnings
import warnings
warnings.filterwarnings('ignore') 

models = [KNeighborsClassifier(n_neighbors=3), DecisionTreeClassifier(), GaussianNB(), RandomForestClassifier(), GradientBoostingClassifier()]

for model in models:
    scores = cross_val_score(model, X, y, cv=5)
    print(model)
    print(f"Cross Validation Accuracy: {round(scores.mean()*100, 3)}%\n")


KNeighborsClassifier(n_neighbors=3)
Cross Validation Accuracy: 8.4%

DecisionTreeClassifier()
Cross Validation Accuracy: 10.746%

GaussianNB()
Cross Validation Accuracy: 5.09%

RandomForestClassifier()
Cross Validation Accuracy: 10.602%

GradientBoostingClassifier()
Cross Validation Accuracy: 9.555%



In [64]:
# Are these models any good?
# After all, it seems like there would hardly be any correlation between passing yards and birth state
# Lets build a model where we just randomly guess

# 3 methods:
# 1. Randomly guess a state (2% chance of being right)
# 2. Guess the most common state (chance of being right equal to most popular state)
# 3. Get probabilities of each state, then randomly guess based on those probabilities

In [65]:
# Method 1
# Build model that randomly guesses a state
import random
random.seed(42)

# Get list of all states
states = merged['StateInt'].unique()

# Get number of rows in test set
num_rows = len(X_test)

# Calculate accuracy of random state model
from sklearn.metrics import accuracy_score

# Avg score from 5 random tests
random_scores = []
for _ in range(5):
    random_states = [random.choice(states) for _ in range(num_rows)]
    random_scores.append(accuracy_score(y_test, random_states))

print(f"Random State Model Average Accuracy: {round(np.mean(random_scores)*100, 3)}%\n")

Random State Model Average Accuracy: 2.021%



In [73]:
# Method 2

# Get percent of people from each state
state_counts = merged['BirthState'].value_counts(normalize=True)
print(state_counts.head(1))

# So guessing CA for everything would give 12.948% accuracy
# ML models may learn this, so you have to do a lot of testing as well as 
# understanding the data to see if the model is actually learning anything

print(f"\nMost Common State Model Accuracy: {round(state_counts[0]*100, 3)}%\n")

CA    0.129483
Name: BirthState, dtype: float64

Most Common State Model Accuracy: 12.948%



In [67]:
# Method 3
# Use those probabilities to randomly guess a state
# This is called a multinomial distribution

# Get probabilities of each state
state_probs = merged['BirthState'].value_counts(normalize=True).values

# Get list of all states
states = merged['StateInt'].unique()

# Get number of rows in test set
num_rows = len(X_test)

# Calculate accuracy of random state model
from sklearn.metrics import accuracy_score

# Avg score from 5 random tests
random_scores = []
for _ in range(5):
    random_states = [random.choices(states, weights=state_probs)[0] for _ in range(num_rows)]
    random_scores.append(accuracy_score(y_test, random_states))

print(f"Random State Model Average Accuracy: {round(np.mean(random_scores)*100, 3)}%\n")

Random State Model Average Accuracy: 3.109%

