## Import 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Pull the DataFrame 

In [3]:
df = pd.read_csv('data/pgaTourData.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/pgaTourData.csv'

## Description of the Data

- **Player Name**: Name of the golfer
- **Rounds**: The number of games that a player played
- **Fairway Percentage**: The percentage of time a tee shot lands on the fairway
- **Year**: The year in which the statistic was collected
- **Avg Distance**: The average distance of the tee-shot
- **gir**: (Green in Regulation) is met if any part of the ball is touching the putting surface while the number of strokes taken is at least two fewer than par
- **Average Putts**: The average number of strokes taken on the green
- **Average Scrambling**: Scrambling is when a player misses the green in regulation, but still makes par or better on a hole
- **Average Score**: Average Score is the average of all the scores a player has played in that year
- **Points**: The number of FedExCup points a player earned in that year. These points can be earned by competing in tournaments.
- **Wins**: The number of competition a player has won in that year
- **Top 10**: The number of competitions where a player has placed in the Top 10
- **Average SG Putts**: Strokes gained: putting measures how many strokes a player gains (or loses) on the greens.
- **Average SG Total**: The Off-the-tee + approach-the-green + around-the-green + putting statistics combined
- **SG:OTT**: Strokes gained: off-the-tee measures player performance off the tee on all par-4s and par-5s.
- **SG:APR**: Strokes gained: approach-the-green measures player performance on approach shots. Approach shots include all shots that are not from the tee on par-4 and par-5 holes and are not included in strokes gained: around-the-green and strokes gained: putting. Approach shots include tee shots on par-3s.
- **SG:ARG**: Strokes gained: around-the-green measures player performance on any shot within 30 yards of the edge of the green. This statistic does not include any shots taken on the putting green.
- **Money**: The amount of prize money a player has earned from tournaments.

In [None]:
# View the data info
df.info()

In [None]:
# Find any null values in the dataset
df.isna().sum()

In [None]:
# Remove all commas and $ from 'Money' column and then assign to float dtype
df['Money'] = df['Money'].str.replace(',', '').str.replace('$', '').astype(float)
df

In [None]:
# Replace NaN with 0 in Top 10
df['Top 10'].fillna(0, inplace=True)
df['Top 10'] = df['Top 10'].astype(int)
df

In [None]:
# Replace NaN with 0 in # of wins
df['Wins'].fillna(0, inplace=True)
df['Wins'] = df['Wins'].astype(int)
df

In [None]:
# Drop NaN values
df.dropna(axis = 0, inplace=True)

# Change Rounds to int
df['Rounds'] = df['Rounds'].astype(int)
df

In [None]:
# Change Points to int
df['Points'] = df['Points'].apply(lambda x: x.replace(',',''))
df['Points'] = df['Points'].astype(int)
df

In [None]:
# Insert underscores to column names where needed and lowercase all columns
df.columns=df.columns.str.replace(' ', '_').str.lower()

# Insert underscores to all players under the 'player_name' column.
df["player_name"] = df["player_name"].str.replace(' ', '_')
df

In [None]:
# Import Seaborn and look at correlations for dataset
import seaborn as sns
corr = df.corr()
sns.heatmap(corr)

In [None]:
# Create 'winners' column - anyone with wins gets a 1; anyone without wins gets a 0.
df['winners'] = df['wins'].apply(lambda x: 1 if x > 0 else 0)
df

In [None]:
# Strongest correlation with 'winners'
df.corr()['winners'].sort_values(ascending=False).abs()

## Bar Plots, Strip Plot, and Box Plot for relationship between 'year' and 'points'. And 'winners' is the hue.

In [None]:
sns.stripplot(y = df['points'], x = df['year'], hue = df['winners'])
plt.legend(bbox_to_anchor=(1, 1), loc=2)

In [None]:
box = sns.boxplot(x = 'year', y = 'points', hue = 'winners', data = df);

In [None]:
sns.set_style('darkgrid')
sns.barplot(data=df, x="year", y="points", hue='winners')

In [None]:
sns.barplot(x = df['winners'], y = df['points'])

In [None]:
sns.countplot(x = df['winners'])

## Bar Plots, Strip Plot, and Box Plot for relationship between 'year' and 'money'. And 'winners' is the hue.

In [None]:
sns.stripplot(y = df['money'], x = df['year'], hue = df['winners'])
plt.legend(bbox_to_anchor=(1, 1), loc=2)

In [None]:
box2 = sns.boxplot(x = 'year', y = 'money', hue = 'winners', data = df);

In [None]:
sns.set_style('darkgrid')
sns.barplot(data=df, x="year", y="money", hue='winners')

In [None]:
sns.barplot(x = df['winners'], y = df['money'])

In [None]:
sns.countplot(x = df['winners'])

## Create Models for 'winners'

In [None]:
# Import tools you'll need for models
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, plot_confusion_matrix

## Data Preparation

- Dropped 'wins' column because we used it to create 'winners' column.
- Dropped 'points' column because Cameron will create models based on points.
- Dropped 'money' column because Elliott will create models based on money.
- Dropped 'player_name' column because it's a string.

In [None]:
# Target
y = df['winners']
# Features
X = df.drop(['winners', 'wins', 'points', 'player_name'], axis=1)

### Train-Test-Split (Testing Data, Training Data, Validation Data)

In [None]:
# Initial Train-Test Split for Testing Data
X_tr, X_test, y_tr, y_test = train_test_split(X, y, test_size=.25, random_state=2021)

In [None]:
# Train-Test-Split for Training Data and Validation Data
X_train, X_val, y_train, y_val = train_test_split(X_tr, y_tr, test_size=.25, random_state=2021)

### Check null values for training data and validation data

In [None]:
# Check again for null values
X_train.isna().sum().sum()

In [None]:
# Check again for null values
X_val.isna().sum().sum()

### Pull all numerical columns from dataset

In [None]:
# Select only numerical columns
num_cols = list(X.select_dtypes('number').columns)
num_cols

## Scale the dataset so the data's distribution will even out.

In [None]:
# Instantiate and Fit StandardScaler
scaler = StandardScaler()
scaler.fit(X_train[num_cols])

# Transform and Convert to DataFrame
X_train_scaled = pd.DataFrame(scaler.transform(X_train[num_cols]), columns=num_cols)
X_val_scaled = pd.DataFrame(scaler.transform(X_val[num_cols]), columns=num_cols)

In [None]:
X_train_scaled.head()

In [None]:
X_val_scaled.head()

## First Model - Logistic Regression

In [None]:
# Instantiate LogisticRegression
logreg = LogisticRegression(solver='liblinear')

### Create a function you can use to run all your models for the dataset

In [None]:
def modeling_function(model, X_train, y_train, X_val, y_val):

    # fit model on training data
    model.fit(X_train, y_train)

    # make predictions on training and validation data
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_val)

    # Print accuracy score
    print('Training accuracy: ', accuracy_score(y_train, train_preds))
    print('Validation accuracy: ', accuracy_score(y_val, val_preds))

    # return fitted model
    return model

### Find the Training and Validaiton Accuracy for this model and plot the Confusion Matrix

In [None]:
# Call modeling function
logreg = modeling_function(logreg, X_train_scaled, y_train, X_val_scaled, y_val)

In [None]:
# Plot the Confusion Matrix
plot_confusion_matrix(logreg, X_train_scaled, y_train)

#### We need to check for class imbalance with the data.

In [None]:
y_train.value_counts()

### Use SMOTE to resolve class imbalance

In [None]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=2021)

In [None]:
X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)
y_train_res.value_counts()

## Second Model - Logistic Regression with SMOTE applied

In [None]:
# Instantiate second LogisticRegression
logreg2 = LogisticRegression(solver='liblinear')
logreg2.fit(X_train_res, y_train_res)

### Second model output vs. first model output

In [None]:
# Call second modeling function
logreg2 = modeling_function(logreg2, X_train_res, y_train_res, X_val_scaled, y_val)

In [None]:
# Call modeling function
logreg = modeling_function(logreg, X_train_scaled, y_train, X_val_scaled, y_val)

#### The second model has IMPROVED accuracy after resolving class imbalance. The second model is now our best performing model.

### Plot the Confusion Matrix for the second model.

In [None]:
# Plot the Confusion Matrix
plot_confusion_matrix(logreg2, X_train_res, y_train_res)

## Third Model - Logistic Regression with SMOTE, penalty = 'l1' (Lasso) and C = 0.2

In [None]:
# Instantiate third LogisticRegression
logreg3 = LogisticRegression(solver='liblinear', penalty='l1', C = 0.2)

### Third model output vs. second model output

In [None]:
# Call third modeling function
logreg3 = modeling_function(logreg3, X_train_res, y_train_res, X_val_scaled, y_val)

In [None]:
# Call second modeling function
logreg2 = modeling_function(logreg2, X_train_res, y_train_res, X_val_scaled, y_val)

#### The third model has improved accuracy even more after changing the model to Lasso Regression and setting C = 0.2. The third model is now our best performing model.

### Plot the Confusion Matrix for the third model.

In [None]:
# Plot the Confusion Matrixb
plot_confusion_matrix(logreg3, X_train_res, y_train_res)

## Fourth Model - DecisionTreeClassifier

In [None]:
dt = DecisionTreeClassifier(random_state=2021)

### Fourth model output vs. third model output

In [None]:
# Call fourth modeling function
dt = modeling_function(dt, X_train_res, y_train_res, X_val_scaled, y_val)

In [None]:
# Call third modeling function
logreg3 = modeling_function(logreg3, X_train_res, y_train_res, X_val_scaled, y_val)

#### The fourth model has excellent training accuracy, but it's significantly further away from validation accuracy. This model is more likely to suffer from overfitting and the third model is still our best performing model.

### Let's set the max_depth = 8 and see if our model has a better performance.

In [None]:
dt2 = DecisionTreeClassifier(random_state=2021, max_depth=8)

In [None]:
dt2 = modeling_function(dt2, X_train_res, y_train_res, X_val_scaled, y_val)

#### The training accuracy is closer to the validation accuracy now, but they're still significantly further away from each other. The third model is still our best performing model.

## Fifth Model - RandomForestClassifier

In [None]:
rf = RandomForestClassifier(random_state=2021)

### Fifth model output vs. third model output

In [None]:
# Call fifth modeling function
rf = modeling_function(rf, X_train_res, y_train_res, X_val_scaled, y_val)

In [None]:
# Call third modeling function
logreg3 = modeling_function(logreg3, X_train_res, y_train_res, X_val_scaled, y_val)

#### The training accuracy is excellent, but it's significantly further away from validation accuracy. However, it performs better than the first decision tree model. The third model is still our best performing model.

I am going to try a GridSearch to experiment with some hyperparameters. I want to prevent overfitting so I am going to adjust `n_estimators`, `criterion`, `max_depth` and `min_samples_leaf`

In [None]:
# Create Param Grid 
param_grid = {'n_estimators': [50, 75, 100],
              'criterion': ['gini', 'entropy'],
              'max_depth': [1, 3, 5, 7, 9, 10],
              'min_samples_leaf': [1, 3, 5, 7]}

In [None]:
# Create GridSearchCV
rf_gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy')

In [None]:
rf_gs.fit(X_train_res, y_train_res)

Now I can make predictions using this best estimator and see how it performs compared to previous models.

In [None]:
best_model = rf_gs.best_estimator_

In [None]:
rf_gs.best_params_

### GridSearchCV model output vs. third model output

In [None]:
# Call GridSearchCV modeling function
modeling_function(best_model, X_train_res, y_train_res, X_val_scaled, y_val)

In [None]:
# Call third modeling function
logreg3 = modeling_function(logreg3, X_train_res, y_train_res, X_val_scaled, y_val)

#### The GridSearchCV model has little changes to training accuracy and no change to validation accuracy. The third model is still our best performing model.

## Sixth Model - KNeighborsClassifier `k=9`

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=9)

### Sixth model output vs. third model output

In [None]:
# Call sixth modeling function
knn = modeling_function(knn, X_train_res, y_train_res, X_val_scaled, y_val)

In [None]:
# Call third modeling function
logreg3 = modeling_function(logreg3, X_train_res, y_train_res, X_val_scaled, y_val)

#### The sixth model's training accuracy is closer to the third model's training accuracy. However, validation accuracy is much lower than before and is our worst performing model. The third model is still our best performing model.

## Seventh Model - Support Vector Machine with `kernel = 'linear'` and `C = 0.1`

In [None]:
from sklearn.svm import SVC
svm = SVC(kernel='linear', C=0.1)

In [None]:
# Call seventh modeling function
svm = modeling_function(svm, X_train_res, y_train_res, X_val_scaled, y_val)

In [None]:
# Call third modeling function
logreg3 = modeling_function(logreg3, X_train_res, y_train_res, X_val_scaled, y_val)

#### With our seventh model, we find both our training accuracy and validation accuracy to be 0.894. This model performs slightly better than the third model as shown above. Now our seventh model is our best performing model.

### Plot the Confusion Matrix for the seventh model

In [None]:
# Plot the Confusion Matrix
plot_confusion_matrix(svm, X_train_res, y_train_res)

## Testing Data with our best model - the Support Vector Machine model

In [None]:
# Transform SCALED test data and convert to DataFrame
X_test_scaled = pd.DataFrame(scaler.transform(X_test[num_cols]), columns=num_cols)

In [None]:
X_test_scaled.shape

In [None]:
# Get the predicted values for testing data
test_preds = svm.predict(X_test_scaled)

In [None]:
# Accuracy score for testing data
print('Testing accuracy: ', accuracy_score(y_test, test_preds))

In [None]:
# Call seventh modeling function
svm = modeling_function(svm, X_train_res, y_train_res, X_val_scaled, y_val)

#### Based on the output above, we can see that our accuracies for testing, training, and validation are all 0.894.