Connect to Google Drive

In [2]:
from google.colab import drive
import pandas as pd

# Mount to Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


Load final_df into dataframe

In [9]:
# Provide full path to allstar CSV file in Google Drive
csv_final_df = '/content/drive/MyDrive/COMP 333/NBA-Data/final_df.csv'

# Load the CSV file into a DataFrame
final_df = pd.read_csv(csv_final_df)

# Print out the schema
print("DataFrame Schema:")
display(final_df.info())

DataFrame Schema:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1485 entries, 0 to 1484
Data columns (total 32 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Player    1485 non-null   object 
 1   Rk        1485 non-null   int64  
 2   Pos       1485 non-null   object 
 3   Age       1485 non-null   int64  
 4   Tm        1485 non-null   object 
 5   G         1485 non-null   int64  
 6   GS        1485 non-null   int64  
 7   MP        1485 non-null   float64
 8   FG        1485 non-null   float64
 9   FGA       1485 non-null   float64
 10  FG%       1485 non-null   float64
 11  3P        1485 non-null   float64
 12  3PA       1485 non-null   float64
 13  3P%       1485 non-null   float64
 14  2P        1485 non-null   float64
 15  2PA       1485 non-null   float64
 16  2P%       1485 non-null   float64
 17  eFG%      1485 non-null   float64
 18  FT        1485 non-null   float64
 19  FTA       1485 non-null   float64
 20  FT%       14

None

Reference visited to decide which models to consider:
https://www.mathworks.com/campaigns/offers/next/choosing-the-best-machine-learning-classification-model-and-avoiding-overfitting.html

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Drop features that should not be part of the prediction model
updated_final_df = final_df.drop(['Player', 'Year'], axis=1)

# Train-Test Split for training dataset
AllStar = updated_final_df['All-Star']
withoutAllStars = updated_final_df.drop(['All-Star'], axis=1)

# We need to encode features that are not numerical
# Categorical features
categorical_features = ['Tm', 'Rk', 'Pos']

# Encoding categorical features
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(withoutAllStars[categorical_features])
encoded_feature_names = encoder.get_feature_names_out(categorical_features)
encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names)

# Concatenating encoded categorical features with numerical features
numerical_features = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%','eFG%', 'FT', 'FTA', 'FT%', 'ORB',  'DRB', 'TRB', 'AST', 'STL',  'BLK', 'TOV', 'PF', 'PTS']
X = pd.concat([encoded_df, withoutAllStars[numerical_features]], axis=1)

# Standardize the numerical features
scaler = StandardScaler()
X[numerical_features] = scaler.fit_transform(X[numerical_features])

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, AllStar, test_size=0.2, random_state=42)

# Logistic Regression Model
logisticModel = LogisticRegression(max_iter=10000)
logisticModel.fit(X_train, y_train)

# Gaussian Naive Bayes Model
gaussianNB = GaussianNB()
gaussianNB.fit(X_train, y_train)

# k-Nearest Neighours Model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Random Forest Classifier Model
randomForestModel = RandomForestClassifier()
randomForestModel.fit(X_train, y_train)

# Predict the values of All Star Feature
predict_logistic = logisticModel.predict(X_test)
predict_naive_bayes = gaussianNB.predict(X_test)
predict_knn = knn.predict(X_test)
predict_random_forest = randomForestModel.predict(X_test)

# Evaluate the models
print("Logistic Regression Accuracy:", accuracy_score(y_test, predict_logistic))
print("Gaussian Naive Bayes Accuracy:", accuracy_score(y_test, predict_naive_bayes))
print("k-Nearest Neighours Accuracy:", accuracy_score(y_test, predict_knn))
print("Random Forest Accuracy:", accuracy_score(y_test, predict_random_forest))

# Print predictions
print(predict_logistic)
print(predict_naive_bayes)
print(predict_knn)
print(predict_random_forest)

Logistic Regression Accuracy: 0.9696969696969697
Gaussian Naive Bayes Accuracy: 0.6531986531986532
k-Nearest Neighours Accuracy: 0.9663299663299664
Random Forest Accuracy: 0.9494949494949495
[ True False False False False False False False False  True  True False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False  True False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False  True False  True False  True False False False False False
 False False False False False False False False False False False False
  True  True  True False False False False False False  True False False
 False False False False False False False False False False  True False
 False False False False False False  True  True False False False False
 False

From the accuracy scores above, we would use the Logistic Regression model to predict All Star players.