Collaborators: Ben DeSollar and Matt McDonell

In [1]:
# imports
import numpy as np
from mlwpy import *
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import seaborn as sns
from sklearn import (datasets, neighbors,
                     naive_bayes,
                     model_selection as skms,
                     linear_model, dummy,
                     metrics,
                     pipeline,
                     preprocessing as skpre) 
import csv
from sklearn import tree


data_train_df = pd.read_csv("train.csv") 
data_train_ft = data_train_df.drop('PTS', axis=1)
data_train_tgt = data_train_df["PTS"]

Added Explanations

In [2]:
# initial exploration of data
print("data_train_df:")
display(data_train_df.head(10))

# look for columns with missing data
print("Info gives us this:")
data_train_df.info()

# let's focus on only features that seem most useful for now
features = ['HEIGHT',
            'SEASON_EXP',
            'DRAFT_ROUND',
            'AST',
            'REB',
            'ALL_STAR_APPEARANCES',
            # 'Avg_Utilization_Ratio',
            # 'Total_Trans_Ct',
            # 'Total_Ct_Chng_Q4_Q1', 
           # 'Total_Revolving_Bal',
           # 'Total_Amt_Chng_Q4_Q1',
            ]

# add survival column

# titanic_train_df = titanic_train_df[features + ['Survived']]

# for now, let's use a simple approach to estimate the age (and consider revisiting this estimate later)
# when it is missing by using the median from the other samples

# median_age = data_train_df['Customer_Age'].median() # note: by default, this will skip NA/null values
# print(f'Median age: {median_age:.2f}')
# data_train_df['Customer_Age'] = data_train_df['Customer_Age'].fillna(median_age)

# convert sex to a 0/1 category
# data_train_df['Gender'].replace(['M','F'],[0,1],inplace=True)

# display updated table
print("data_train_df:")
display(data_train_df.head(10))
print("data_train_df.info():")
display(data_train_df.info())
# print("describe:")
# display(data_train_df.describe())

# also convert to floating type for scaler
# data_train_df = data_train_df[features].astype(float) 
# titanic_train_tgt = titanic_train_df['Survived']

# use cross-validation to pick best model (use accuracy since 
# kaggle will evaluate on accuracy as well)
models_to_try = {'nb': naive_bayes.GaussianNB()}
# add k-NN models with various values of k to models_to_try
for k in range(1,42,2):
    models_to_try[f'{k}-NN'] = neighbors.KNeighborsClassifier(n_neighbors=k)

# scaler = skpre.StandardScaler()
pipelines_to_try = \
    {'GNB0' : naive_bayes.GaussianNB(),

     # 'SVC(1)' : svm.SVC(kernel="linear"),
     #'SVC(2)' : svm.LinearSVC(),
     #'SVC(3)' : svm.SVC(kernel="poly" ,C=.8),
     #'SVC(4)' : svm.NuSVC(kernel='linear', nu=.2),
     'DTC' : tree.DecisionTreeClassifier(),
     'DTC-5' : tree.DecisionTreeClassifier(max_depth=5),
     'DTC-10' : tree.DecisionTreeClassifier(max_depth=10),
     '5NN-C' : neighbors.KNeighborsClassifier(),
     '10NN-C' : neighbors.KNeighborsClassifier(n_neighbors=10)}

baseline = dummy.DummyClassifier(strategy="uniform")
'''
for model_name in models_to_try:
    pipelines_to_try[f'std_{model_name}_pipe'] = pipeline.make_pipeline(scaler, 
                                                      models_to_try[model_name])
'''

sv_classifiers = {"SVC(Linear)"   : svm.SVC(kernel='linear'),
                  "NuSVC(Linear)" : svm.NuSVC(kernel='linear', nu=.9)} 


accuracy_scores = {}
for name, model in pipelines_to_try.items():
    #loo = skms.LeaveOneOut()s
    scores = skms.cross_val_score(model,
                                  data_train_ft[features],
                                  data_train_tgt,
                                  #cv=loo,
                                  cv=10,
                                  scoring='accuracy')
    mean_accuracy = scores.mean()
    accuracy_scores[name] = mean_accuracy
    print(f'{name}: {mean_accuracy:.3f}')

best_pipeline_name = max(accuracy_scores,key=accuracy_scores.get)
print(f'\nBest pipeline: {best_pipeline_name} (accuracy = {accuracy_scores[best_pipeline_name]:.3f})')
# set variables for test cell
final_pipeline = pipelines_to_try[best_pipeline_name]

# apply final model to test features
# load data

data_test_df = pd.read_csv("test.csv")
# data_test_df.info() # check for additional null values

# median_age = data_test_df['Customer_Age'].median() # note: by default, this will skip NA/null values
# print(f'Median age: {median_age:.2f}')
# data_test_df['Customer_Age'] = data_test_df['Customer_Age'].fillna(median_age)

# convert sex to a 0/1 category
# data_test_df['Gender'].replace(['M','F'],[0,1],inplace=True)

data_test_df = data_test_df[features].astype(float) 

fit = final_pipeline.fit(data_train_ft[features], data_train_tgt)
predictions = fit.predict(data_test_df[features])
'''
def writeSubmission(predictions):
   i=6751
   submissionList = []
   for prediction in predictions:
       submissionList.append([str(i), str(prediction)])
       i+=1
   with open('submission.csv', 'w', newline='') as submission:
       writer = csv.writer(submission)
       writer.writerow(['id', 'Target'])
       for row in submissionList:
           writer.writerow(row)


writeSubmission(predictions)
'''
# This is just using the test.csv to setup a dataframe of the correct size
# and indicies (the "id" field).
make_submission_df = pd.read_csv("test.csv")
# drop all columns except 'id'
make_submission_df = make_submission_df[['id']]
# make sure the column of ID's that we just read in is the index column
make_submission_df = make_submission_df.set_index('id')

# just guess a value from 0 to 5
# probably won't perform very well
predictions = np.random.rand(1350)*5

# Here, you add your predictions to the dataframe
make_submission_df['PTS'] = predictions

# Either one of these will work
# The first one will round all floating point numbers to 2 decimals
# Makes it easier to look at.
make_submission_df.to_csv('submission.csv',sep=',', float_format='%.2f')
#make_submission_df.to_csv('submission.csv',sep=',')

data_train_df:


Unnamed: 0,id,FIRST_NAME,LAST_NAME,DISPLAY_FIRST_LAST,DISPLAY_LAST_COMMA_FIRST,DISPLAY_FI_LAST,PLAYER_SLUG,BIRTHDATE,SCHOOL,COUNTRY,...,NBA_FLAG,GAMES_PLAYED_FLAG,DRAFT_YEAR,DRAFT_ROUND,DRAFT_NUMBER,AST,REB,ALL_STAR_APPEARANCES,PIE,PTS
0,2682,David,Vaughn,David Vaughn,"Vaughn, David",D. Vaughn,david-vaughn,1973-03-23T00:00:00,Memphis,USA,...,Y,Y,1995,1,25,0.3,3.1,0.0,,3.94
1,1486,Javaris,Crittenton,Javaris Crittenton,"Crittenton, Javaris",J. Crittenton,javaris-crittenton,1987-12-31T00:00:00,Georgia Tech,USA,...,Y,Y,2007,1,19,1.8,2.4,0.0,,6.34
2,595,Ed,Gray,Ed Gray,"Gray, Ed",E. Gray,ed-gray,1975-09-27T00:00:00,California,USA,...,Y,Y,1997,1,22,0.8,1.2,0.0,,7.24
3,940,Rakeem,Christmas,Rakeem Christmas,"Christmas, Rakeem",R. Christmas,rakeem-christmas,1991-12-01T00:00:00,Syracuse,USA,...,Y,Y,2015,2,36,0.1,1.9,0.0,,3.14
4,645,Stanley,Johnson,Stanley Johnson,"Johnson, Stanley",S. Johnson,stanley-johnson,1996-05-29T00:00:00,Arizona,USA,...,Y,Y,2015,1,8,1.2,2.2,,0.05,3.94
5,1782,Keith,Smith,Keith Smith,"Smith, Keith",K. Smith,keith-smith,1964-03-09T00:00:00,Loyola-Marymount,USA,...,Y,Y,1986,2,45,1.0,0.8,0.0,,4.34
6,944,Kevin,Love,Kevin Love,"Love, Kevin",K. Love,kevin-love,1988-09-07T00:00:00,UCLA,USA,...,Y,Y,2008,1,5,1.3,3.3,,0.07,6.84
7,4347,Greg,Smith,Greg Smith,"Smith, Greg",G. Smith,greg-smith,1947-01-28T00:00:00,Western Kentucky,USA,...,Y,Y,1968,4,50,1.8,6.2,0.0,,8.84
8,2124,Martynas,Andriuskevicius,Martynas Andriuskevicius,"Andriuskevicius, Martynas",M. Andriuskevicius,martynas-andriuskevicius,1986-03-12T00:00:00,Zalgiris,Lithuania,...,Y,Y,2005,2,44,0.0,0.7,0.0,,1.04
9,2947,Jim,Brewer,Jim Brewer,"Brewer, Jim",J. Brewer,jim-brewer,1951-12-03T00:00:00,Minnesota,USA,...,Y,Y,1973,1,2,1.5,6.3,0.0,,6.84


Info gives us this:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 37 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                3150 non-null   int64  
 1   FIRST_NAME                        3150 non-null   object 
 2   LAST_NAME                         3150 non-null   object 
 3   DISPLAY_FIRST_LAST                3150 non-null   object 
 4   DISPLAY_LAST_COMMA_FIRST          3150 non-null   object 
 5   DISPLAY_FI_LAST                   3150 non-null   object 
 6   PLAYER_SLUG                       3150 non-null   object 
 7   BIRTHDATE                         3150 non-null   object 
 8   SCHOOL                            3149 non-null   object 
 9   COUNTRY                           3150 non-null   object 
 10  LAST_AFFILIATION                  3150 non-null   object 
 11  HEIGHT                            3085 non-null  

Unnamed: 0,id,FIRST_NAME,LAST_NAME,DISPLAY_FIRST_LAST,DISPLAY_LAST_COMMA_FIRST,DISPLAY_FI_LAST,PLAYER_SLUG,BIRTHDATE,SCHOOL,COUNTRY,...,NBA_FLAG,GAMES_PLAYED_FLAG,DRAFT_YEAR,DRAFT_ROUND,DRAFT_NUMBER,AST,REB,ALL_STAR_APPEARANCES,PIE,PTS
0,2682,David,Vaughn,David Vaughn,"Vaughn, David",D. Vaughn,david-vaughn,1973-03-23T00:00:00,Memphis,USA,...,Y,Y,1995,1,25,0.3,3.1,0.0,,3.94
1,1486,Javaris,Crittenton,Javaris Crittenton,"Crittenton, Javaris",J. Crittenton,javaris-crittenton,1987-12-31T00:00:00,Georgia Tech,USA,...,Y,Y,2007,1,19,1.8,2.4,0.0,,6.34
2,595,Ed,Gray,Ed Gray,"Gray, Ed",E. Gray,ed-gray,1975-09-27T00:00:00,California,USA,...,Y,Y,1997,1,22,0.8,1.2,0.0,,7.24
3,940,Rakeem,Christmas,Rakeem Christmas,"Christmas, Rakeem",R. Christmas,rakeem-christmas,1991-12-01T00:00:00,Syracuse,USA,...,Y,Y,2015,2,36,0.1,1.9,0.0,,3.14
4,645,Stanley,Johnson,Stanley Johnson,"Johnson, Stanley",S. Johnson,stanley-johnson,1996-05-29T00:00:00,Arizona,USA,...,Y,Y,2015,1,8,1.2,2.2,,0.05,3.94
5,1782,Keith,Smith,Keith Smith,"Smith, Keith",K. Smith,keith-smith,1964-03-09T00:00:00,Loyola-Marymount,USA,...,Y,Y,1986,2,45,1.0,0.8,0.0,,4.34
6,944,Kevin,Love,Kevin Love,"Love, Kevin",K. Love,kevin-love,1988-09-07T00:00:00,UCLA,USA,...,Y,Y,2008,1,5,1.3,3.3,,0.07,6.84
7,4347,Greg,Smith,Greg Smith,"Smith, Greg",G. Smith,greg-smith,1947-01-28T00:00:00,Western Kentucky,USA,...,Y,Y,1968,4,50,1.8,6.2,0.0,,8.84
8,2124,Martynas,Andriuskevicius,Martynas Andriuskevicius,"Andriuskevicius, Martynas",M. Andriuskevicius,martynas-andriuskevicius,1986-03-12T00:00:00,Zalgiris,Lithuania,...,Y,Y,2005,2,44,0.0,0.7,0.0,,1.04
9,2947,Jim,Brewer,Jim Brewer,"Brewer, Jim",J. Brewer,jim-brewer,1951-12-03T00:00:00,Minnesota,USA,...,Y,Y,1973,1,2,1.5,6.3,0.0,,6.84


data_train_df.info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 37 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                3150 non-null   int64  
 1   FIRST_NAME                        3150 non-null   object 
 2   LAST_NAME                         3150 non-null   object 
 3   DISPLAY_FIRST_LAST                3150 non-null   object 
 4   DISPLAY_LAST_COMMA_FIRST          3150 non-null   object 
 5   DISPLAY_FI_LAST                   3150 non-null   object 
 6   PLAYER_SLUG                       3150 non-null   object 
 7   BIRTHDATE                         3150 non-null   object 
 8   SCHOOL                            3149 non-null   object 
 9   COUNTRY                           3150 non-null   object 
 10  LAST_AFFILIATION                  3150 non-null   object 
 11  HEIGHT                            3085 non-null

None

GNB0: nan
DTC: nan
DTC-5: nan
DTC-10: nan
5NN-C: nan
10NN-C: nan

Best pipeline: GNB0 (accuracy = nan)


Traceback (most recent call last):
  File "/Users/bendesollar/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/bendesollar/opt/anaconda3/lib/python3.9/site-packages/sklearn/naive_bayes.py", line 207, in fit
    X, y = self._validate_data(X, y)
  File "/Users/bendesollar/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/bendesollar/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Users/bendesollar/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/Users/bendesollar/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)

ValueError: could not convert string to float: 'Undrafted'