In [29]:
import pandas as pd

In [30]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

In [31]:
raw_df = pd.read_csv('data_pipeline/data/clean_nba_moon_data.csv')

In [32]:
def wrangle(df):
    
    column_abbreviations = {
        'distance_from_earth_au': 'dist_earth_au',
        'distance_from_earth_km': 'dist_earth_km',
        'horizontal_position_altitude_degrees': 'horiz_pos_alt',
        'horizontal_position_azimuth_degrees': 'horiz_pos_azi',
        'equatorial_position_right_ascension': 'equat_pos_asc',
        'equatorial_position_declination': 'equat_pos_dec',
        'position_constellation_name': 'constellation',
        'phase_string': 'phase'
    }
    
    clean_df = df.rename(columns=column_abbreviations)
    
    clean_df['impact'] = clean_df['plus_minus'].apply(lambda x: 'negative' if x < 0 else ('positive' if x > 0 else 'neutral'))
    
    clean_df = clean_df[clean_df['impact'] != 'neutral']

    return clean_df

In [33]:
df = wrangle(raw_df)
df.head()

Unnamed: 0,season_year,player_id,player_name,team_abbreviation,game_id,game_date,matchup,wl,min,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,tov,stl,blk,blka,pf,pfd,pts,plus_minus,nba_fantasy_pts,available_flag,home_team,latitude,longitude,dist_earth_au,dist_earth_km,horiz_pos_alt,horiz_pos_azi,equat_pos_asc,equat_pos_dec,constellation,elongation,magnitude,phase,impact
0,2020-21,1627788.0,Furkan Korkmaz,PHI,22000243.0,2021-01-23,PHI @ DET,W,13.076667,2.0,3.0,0.667,2.0,3.0,0.667,2.0,2.0,1.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,8.0,-5.0,12.8,1.0,DET,42.696944,-83.245556,0.00266,398119.52163,45.36,249.4,3.7,17.43,Taurus,114.7761,-10.88707,Waxing Gibbous,negative
1,2020-21,1629003.0,Shake Milton,PHI,22000243.0,2021-01-23,PHI @ DET,W,21.666667,3.0,9.0,0.333,1.0,2.0,0.5,1.0,1.0,1.0,2.0,1.0,3.0,3.0,2.0,0.0,0.0,2.0,3.0,1.0,8.0,-5.0,14.1,1.0,DET,42.696944,-83.245556,0.00266,398119.52163,45.36,249.4,3.7,17.43,Taurus,114.7761,-10.88707,Waxing Gibbous,negative
2,2020-21,1626143.0,Jahlil Okafor,DET,22000243.0,2021-01-23,DET vs. PHI,L,9.166667,3.0,6.0,0.5,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,6.0,-8.0,7.5,1.0,DET,42.696944,-83.245556,0.00266,398119.52163,45.36,249.4,3.7,17.43,Taurus,114.7761,-10.88707,Waxing Gibbous,negative
3,2020-21,1626153.0,Delon Wright,DET,22000243.0,2021-01-23,DET vs. PHI,L,35.683333,3.0,8.0,0.375,1.0,2.0,0.5,3.0,4.0,0.75,1.0,6.0,7.0,6.0,2.0,0.0,1.0,0.0,2.0,4.0,10.0,-6.0,28.4,1.0,DET,42.696944,-83.245556,0.00266,398119.52163,45.36,249.4,3.7,17.43,Taurus,114.7761,-10.88707,Waxing Gibbous,negative
4,2020-21,1629680.0,Matisse Thybulle,PHI,22000243.0,2021-01-23,PHI @ DET,W,16.688333,0.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,5.0,1.0,0.0,-14.0,1.7,1.0,DET,42.696944,-83.245556,0.00266,398119.52163,45.36,249.4,3.7,17.43,Taurus,114.7761,-10.88707,Waxing Gibbous,negative


## Split Data

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [37]:
target = 'impact'

features = ['player_name', 'latitude', 'longitude', 'dist_earth_km', 'horiz_pos_alt', 'horiz_pos_azi', 'equat_pos_asc', 'equat_pos_dec',
            'constellation', 'elongation', 'magnitude', 'phase']

X = df[features]
y = df[target]

categorical_features = ['player_name', 'constellation', 'phase']
numerical_features = ['latitude', 'longitude', 'dist_earth_km', 'horiz_pos_alt', 'horiz_pos_azi', 'equat_pos_asc', 'equat_pos_dec',
                      'elongation', 'magnitude']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

X_train_preprocessed = pipeline.fit_transform(X_train)
X_test_preprocessed = pipeline.transform(X_test)

X_train_preprocessed.shape, X_test_preprocessed.shape

((94580, 986), (23646, 986))

## Establish a Baseline

In [39]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the dummy classifier to predict the most frequent class
dummy_clf = DummyClassifier(strategy="most_frequent")

# Train the dummy classifier on the training data
dummy_clf.fit(X_train_preprocessed, y_train)

# Predict on the training and test sets
y_train_pred_baseline = dummy_clf.predict(X_train_preprocessed)
y_test_pred_baseline = dummy_clf.predict(X_test_preprocessed)

# Evaluate the baseline
print("Baseline Training Accuracy:", accuracy_score(y_train, y_train_pred_baseline))
print("Baseline Test Accuracy:", accuracy_score(y_test, y_test_pred_baseline))
print("\nClassification Report (Test Set):\n", classification_report(y_test, y_test_pred_baseline, zero_division=0))

Baseline Training Accuracy: 0.5134595051807993
Baseline Test Accuracy: 0.5192844455721898

Classification Report (Test Set):
               precision    recall  f1-score   support

    negative       0.52      1.00      0.68     12279
    positive       0.00      0.00      0.00     11367

    accuracy                           0.52     23646
   macro avg       0.26      0.50      0.34     23646
weighted avg       0.27      0.52      0.35     23646



## Build and Test (Logistic Regression)

In [40]:
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)

# Train the model
lr_model.fit(X_train_preprocessed, y_train)

# Predict on the training and test sets
y_train_pred_lr = lr_model.predict(X_train_preprocessed)
y_test_pred_lr = lr_model.predict(X_test_preprocessed)

# Evaluate the model
print("Logistic Regression Training Accuracy:", accuracy_score(y_train, y_train_pred_lr))
print("Logistic Regression Test Accuracy:", accuracy_score(y_test, y_test_pred_lr))
print("\nLogistic Regression Classification Report (Test Set):\n", classification_report(y_test, y_test_pred_lr, zero_division=0))

Logistic Regression Training Accuracy: 0.5696553182491013
Logistic Regression Test Accuracy: 0.5482956948321069

Logistic Regression Classification Report (Test Set):
               precision    recall  f1-score   support

    negative       0.56      0.61      0.58     12279
    positive       0.53      0.49      0.51     11367

    accuracy                           0.55     23646
   macro avg       0.55      0.55      0.55     23646
weighted avg       0.55      0.55      0.55     23646



### Hyperparameter Tuning

In [41]:
from sklearn.model_selection import GridSearchCV

In [42]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], 
    'solver': ['newton-cg', 'lbfgs', 'liblinear'], 
    'max_iter': [100, 200, 500] 
}

# Initialize the Logistic Regression model
lr = LogisticRegression()

# Initialize GridSearchCV
grid_search_lr = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

# Fit GridSearchCV to the preprocessed training data
grid_search_lr.fit(X_train_preprocessed, y_train)

# Print the best parameters found
print("Best parameters found: ", grid_search_lr.best_params_)

# Use the best model to make predictions
y_train_pred_best_lr = grid_search_lr.predict(X_train_preprocessed)
y_test_pred_best_lr = grid_search_lr.predict(X_test_preprocessed)

# Evaluate the tuned model
print("Tuned Logistic Regression Training Accuracy:", accuracy_score(y_train, y_train_pred_best_lr))
print("Tuned Logistic Regression Test Accuracy:", accuracy_score(y_test, y_test_pred_best_lr))
print("\nTuned Logistic Regression Classification Report (Test Set):\n", classification_report(y_test, y_test_pred_best_lr))

Fitting 5 folds for each of 54 candidates, totalling 270 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters found:  {'C': 0.1, 'max_iter': 100, 'solver': 'lbfgs'}
Tuned Logistic Regression Training Accuracy: 0.5697081835483189
Tuned Logistic Regression Test Accuracy: 0.5529899348727058

Tuned Logistic Regression Classification Report (Test Set):
               precision    recall  f1-score   support

    negative       0.56      0.65      0.60     12279
    positive       0.54      0.45      0.49     11367

    accuracy                           0.55     23646
   macro avg       0.55      0.55      0.55     23646
weighted avg       0.55      0.55      0.55     23646



## Build and Test (Random Forest)

In [43]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training data
rf_classifier.fit(X_train_preprocessed, y_train)

# Predict on the training and test sets
y_train_pred_rf = rf_classifier.predict(X_train_preprocessed)
y_test_pred_rf = rf_classifier.predict(X_test_preprocessed)

# Evaluate the model
print("Random Forest Training Accuracy:", accuracy_score(y_train, y_train_pred_rf))
print("Random Forest Test Accuracy:", accuracy_score(y_test, y_test_pred_rf))
print("\nRandom Forest Classification Report (Test Set):\n", classification_report(y_test, y_test_pred_rf))

Random Forest Training Accuracy: 1.0
Random Forest Test Accuracy: 0.493825594180834

Random Forest Classification Report (Test Set):
               precision    recall  f1-score   support

    negative       0.51      0.58      0.54     12279
    positive       0.47      0.41      0.44     11367

    accuracy                           0.49     23646
   macro avg       0.49      0.49      0.49     23646
weighted avg       0.49      0.49      0.49     23646



In [44]:
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30], 
    'min_samples_split': [2, 5, 10], 
    'min_samples_leaf': [1, 2, 4], 
    'bootstrap': [True, False]  
}

rf = RandomForestClassifier(random_state=42)

rf_random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=10, cv=5, verbose=2, random_state=42, n_jobs=-1)

rf_random_search.fit(X_train_preprocessed, y_train)

y_train_pred_best_rf = rf_random_search.predict(X_train_preprocessed)
y_test_pred_best_rf = rf_random_search.predict(X_test_preprocessed)

print("Best parameters found: ", rf_random_search.best_params_)
print("Tuned Random Forest Training Accuracy:", accuracy_score(y_train, y_train_pred_best_rf))
print("Tuned Random Forest Test Accuracy:", accuracy_score(y_test, y_test_pred_best_rf))
print("\nTuned Random Forest Classification Report (Test Set):\n", classification_report(y_test, y_test_pred_best_rf))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters found:  {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 30, 'bootstrap': False}
Tuned Random Forest Training Accuracy: 0.5613448932120956
Tuned Random Forest Test Accuracy: 0.5378922439313203

Tuned Random Forest Classification Report (Test Set):
               precision    recall  f1-score   support

    negative       0.53      0.93      0.68     12279
    positive       0.61      0.11      0.19     11367

    accuracy                           0.54     23646
   macro avg       0.57      0.52      0.43     23646
weighted avg       0.57      0.54      0.44     23646

