# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Import Data

In [2]:
data = pd.read_excel('Resources/Dataset.xlsx') 

# Check Data

In [3]:
print("Missing Data Summary:")
print(data.isnull().sum())

Missing Data Summary:
Team         0
Match Up     0
Game Date    0
W/L          0
MIN          0
PTS          0
FGM          0
FGA          0
FG%          0
3PM          0
3PA          0
3P%          0
FTM          0
FTA          0
FT%          0
OREB         0
DREB         0
REB          0
AST          0
STL          0
BLK          0
TOV          0
PF           0
+/-          0
dtype: int64


In [4]:
non_numeric_values =data. applymap(lambda x: isinstance(x, str) and not x.isnumeric()).any()
print("Columns with non-numeric values:")
print(non_numeric_values[non_numeric_values].index.tolist())

Columns with non-numeric values:
['Team', 'Match Up', 'Game Date', 'W/L', 'FT%']


  non_numeric_values =data. applymap(lambda x: isinstance(x, str) and not x.isnumeric()).any()


In [5]:
data['FT%'] = data['FT%'].replace('-', pd.NA)
data['FT%'] = pd.to_numeric(data['FT%'], errors='coerce')
data['FT%'].fillna(data['FT%'].mean(), inplace=True)

# Model

## Construct new data frame to for model construction
- get rid of '+/-'
- 'Home_Game'
- 'W/L'



In [6]:
df = data[['Team', 'Match Up', 'Game Date', 'W/L', 'MIN', 'PTS', 'FGM', 'FGA',
       'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB',
       'AST', 'STL', 'BLK', 'TOV', 'PF']]
df['Home_Game'] = df['Match Up'].apply(lambda x: 1 if "vs." in x else 0)
df = df.drop(columns = 'Match Up')
df['Game Date'] = pd.to_datetime(df['Game Date'], format='%m/%d/%Y')
df = df.sort_values(by=['Team', 'Game Date']).reset_index(drop=True)
df

Unnamed: 0,Team,Game Date,W/L,MIN,PTS,FGM,FGA,FG%,3PM,3PA,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,Home_Game
0,ATL,2023-10-25,L,240,110,39,93,41.9,5,29,...,81.8,12,30,42,24,12,1,12,19,0
1,ATL,2023-10-27,L,240,120,42,87,48.3,12,32,...,80.0,9,35,44,28,7,6,14,20,1
2,ATL,2023-10-29,W,240,127,47,93,50.5,15,37,...,81.8,13,33,46,32,15,2,17,17,0
3,ATL,2023-10-30,W,240,127,48,86,55.8,14,30,...,94.4,4,32,36,28,6,7,11,12,1
4,ATL,2023-11-01,W,240,130,46,92,50.0,9,32,...,90.6,14,43,57,26,8,3,21,16,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2455,WAS,2024-04-05,L,240,102,38,88,43.2,5,33,...,70.0,12,33,45,24,9,5,10,21,1
2456,WAS,2024-04-07,L,240,122,42,89,47.2,16,40,...,73.3,8,35,43,29,9,11,15,25,0
2457,WAS,2024-04-09,L,240,121,44,89,49.4,20,43,...,76.5,7,34,41,26,5,4,13,22,0
2458,WAS,2024-04-12,L,240,127,45,86,52.3,18,43,...,65.5,4,35,39,32,6,5,12,22,1


## Splitting Data
- Each team will have 70% training data and 30% testing data

In [7]:
from utils import splitting_data
X_train, y_train, X_test, y_test = splitting_data(df)
X_train = X_train.drop(columns = 'Game Date')
y_train = y_train.drop(columns = 'Game Date')
X_test = X_test.drop(columns = 'Game Date')

## Standardize data set

In [8]:
scaler = StandardScaler()
X_train= scaler.fit_transform(X_train)
X_test= scaler.transform(X_test)

## Logistic Regression + LASSO

In [9]:
from utils import lasso_regression
model, y_pred = lasso_regression(X_train, y_train, X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8546666666666667

## Weighting Statistics
### Hyperbolic Decay

- **Function**:  
  weight = 1 / (1 + k × days_ago)

- **Description**:  
  Weights decrease in a hyperbolic manner, providing a balance between linear and exponential decay.

- **Use Case**:  
  Useful when a moderate decline in importance with time is desired.


### Base Model of adding weight statistics

In [12]:
X_train, y_train, X_test, y_test = splitting_data(df)

def assign_weights(df_subset):
    reference_date = df_subset['Game Date'].max()
    days_ago = (reference_date - df_subset['Game Date']).dt.days
    df_subset['weight'] = 1 / (1 + k * days_ago)
    return df_subset

k = 0.1  # Example decay factor; adjust as needed
X_train = assign_weights(X_train)
X_test = assign_weights(X_test)
X_train = X_train.drop(columns = 'Game Date')
X_test = X_test.drop(columns = 'Game Date')
y_train = y_train.drop(columns = 'Game Date')
param_grid = {'k': [0.01, 0.1, 1, 10, 100]}
def weighted_accuracy(estimator, X, y, sample_weight):
    y_pred = estimator.predict(X)
    return accuracy_score(y, y_pred, sample_weight=sample_weight)

    
scaler = StandardScaler()
X_train= scaler.fit_transform(X_train)
X_test= scaler.transform(X_test)

model, y_pred = lasso_regression(X_train, y_train, X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.852

### Fine tuning the statistics

In [27]:
X_train, y_train, X_test, y_test = splitting_data(df)
def assign_weights(df_subset, k):
    reference_date = df_subset['Game Date'].max()
    days_ago = (reference_date - df_subset['Game Date']).dt.days
    df_subset['weight']= 1 / (1 + k * days_ago)


param_grid = {'k': [0.01, 0.1, 1, 10, 100]}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_k = None
best_score = -np.inf

for k in param_grid['k']:
    # Assign weights
    assign_weights(X_train, k)

    # Drop 'Game Date' column
    X_train_features = X_train.drop(columns='Game Date')
    X_test_features = X_test.drop(columns='Game Date')

    # Standardize features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_features)
    X_test_scaled = scaler.transform(X_test_features)

    # Cross-validation
    cv_scores = []
    for train_idx, val_idx in cv.split(X_train_scaled, y_train):
        X_cv_train, X_cv_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_cv_train, y_cv_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        sample_weights_cv = X_train['weight'].iloc[val_idx]

        # Train model
        model, _ = lasso_regression(X_cv_train, y_cv_train, X_cv_val)
        y_cv_pred = model.predict(X_cv_val)
        score = accuracy_score(y_cv_val, y_cv_pred, sample_weight=sample_weights_cv)
        cv_scores.append(score)

    mean_cv_score = np.mean(cv_scores)

    if mean_cv_score > best_score:
        best_score = mean_cv_score
        best_k = k


# Assign weights to test set
X_test['weight'] = assign_weights(X_test, best_k)
sample_weights_test = X_test.pop('weight')

# Drop 'Game Date' column
X_test_features = X_test.drop(columns='Game Date')

# Standardize features
X_test_scaled = scaler.transform(X_test_features)

# Train final model on full training set
X_train['weight'] = assign_weights(X_train, best_k)
sample_weights = X_train.pop('weight')
X_train_features = X_train.drop(columns='Game Date')
X_train_scaled = scaler.fit_transform(X_train_features)
final_model, _ = lasso_regression(X_train_scaled, y_train, X_test_scaled)

# Predict and evaluate
y_test_pred = final_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred, sample_weight=sample_weights_test)

print(f"Optimal decay factor k: {best_k}")
print(f"Test Accuracy: {test_accuracy:.4f}")




ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- weight


## 