# Import Libraries

In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Import Data

In [18]:
data = pd.read_excel('Dataset.xlsx') 

# Check Data

In [19]:
print("Missing Data Summary:")
print(data.isnull().sum())

Missing Data Summary:
Team         0
Match Up     0
Game Date    0
W/L          0
MIN          0
PTS          0
FGM          0
FGA          0
FG%          0
3PM          0
3PA          0
3P%          0
FTM          0
FTA          0
FT%          0
OREB         0
DREB         0
REB          0
AST          0
STL          0
BLK          0
TOV          0
PF           0
+/-          0
dtype: int64


In [28]:
non_numeric_values =data. applymap(lambda x: isinstance(x, str) and not x.isnumeric()).any()
print("Columns with non-numeric values:")
print(non_numeric_values[non_numeric_values].index.tolist())

Columns with non-numeric values:
['Team', 'Match Up', 'Game Date', 'W/L', 'FT%']


  non_numeric_values =data. applymap(lambda x: isinstance(x, str) and not x.isnumeric()).any()


In [29]:
data['FT%'] = data['FT%'].replace('-', pd.NA)
data['FT%'] = pd.to_numeric(data['FT%'], errors='coerce')
data['FT%'].fillna(data['FT%'].mean(), inplace=True)

# Model

## Construct new data frame to for model construction
- Group Team togethers
- Sort time in ascending order

Questions to use along the way: What to do with W/L, do I use W/L as the binary classifictaion or use the regression at first?

- What's the relationship between W/L and PTS?
- Can I predict PTS first and the W/L
- or make PTS as one of the feature?
- Do each team have their own function class?



In [21]:
df = data[['Team', 'Match Up', 'Game Date', 'W/L', 'MIN', 'PTS', 'FGM', 'FGA',
       'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', '+/-']]
df['Home_Game'] = df['Match Up'].apply(lambda x: 1 if "vs." in x else 0)
df = df.drop(columns = 'Match Up')
df['Game Date'] = pd.to_datetime(df['Game Date'], format='%m/%d/%Y')
df = df.sort_values(by=['Team', 'Game Date']).reset_index(drop=True)
df['W/L'] = df['W/L'].map({'W': 1, 'L': 0})
# Clean the 'FT%' column specifically
df

Unnamed: 0,Team,Game Date,W/L,MIN,PTS,FGM,FGA,FG%,3PM,3PA,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-,Home_Game
0,ATL,2023-10-25,0,240,110,39,93,41.9,5,29,...,12,30,42,24,12,1,12,19,-6,0
1,ATL,2023-10-27,0,240,120,42,87,48.3,12,32,...,9,35,44,28,7,6,14,20,-6,1
2,ATL,2023-10-29,1,240,127,47,93,50.5,15,37,...,13,33,46,32,15,2,17,17,17,0
3,ATL,2023-10-30,1,240,127,48,86,55.8,14,30,...,4,32,36,28,6,7,11,12,14,1
4,ATL,2023-11-01,1,240,130,46,92,50.0,9,32,...,14,43,57,26,8,3,21,16,9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2455,WAS,2024-04-05,0,240,102,38,88,43.2,5,33,...,12,33,45,24,9,5,10,21,-6,1
2456,WAS,2024-04-07,0,240,122,42,89,47.2,16,40,...,8,35,43,29,9,11,15,25,-8,0
2457,WAS,2024-04-09,0,240,121,44,89,49.4,20,43,...,7,34,41,26,5,4,13,22,-9,0
2458,WAS,2024-04-12,0,240,127,45,86,52.3,18,43,...,4,35,39,32,6,5,12,22,-2,1


In [22]:
df_model = df[['Team','Home_Game', 'W/L', 'MIN', 'PTS', 'FGM', 'FGA',
       'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', '+/-']]


train_data_list = []
test_data_list = []

df_model = df_model.sort_values(by='Team')  # Sort by team and game date
for team, group in df_model.groupby('Team'):
    train_size = int(len(group) * 0.7)
    train_data_list.append(group.iloc[:train_size])  # First 70% for training
    test_data_list.append(group.iloc[train_size:])  # Remaining 30% for testing

# Concatenate the training and testing data for all teams
train_data = pd.concat(train_data_list)
test_data = pd.concat(test_data_list)

X_train = train_data.drop(columns=['W/L', 'Team'])
y_train = train_data['W/L']
X_test = test_data.drop(columns=['W/L', 'Team'])
y_test= test_data['W/L']


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Logistic Regression model
lasso_model = LogisticRegression(penalty='l1', solver='saga', random_state=42, max_iter=10000)
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(lasso_model, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train_scaled, y_train)

# Best model
best_lasso_model = grid.best_estimator_

# Make predictions
y_pred = best_lasso_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

# Feature importance
lasso_coefficients = best_lasso_model.coef_.flatten()
feature_importance = pd.DataFrame({'Feature': train_data.drop(columns=['W/L', 'Team']).columns, 'Coefficient': lasso_coefficients})
important_features = feature_importance[feature_importance['Coefficient'] != 0]

# Display results
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report_str)
print("\nImportant Features:")
print(important_features)


Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       370
           1       1.00      1.00      1.00       380

    accuracy                           1.00       750
   macro avg       1.00      1.00      1.00       750
weighted avg       1.00      1.00      1.00       750


Important Features:
   Feature  Coefficient
20     +/-     3.008799


In [23]:
df_model['FT%']

0        81.8
59       73.9
58       71.4
57       87.0
56       88.2
        ...  
2401     76.9
2400     79.2
2399     64.5
2396     71.0
2459    100.0
Name: FT%, Length: 2460, dtype: float64

In [27]:
df_model = df[['Team','Home_Game', 'W/L', 'MIN', 'PTS', 'FGM', 'FGA',
       'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB',
       'AST', 'STL', 'BLK', 'TOV', 'PF']]


train_data_list = []
test_data_list = []

df_model = df_model.sort_values(by='Team')  # Sort by team and game date
for team, group in df_model.groupby('Team'):
    train_size = int(len(group) * 0.7)
    train_data_list.append(group.iloc[:train_size])  # First 70% for training
    test_data_list.append(group.iloc[train_size:])  # Remaining 30% for testing

# Concatenate the training and testing data for all teams
train_data = pd.concat(train_data_list)
test_data = pd.concat(test_data_list)

X_train = train_data.drop(columns=['W/L', 'Team'])
y_train = train_data['W/L']
X_test = test_data.drop(columns=['W/L', 'Team'])
y_test= test_data['W/L']

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lasso_model = LogisticRegression(penalty='l1', solver='saga', random_state=42, max_iter=10000)
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(lasso_model, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train_scaled, y_train)

best_lasso_model = grid.best_estimator_

y_pred = best_lasso_model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

lasso_coefficients = best_lasso_model.coef_.flatten()
feature_importance = pd.DataFrame({'Feature': train_data.drop(columns=['W/L', 'Team']).columns, 'Coefficient': lasso_coefficients})
important_features = feature_importance[feature_importance['Coefficient'] != 0]

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report_str)
print("\nImportant Features:")
print(important_features)


Accuracy: 0.85

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.84      0.85       370
           1       0.85      0.87      0.86       380

    accuracy                           0.85       750
   macro avg       0.86      0.85      0.85       750
weighted avg       0.85      0.85      0.85       750


Important Features:
      Feature  Coefficient
0   Home_Game    -0.071743
1         MIN    -0.007383
2         PTS     1.705375
3         FGM     2.001316
4         FGA    -3.556771
5         FG%    -0.755286
6         3PM    -0.268279
7         3PA     0.371768
8         3P%     0.518743
9         FTM     0.168712
10        FTA    -0.587519
11        FT%     0.118982
12       OREB     0.598383
13       DREB     1.252217
14        REB     1.420059
15        AST     0.138601
16        STL     1.403231
17        BLK     0.439733
18        TOV    -1.351008
19         PF    -0.369412
