In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
#clean the data
df_2024_2025 = pd.read_csv("../data/nbateams/nbateams2024-2025.csv")  
df_2024_2025 = df_2024_2025.dropna(axis=1, how='all')

df_2023_2024 = pd.read_csv("../data/nbateams/nbateams2023-2024.csv")  
df_2023_2024  = df_2023_2024.dropna(axis=1, how='all')

df_2022_2023 = pd.read_csv("../data/nbateams/nbateams2022-2023.csv")  
df_2022_2023  = df_2022_2023.dropna(axis=1, how='all')

In [5]:
# Ensure there are no NaNs in either dataset by dropping rows with NaNs
df_2024_2025 = df_2024_2025[['Team', 'W', 'L', 'ORtg', 'DRtg', 'NRtg', 'Pace', 'FTr', '3PAr', 'TS%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA']].dropna()
df_2023_2024 = df_2023_2024[['Team', 'W', 'L', 'ORtg', 'DRtg', 'NRtg', 'Pace', 'FTr', '3PAr', 'TS%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA']].dropna()
df_2022_2023 = df_2022_2023[['Team', 'W', 'L', 'ORtg', 'DRtg', 'NRtg', 'Pace', 'FTr', '3PAr', 'TS%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA']].dropna()

# Combine both datasets into a single DataFrame
df = pd.concat([df_2024_2025, df_2023_2024, df_2022_2023], ignore_index=True)
print(df.head)
# Define a function to create pairwise feature differences between two teams
def create_features(team_a, team_b, data):
    team_a_data = data[data['Team'] == team_a].iloc[0]
    team_b_data = data[data['Team'] == team_b].iloc[0]
    # Calculate the difference in statistics (drop non-numeric columns)
    feature_diff = team_a_data.drop(['Team', 'W', 'L']) - team_b_data.drop(['Team', 'W', 'L'])
    return feature_diff

# Generate training data
X = []
y = []

# Generate pairwise combinations for training
for i in range(len(df)):
    for j in range(i + 1, len(df)):
        team_a = df.iloc[i]
        team_b = df.iloc[j]
        # Create feature differences
        features = create_features(team_a['Team'], team_b['Team'], df)
        X.append(features.values)
        # Label as 1 if team A has more wins than team B, otherwise 0
        y.append(1 if team_a['W'] > team_b['W'] else 0)

# Convert X to a DataFrame and apply infer_objects to downcast types without NaN handling
X = pd.DataFrame(X)
X = X.infer_objects(copy=False)  # Apply downcasting
y = pd.Series(y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy: {accuracy:.2f}")

# Predict the winner between two teams
team_a = "Memphis Grizzlies"
team_b = "Boston Celtics"
new_features = create_features(team_a, team_b, df).infer_objects(copy=False)  # Downcast without NaN handling
prediction = model.predict([new_features])[0]
print(f"Predicted winner: {team_a if prediction == 1 else team_b}")

<bound method NDFrame.head of                       Team     W     L   ORtg   DRtg  NRtg   Pace    FTr  \
0    Golden State Warriors   6.0   1.0  120.0  102.9  17.1   99.8  0.260   
1    Oklahoma City Thunder   7.0   0.0  110.4   93.4  17.0  102.6  0.194   
2           Boston Celtics   7.0   1.0  122.6  108.7  13.9   99.6  0.259   
3      Cleveland Cavaliers   8.0   0.0  121.5  108.5  13.0  100.6  0.231   
4         Sacramento Kings   4.0   3.0  115.9  111.7   4.2  100.6  0.285   
..                     ...   ...   ...    ...    ...   ...    ...    ...   
85  Portland Trail Blazers  33.0  49.0  114.8  118.8  -4.0   98.6  0.289   
86       Charlotte Hornets  27.0  55.0  109.2  115.3  -6.1  100.8  0.261   
87         Houston Rockets  22.0  60.0  111.4  119.3  -7.9   99.0  0.285   
88         Detroit Pistons  17.0  65.0  110.7  118.9  -8.2   99.0  0.295   
89       San Antonio Spurs  22.0  60.0  110.2  120.0  -9.8  101.6  0.229   

     3PAr    TS%   eFG%  TOV%  ORB%  FT/FGA  
0   0.455  