In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Read the per-team statistics table; adjust this relative path if your CSV lives elsewhere
df = pd.read_csv("../data/nbateams.csv")

# Print the first five rows as a quick sanity check that the file was parsed
print(df.head(n=5))

    Rk                   Team  G     MP    FG   FGA    FG%    3P   3PA    3P%  \
0  1.0         Boston Celtics  8  243.1  42.5  92.3  0.461  19.0  50.9  0.373   
1  2.0    Cleveland Cavaliers  8  240.0  46.1  88.1  0.523  15.1  36.6  0.413   
2  3.0  Golden State Warriors  7  243.6  44.0  92.3  0.477  16.4  42.0  0.391   
3  4.0        Toronto Raptors  8  246.3  44.8  94.6  0.473  10.9  32.1  0.339   
4  5.0         Indiana Pacers  7  247.1  44.7  89.4  0.500  11.7  32.9  0.357   

   ...    FT%   ORB   DRB   TRB   AST  STL  BLK   TOV    PF    PTS  
0  ...  0.822  11.0  32.8  43.8  23.9  8.8  4.8  11.8  16.4  123.6  
1  ...  0.730   7.6  34.0  41.6  28.3  9.9  5.0  12.4  20.8  122.3  
2  ...  0.714  13.9  35.1  49.0  30.4  9.7  6.4  13.9  22.6  121.6  
3  ...  0.756  14.5  31.0  45.5  31.3  9.1  5.9  17.4  26.6  119.4  
4  ...  0.729   9.6  32.6  42.1  30.1  7.3  4.7  15.3  23.4  118.4  

[5 rows x 25 columns]


In [5]:
def create_features(team_a, team_b, data):
    """Return the per-column stat difference ``team_a - team_b``.

    Parameters
    ----------
    team_a, team_b : str
        Values to look up in the ``'Team'`` column of ``data``. If a name
        occurs more than once, the first matching row is used.
    data : pandas.DataFrame
        Table with a ``'Team'`` column, an ``'Rk'`` column, and otherwise
        numeric stat columns.

    Returns
    -------
    pandas.Series
        Float-valued Series indexed by every column of ``data`` except
        ``'Team'`` and ``'Rk'``.

    Raises
    ------
    IndexError
        If either team name is not present in ``data['Team']``.
    """
    def _team_stats(team):
        # Select the team's row and strip the identifier columns.
        matches = data.loc[data['Team'] == team]
        if matches.empty:
            # Same exception type as a bare .iloc[0] on an empty selection,
            # but the message names the team that could not be found.
            raise IndexError(f"Team {team!r} not found in the 'Team' column")
        return matches.iloc[0].drop(['Team', 'Rk'])

    # Calculate feature differences (excluding non-numeric columns like 'Team')
    feature_diff = _team_stats(team_a) - _team_stats(team_b)
    # A row taken from a mixed-type frame is object-dtype; hand back real floats.
    return feature_diff.astype(float)

# Create training data
# The label below is derived from PTS, so PTS itself must not be a model input;
# otherwise the classifier only has to read the sign of a single feature.
# NOTE(review): the made-shot columns (FG, 3P, FT) presumably still determine PTS
# almost exactly, so this remains a proxy task until real win/loss labels are available.
LABEL_COLUMN = 'PTS'

X = []
y = []

# Generate one sample per unordered pair of teams and assign labels based on PTS as a proxy for win/loss
for i in range(len(df)):
    for j in range(i + 1, len(df)):
        # Alternate which team plays the "team A" role. The preview shows the table
        # ordered by descending PTS, so always putting the earlier row first would
        # make almost every label 1 and the accuracy figure meaningless.
        first, second = (i, j) if (i + j) % 2 == 0 else (j, i)
        team_a = df.iloc[first]
        team_b = df.iloc[second]
        # Generate feature differences, without the column the label is computed from
        features = create_features(team_a['Team'], team_b['Team'], df)
        X.append(features.drop(LABEL_COLUMN))
        # Label as 1 if team A has higher PTS, else 0 (ties count as 0)
        y.append(1 if team_a['PTS'] > team_b['PTS'] else 0)

# Convert to DataFrame for model training; cast explicitly so no object-dtype columns reach the model
X = pd.DataFrame(X).reset_index(drop=True).astype(float)
y = pd.Series(y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model (features are unscaled, so give the solver extra iterations to converge)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate the model
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy: {accuracy:.2f}")

# Prediction example for two specific teams
team_a = "Boston Celtics"
team_b = "Cleveland Cavaliers"
new_features = create_features(team_a, team_b, df).drop(LABEL_COLUMN)
# Use a one-row frame with the same columns as the training frame
prediction = model.predict(pd.DataFrame([new_features]).astype(float))[0]
print(f"Predicted winner: {team_a if prediction == 1 else team_b}")

Model Accuracy: 0.99
Predicted winner: Boston Celtics
