# Logistic Regression Models

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score


# Load the training seasons and the held-out 2024 season.
train_data = pd.read_csv('../../../data/season-data/2012_to_2023_data.csv')
test_data = pd.read_csv('../../../data/season-data/combined_2024.csv')

# One-hot encode the team identifiers. The two frames are encoded
# independently, so their dummy columns are not guaranteed to line up;
# that is harmless here because `features` below uses none of them.
train_data = pd.get_dummies(train_data, columns=['team', 'team_opp'])
test_data = pd.get_dummies(test_data, columns=['team', 'team_opp'])

# Parse 'date' and derive calendar features, identically for both frames.
for frame in (train_data, test_data):
    frame['date'] = pd.to_datetime(frame['date'])
    frame['year'] = frame['date'].dt.year
    frame['month'] = frame['date'].dt.month
    frame['day'] = frame['date'].dt.day
    frame['day_of_week'] = frame['date'].dt.dayofweek

# Impute missing values with column means.
# * numeric_only=True: pandas >= 2.0 raises TypeError from mean() when a
#   non-numeric column is present, instead of silently skipping it.
# * The means come from the training data only and are reused for the test
#   data, so no statistic of the held-out set leaks into preprocessing.
train_means = train_data.mean(numeric_only=True)
train_data.fillna(train_means, inplace=True)
test_data.fillna(train_means, inplace=True)

# Box-score columns for the team and its opponent (`_opp` suffix).
features = [
    'fg%', '3p%', 'ft%', 'orb', 'drb', 'trb', 'stl', 'blk', 'tov', 'pf',
    'fg%_opp', '3p%_opp', 'ft%_opp', 'orb_opp', 'drb_opp', 'trb_opp',
    'stl_opp', 'blk_opp', 'tov_opp', 'pf_opp',
]
X_train = train_data[features]
Y_train = train_data['won']

X_test = test_data[features]
Y_test = test_data['won']

# max_iter is raised above the default; presumably needed for the solver to
# converge on these unscaled features -- TODO confirm.
model = LogisticRegression(random_state=0, max_iter=2000)
model.fit(X_train, Y_train)

train_pred = model.predict(X_train)
test_predictions = model.predict(X_test)

accuracy_train = accuracy_score(Y_train, train_pred)
accuracy = accuracy_score(Y_test, test_predictions)

conf_matrix = confusion_matrix(Y_test, test_predictions)
print("Confusion Matrix:\n", conf_matrix)

# First line is the held-out (test) accuracy, second is the training accuracy.
print("Accuracy:", accuracy)
print("accuracy: ", accuracy_train)

# 15-fold cross-validation on the training data only.
# NOTE(review): if each game appears twice (once per team perspective), the
# mirrored rows can land in different folds and inflate these scores --
# confirm against the data and consider grouping folds by game.
cv_scores = cross_val_score(model, X_train, Y_train, cv=15)
print("Cross-validation accuracies:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

Confusion Matrix:
 [[1153   75]
 [  75 1153]]
Accuracy: 0.9389250814332247
accuracy:  0.9240175289793611
Cross-validation accuracies: [0.91308956 0.93481717 0.93110758 0.91732909 0.92739799 0.92368839
 0.91198303 0.93107105 0.92682927 0.92364793 0.92152704 0.91198303
 0.93425239 0.9300106  0.91728526]
Mean CV accuracy: 0.9237346260239404
