# Logistic Regression Models

In [1]:
# Created by Gio Romero-Ruiz
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
import numpy as np

# Loads the dataset
df = pd.read_csv('../../../data/season-data/2012_to_2024_data.csv')

# Convert 'date' column to datetime format and create a new 'season' column based on the month
df['date_temp'] = pd.to_datetime(df['date'])
df['season'] = df['date_temp'].apply(lambda x: x.year + 1 if x.month >= 10 else x.year)

# Removes the temporary 'date-temp' column
df.drop(columns=['date_temp'], inplace=True)

# sorts the dataframe by date
df = df.sort_values('date')

# Remove columns related to minutes played
del df['mp']
del df['mp_opp']

# add in target column to predict how team did in their next game
def add_target(team):
    team['target'] = team['won'].shift(-1)
    return team

# Apply the function to each team separately
df = df.groupby('team', group_keys=False).apply(add_target)

df['target'][pd.isnull(df['target'])] = 2
# find columns with null values
nulls = pd.isnull(df).sum()
nulls = nulls[nulls > 0]


# remove columns with null values
valid_columns = df.columns[~df.columns.isin(nulls.index)]
df = df[valid_columns].copy()

# model time 
lr = LogisticRegression(random_state= 0, max_iter=2000)

# prepare a time-series cross-validator
split = TimeSeriesSplit(n_splits=3)

# initialize sequential feature selector for feature selection
sfs = SequentialFeatureSelector(lr, n_features_to_select=10, direction='forward', cv=split)

# this line below specifies the columns to remove before feature selection
removed_columns = ['team', 'date', 'won', 'target', 'team_opp', 'season']
selected_columns = df.columns[~df.columns.isin(removed_columns)]

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

# performs the feature selection below
sfs.fit(df[selected_columns], df['target'])
predictors = list(selected_columns[sfs.get_support()])


# Function to backtest the model using historical data
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    
    # Extract a sorted list of unique seasons from the data to determine the temporal boundaries for training and testing.
    seasons = sorted(data['season'].unique())

    # Iterate over the seasons, starting from the specified start index and moving in steps defined by 'step'.
    for i in range(start, len(seasons), step):

        # Separate the data into training and testing sets:
        # Training data includes all data from seasons prior to the current season in the iteration.
        # Testing data includes all data from the current season.
        train = data[data['season'] < seasons[i]]
        test = data[data['season'] == seasons[i]]
        
        # Fit the model to the training data using the specified predictors and the target variable.
        model.fit(train[predictors], train['target'])

        # Predict the target variable for the testing data using the fitted model.
        predictions = model.predict(test[predictors])
        predictions = pd.Series(predictions, index=test.index)

        # Combine the actual target values and the predictions into a DataFrame for comparison.
        combined = pd.concat([test['target'], predictions], axis=1)
        combined.columns = ['actual', 'prediction']

        # Append the results of this season's predictions to the list of all predictions.
        all_predictions.append(combined)
    
    # Concatenate all prediction DataFrames into a single DataFrame to return the combined results.
    return pd.concat(all_predictions)

# Execute the backtest and compute the accuracy
predictions = backtest(df, lr, predictors)
accuracy_score(predictions['actual'], predictions['prediction'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target'][pd.isnull(df['target'])] = 2


0.5353107344632768

In [24]:
# Now we will be intergrating the rolling averages feature
# We created a new dataframe which includes selected features along with 'won', 'team' and 'season' columns
df_rolling = df[list(selected_columns) + ['won', 'team', 'season']]

# We defined a function here to calculate rolling averages for each team's numneric data
def find_team_averages(team):
    # Select only numeric columns for rolling calculation
    numeric_cols = team.select_dtypes(include=[np.number])  # Using numpy to select numeric columns
    rolling = numeric_cols.rolling(10).mean()
    return rolling

# Apply the function to each group defined by 'team' and 'season' to maintain unique groups
# group_keys=False prevents adding an extra index level of the grouping keys
df_rolling = df_rolling.groupby(['team', 'season'], group_keys=False).apply(find_team_averages)

# Display the DataFrame with rolling averages
df_rolling


Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,orb,drb,...,drb_opp,trb_opp,ast_opp,stl_opp,blk_opp,tov_opp,pf_opp,pts_opp,won,season
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30747,0.476087,0.450725,0.454186,0.562069,0.590909,0.455938,0.278846,0.298438,0.344737,0.447917,...,0.437500,0.400000,0.395455,0.318182,0.175,0.441379,0.342105,0.381356,0.5,2024.0
30750,0.480435,0.436232,0.467674,0.451724,0.510606,0.413302,0.309615,0.329688,0.250000,0.445833,...,0.500000,0.431250,0.493182,0.400000,0.270,0.389655,0.384211,0.538983,0.2,2024.0
30748,0.460870,0.368116,0.497907,0.468966,0.501515,0.433729,0.351923,0.334375,0.260526,0.462500,...,0.429167,0.396875,0.445455,0.322727,0.175,0.406897,0.384211,0.414407,0.6,2024.0
30749,0.513043,0.446377,0.498372,0.527586,0.521212,0.473990,0.323077,0.312500,0.292105,0.460417,...,0.441667,0.425000,0.450000,0.345455,0.265,0.479310,0.328947,0.418644,0.9,2024.0


In [25]:
# Generate new column names for the rolling average columns
# This is done by appending '_rolling' to each original column name
# For example, if the original column name is 'points', the new name will be 'points_rolling'
rolling_cols = [f'{col}_rolling' for col in df_rolling.columns]

# Apply the new column names to the df_rolling DataFrame
df_rolling.columns = rolling_cols

# Will concatenate the original DataFrame(df) with the rolling averages DataFrame(df_rolling)
df = pd.concat([df, df_rolling], axis=1)

# Display the updated DataFrame which now includes the original data along with the rolling averages
df

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,orb,drb,...,drb_opp_rolling,trb_opp_rolling,ast_opp_rolling,stl_opp_rolling,blk_opp_rolling,tov_opp_rolling,pf_opp_rolling,pts_opp_rolling,won_rolling,season_rolling
0,0.282609,0.434783,0.230233,0.275862,0.424242,0.296912,0.230769,0.312500,0.473684,0.187500,...,,,,,,,,,,
1,0.369565,0.275362,0.462791,0.241379,0.242424,0.415677,0.288462,0.343750,0.473684,0.500000,...,,,,,,,,,,
2,0.456522,0.362319,0.497674,0.172414,0.166667,0.395487,0.269231,0.281250,0.236842,0.395833,...,,,,,,,,,,
3,0.413043,0.246377,0.551163,0.103448,0.136364,0.274347,0.230769,0.484375,0.394737,0.395833,...,,,,,,,,,,
4,0.434783,0.217391,0.611628,0.206897,0.136364,0.548694,0.442308,0.437500,0.184211,0.458333,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30747,0.521739,0.550725,0.423256,0.620690,0.530303,0.548694,0.269231,0.234375,0.394737,0.458333,...,0.437500,0.400000,0.395455,0.318182,0.175,0.441379,0.342105,0.381356,0.5,2024.0
30750,0.478261,0.449275,0.451163,0.379310,0.500000,0.352732,0.442308,0.437500,0.210526,0.291667,...,0.500000,0.431250,0.493182,0.400000,0.270,0.389655,0.384211,0.538983,0.2,2024.0
30748,0.413043,0.434783,0.383721,0.482759,0.530303,0.426366,0.269231,0.296875,0.473684,0.354167,...,0.429167,0.396875,0.445455,0.322727,0.175,0.406897,0.384211,0.414407,0.6,2024.0
30749,0.369565,0.376812,0.376744,0.413793,0.469697,0.407363,0.403846,0.359375,0.394737,0.458333,...,0.441667,0.425000,0.450000,0.345455,0.265,0.479310,0.328947,0.418644,0.9,2024.0


In [26]:
# Remove rows with missing values from the DataFrame
df= df.dropna()

# Create a copy of the DataFrame to ensure that the original data is not modified by reference elsewhere in the code
df = df.copy()

# display the updated DataFrame that now only contains rows with complete data
df

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,orb,drb,...,drb_opp_rolling,trb_opp_rolling,ast_opp_rolling,stl_opp_rolling,blk_opp_rolling,tov_opp_rolling,pf_opp_rolling,pts_opp_rolling,won_rolling,season_rolling
236,0.369565,0.246377,0.490698,0.448276,0.348485,0.571259,0.250000,0.250000,0.131579,0.250000,...,0.358333,0.378125,0.340909,0.322727,0.215,0.444828,0.428947,0.357627,0.7,2013.0
256,0.456522,0.202899,0.660465,0.482759,0.378788,0.573634,0.307692,0.312500,0.184211,0.354167,...,0.331250,0.350000,0.345455,0.372727,0.140,0.406897,0.468421,0.303390,0.7,2013.0
252,0.369565,0.304348,0.434884,0.241379,0.272727,0.377672,0.269231,0.281250,0.263158,0.375000,...,0.412500,0.409375,0.363636,0.413636,0.300,0.420690,0.436842,0.352542,0.4,2013.0
245,0.565217,0.463768,0.539535,0.172414,0.151515,0.423990,0.134615,0.234375,0.315789,0.291667,...,0.433333,0.437500,0.413636,0.268182,0.300,0.458621,0.415789,0.398305,0.4,2013.0
244,0.434783,0.333333,0.495349,0.206897,0.181818,0.445368,0.365385,0.484375,0.263158,0.500000,...,0.429167,0.414062,0.331818,0.390909,0.295,0.358621,0.428947,0.273729,0.4,2013.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30747,0.521739,0.550725,0.423256,0.620690,0.530303,0.548694,0.269231,0.234375,0.394737,0.458333,...,0.437500,0.400000,0.395455,0.318182,0.175,0.441379,0.342105,0.381356,0.5,2024.0
30750,0.478261,0.449275,0.451163,0.379310,0.500000,0.352732,0.442308,0.437500,0.210526,0.291667,...,0.500000,0.431250,0.493182,0.400000,0.270,0.389655,0.384211,0.538983,0.2,2024.0
30748,0.413043,0.434783,0.383721,0.482759,0.530303,0.426366,0.269231,0.296875,0.473684,0.354167,...,0.429167,0.396875,0.445455,0.322727,0.175,0.406897,0.384211,0.414407,0.6,2024.0
30749,0.369565,0.376812,0.376744,0.413793,0.469697,0.407363,0.403846,0.359375,0.394737,0.458333,...,0.441667,0.425000,0.450000,0.345455,0.265,0.479310,0.328947,0.418644,0.9,2024.0


In [27]:
# This function is designed to help retrieve the next value of a column for each entry within the same team.
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col


# This function is particularly useful for creating new columns that anticipate future events or states, like the next opponent.
def add_col(team, col_name):
    return df.groupby('team', group_keys=False).apply(lambda x: shift_col(x, col_name))


# Add a new column 'team_opp_next' to the DataFrame which contains the next opponent for each team.
df["team_opp_next"] = add_col(df, "team_opp")

# This column is also created using the add_col function with 'date' as the column to shift.
df["date_next"] = add_col(df, "date")

In [28]:
# Displays the updated DataFrame
df

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,orb,drb,...,ast_opp_rolling,stl_opp_rolling,blk_opp_rolling,tov_opp_rolling,pf_opp_rolling,pts_opp_rolling,won_rolling,season_rolling,team_opp_next,date_next
236,0.369565,0.246377,0.490698,0.448276,0.348485,0.571259,0.250000,0.250000,0.131579,0.250000,...,0.340909,0.322727,0.215,0.444828,0.428947,0.357627,0.7,2013.0,PHO,2012-11-17
256,0.456522,0.202899,0.660465,0.482759,0.378788,0.573634,0.307692,0.312500,0.184211,0.354167,...,0.345455,0.372727,0.140,0.406897,0.468421,0.303390,0.7,2013.0,GSW,2012-11-18
252,0.369565,0.304348,0.434884,0.241379,0.272727,0.377672,0.269231,0.281250,0.263158,0.375000,...,0.363636,0.413636,0.300,0.420690,0.436842,0.352542,0.4,2013.0,WAS,2012-11-17
245,0.565217,0.463768,0.539535,0.172414,0.151515,0.423990,0.134615,0.234375,0.315789,0.291667,...,0.413636,0.268182,0.300,0.458621,0.415789,0.398305,0.4,2013.0,MIA,2012-11-17
244,0.434783,0.333333,0.495349,0.206897,0.181818,0.445368,0.365385,0.484375,0.263158,0.500000,...,0.331818,0.390909,0.295,0.358621,0.428947,0.273729,0.4,2013.0,NYK,2012-11-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30747,0.521739,0.550725,0.423256,0.620690,0.530303,0.548694,0.269231,0.234375,0.394737,0.458333,...,0.395455,0.318182,0.175,0.441379,0.342105,0.381356,0.5,2024.0,,
30750,0.478261,0.449275,0.451163,0.379310,0.500000,0.352732,0.442308,0.437500,0.210526,0.291667,...,0.493182,0.400000,0.270,0.389655,0.384211,0.538983,0.2,2024.0,,
30748,0.413043,0.434783,0.383721,0.482759,0.530303,0.426366,0.269231,0.296875,0.473684,0.354167,...,0.445455,0.322727,0.175,0.406897,0.384211,0.414407,0.6,2024.0,,
30749,0.369565,0.376812,0.376744,0.413793,0.469697,0.407363,0.403846,0.359375,0.394737,0.458333,...,0.450000,0.345455,0.265,0.479310,0.328947,0.418644,0.9,2024.0,,


In [29]:
# Create a copy of the DataFrame to ensure that subsequent operations do not modify the original DataFrame by reference.
df = df.copy()


# Merge the DataFrame with a subset of itself that includes only the rolling average columns and key columns for matching.
# The purpose of this merge is to align data from one row in 'df' with data from another row where the first row's 'team' and 'date_next'
# match the second row's 'team_opp_next' and 'date_next'. This aligns teams with their next opponents based on the date of the next game.
full = df.merge(df[rolling_cols + ["team_opp_next", "date_next", "team"]], left_on=["team", "date_next"], right_on=["team_opp_next", "date_next"])

# Display the DataFrame 'full' which now includes merged data from the same DataFrame based on specified conditions.
full

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,orb,drb,...,ast_opp_rolling_y,stl_opp_rolling_y,blk_opp_rolling_y,tov_opp_rolling_y,pf_opp_rolling_y,pts_opp_rolling_y,won_rolling_y,season_rolling_y,team_opp_next_y,team_y
0,0.369565,0.246377,0.490698,0.448276,0.348485,0.571259,0.250000,0.250000,0.131579,0.250000,...,0.413636,0.268182,0.300,0.458621,0.415789,0.398305,0.4,2013.0,MIA,PHO
1,0.565217,0.463768,0.539535,0.172414,0.151515,0.423990,0.134615,0.234375,0.315789,0.291667,...,0.340909,0.322727,0.215,0.444828,0.428947,0.357627,0.7,2013.0,PHO,MIA
2,0.543478,0.318841,0.651163,0.068966,0.060606,0.296912,0.307692,0.312500,0.157895,0.250000,...,0.377273,0.350000,0.240,0.441379,0.421053,0.335593,0.6,2013.0,DET,BOS
3,0.260870,0.318841,0.281395,0.068966,0.075758,0.263658,0.365385,0.421875,0.684211,0.458333,...,0.354545,0.409091,0.305,0.448276,0.384211,0.362712,0.4,2013.0,UTA,HOU
4,0.326087,0.275362,0.402326,0.172414,0.212121,0.330166,0.288462,0.328125,0.236842,0.354167,...,0.352273,0.404545,0.215,0.527586,0.460526,0.371186,0.5,2013.0,PHO,POR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27327,0.565217,0.492754,0.516279,0.482759,0.590909,0.387173,0.365385,0.359375,0.315789,0.375000,...,0.406818,0.313636,0.190,0.420690,0.328947,0.392373,0.4,2024.0,GSW,SAC
27328,0.478261,0.521739,0.395349,0.482759,0.606061,0.377672,0.211538,0.234375,0.342105,0.666667,...,0.463636,0.331818,0.185,0.389655,0.392105,0.421186,0.6,2024.0,PHI,MIA
27329,0.434783,0.420290,0.420930,0.413793,0.484848,0.395487,0.480769,0.421875,0.236842,0.229167,...,0.479545,0.327273,0.200,0.434483,0.352632,0.466949,0.5,2024.0,ATL,CHI
27330,0.586957,0.391304,0.632558,0.310345,0.484848,0.296912,0.326923,0.296875,0.342105,0.437500,...,0.461364,0.327273,0.270,0.455172,0.321053,0.422034,0.8,2024.0,MIA,PHI


In [30]:
# Select and display specific columns from the 'full' DataFrame to examine particular relationships and data alignments.
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
0,MIA,PHO,PHO,MIA,2012-11-17
1,PHO,MIA,MIA,PHO,2012-11-17
2,DET,BOS,BOS,DET,2012-11-18
3,UTA,HOU,HOU,UTA,2012-11-19
4,PHO,POR,POR,PHO,2012-11-21
...,...,...,...,...,...
27327,GSW,SAC,SAC,GSW,2024-04-16
27328,PHI,MIA,MIA,PHI,2024-04-17
27329,ATL,CHI,CHI,ATL,2024-04-17
27330,MIA,PHI,PHI,MIA,2024-04-17


In [31]:
# Identify and list all columns in the 'full' DataFrame that have data type 'object', typically indicating categorical or textual data.
# This step is useful for preprocessing data, especially before applying machine learning algorithms that require numerical input.
removed_columns = list(full.columns[full.dtypes == 'object']) + removed_columns

# Display the updated list of columns that are identified for removal.
removed_columns

['team_x',
 'team_opp',
 'date',
 'team_opp_next_x',
 'date_next',
 'team_opp_next_y',
 'team_y',
 'team',
 'date',
 'won',
 'target',
 'team_opp',
 'season']

In [32]:
# Create a list of columns that are not in the 'removed_columns' list.
# This involves filtering the 'full' DataFrame's columns by excluding those identified as needing removal.
selected_columns = full.columns[~full.columns.isin(removed_columns)]

# Apply the Sequential Feature Selector (SFS) to the 'full' DataFrame using the filtered 'selected_columns'.
# The 'target' column is used as the dependent variable in the model fitting process.
# SFS is a feature selection method that iteratively adds (or removes) features to find the set of features that best improves the performance of the model.
# It fits the Logistic Regression model specified earlier in the workflow to the selected features to identify the most informative ones.
sfs.fit(full[selected_columns], full['target'])

In [33]:
# Retrieve the names of the columns that the Sequential Feature Selector (SFS) identified as the most predictive.
predictors = list(selected_columns[sfs.get_support()])

# Display the list of predictor columns.
predictors

['tov',
 'orb_opp_rolling_x',
 'trb_opp_rolling_x',
 'blk_opp_rolling_x',
 'won_rolling_x',
 'fg%_rolling_y',
 'drb_rolling_y',
 'blk_rolling_y',
 'fga_opp_rolling_y',
 'won_rolling_y']

In [34]:
# Call the backtest function with the DataFrame 'full', a Logistic Regression model 'lr', and the list of selected predictor columns.
# The function 'backtest' is designed to train and test the model on different subsets of the data defined by 'season' or similar splits,
# simulating a real-world application where models are trained on past data and tested on future, unseen data.
predictions = backtest(full, lr, predictors)

# Calculate the accuracy score, which is the proportion of correct predictions out of all predictions made.
accuracy_score(predictions['actual'], predictions['prediction'])

0.6185262042474281