In [395]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import altair as alt
import evaluate_model

In [396]:
# Feature tables for both splits, read from the `*_cluster.csv` files.
X_train = pd.read_csv('../../data/train_test_split/x_train_cluster.csv')
X_test = pd.read_csv('../../data/train_test_split/x_test_cluster.csv')

# Targets: read each CSV, drop the 'Unnamed: 0' column, and flatten what is
# left into a 1-D array.
y_train_frame = pd.read_csv('../../data/train_test_split/y_train.csv')
y_test_frame = pd.read_csv('../../data/train_test_split/y_test.csv')
y_train = y_train_frame.drop(columns='Unnamed: 0').values.ravel()
y_test = y_test_frame.drop(columns='Unnamed: 0').values.ravel()

In [397]:
# Columns that are standardised before modelling.
numerical_features = [
    'rating',
    'ranking',
    'height',
    'weight',
    'distance_miles',
    'stars',
    'wins_rolling_2year',
    'games_played_rolling_2year',
    'post_season_wins_rolling_2year',
    'point_diff_rolling_2year',
    'win_pct_rolling_2year',
]

# Columns that are one-hot encoded before modelling.
categorical_features = [
    'conference',
    'side_of_ball',
    'year',
    'DBSCAN_Cluster_PCA',
    'SVD_KMeans_Cluster',
    'KMeans_Cluster',
]

In [398]:
# Final classifier with fixed hyperparameters; class weights are balanced and
# the random seed is pinned.
log_model = LogisticRegression(
    C=0.01,
    tol=0.01,
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
)

In [399]:
# Per-column preprocessing: standardise the numeric columns and one-hot encode
# the categorical ones (categories unseen at fit time are ignored at transform
# time instead of raising).
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# End-to-end model: preprocessing followed by the logistic-regression classifier.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', log_model),
]
pipe = Pipeline(steps=pipeline_steps)

## Hyperparameter selection (the grid-search fit and result printouts below are now commented out)

In [400]:
# Hyperparameter search space for the logistic regression.
# The grid is split in two because the elastic-net penalty requires an explicit
# `l1_ratio`, which the pure L1 / L2 penalties do not use.
c_values = [0.01, 0.1, 1, 10, 100]
tol_values = [0.00001, 0.0001, 0.001, 0.01, 0.1]

param_grid = [
    {'C': c_values, 'tol': tol_values, 'penalty': ['l1', 'l2']},
    {'C': c_values, 'tol': tol_values, 'penalty': ['elasticnet'], 'l1_ratio': [0.25, 0.5, 0.75]},
]

# solver='saga' is required here: the default 'lbfgs' solver rejects the 'l1'
# and 'elasticnet' penalties, so those candidates would fail to fit.
logistic_reg = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000, solver='saga')

grid_search = GridSearchCV(logistic_reg, param_grid, cv=5, scoring='f1')

# NOTE(review): fitting the preprocessor on all of X_train before the search
# lets each validation fold influence the scaling/encoding; searching over the
# full pipeline would avoid that.
#grid_search.fit(preprocessor.fit_transform(X_train), y_train)

In [401]:
#print(grid_search.best_params_)
#print(grid_search.best_score_)

In [402]:
# Fit the full pipeline on the training split, then predict the test split.
# `Pipeline.fit` returns the pipeline itself, so the two calls can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

In [403]:
# Test-set accuracy, rounded to four decimals for reporting.
raw_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy = round(raw_accuracy, 4)
print("accuracy score:", accuracy)

accuracy score: 0.7479


In [404]:
# Report micro-averaged F1 on the test split (using the predictions made
# earlier) and then on the training split (predicted here). `f1` ends up
# holding the training value.
# NOTE(review): with one label per sample, micro-averaged F1 is the same number
# as accuracy (the recorded test value matches the accuracy printed above);
# confirm whether the positive-class F1 was intended instead.
f1_inputs = (
    ("Testing", y_test, y_pred),
    ("Training", y_train, pipe.predict(X_train)),
)
for split_name, split_truth, split_pred in f1_inputs:
    f1 = round(float(f1_score(split_truth, split_pred, average='micro')), 4)
    print(f"{split_name} f1_score: {f1}")
    if split_name == "Testing":
        # Keep the blank line that separates the two reports in the cell.
        pass

Testing f1_score: 0.7479
Training f1_score: 0.7415


In [405]:
score, f1_score, std = evaluate_model.evaluate_cross_val_score(log_model, preprocessor.transform(X_test), y_test)

In [406]:
f1_score, std

(np.float64(0.724812030075188), np.float64(0.01443709579096269))

In [407]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[4183, 1309],
       [ 200,  293]])

In [408]:
# Rebuild the column names of the preprocessed matrix: the numeric columns come
# first, followed by the expanded one-hot columns, matching the transformer
# order in `preprocessor`.
cat_encoder = preprocessor.named_transformers_['cat']
onehot_features = cat_encoder.get_feature_names_out(categorical_features)
processed_features = [*numerical_features, *onehot_features]

In [409]:
# Pair every processed feature name with its fitted coefficient and order the
# table from the largest to the smallest coefficient.
classifier_weights = pipe.named_steps['classifier'].coef_[0]
coefficients = pd.DataFrame(
    {'feature': processed_features, 'value': classifier_weights}
).sort_values(by='value', ascending=False)

In [410]:
# Print the number of coefficients, then preview the largest ones.
# The preview must be the cell's last expression: a bare expression in the
# middle of a cell is evaluated and discarded, never displayed.
print(len(coefficients))
coefficients.head()

47


In [411]:
# Bar chart of the coefficients. `sort=None` on the y channel keeps the
# features in the row order of `coefficients`.
value_axis = alt.X('value', sort=alt.EncodingSortField(field='value', order='descending'))
feature_axis = alt.Y('feature', sort=None)

chart = alt.Chart(coefficients).mark_bar().encode(x=value_axis, y=feature_axis)
chart