In [1]:
# importing libraries
import os
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
import xgboost as xgb
from minio import Minio
from sklearn.ensemble import RandomForestClassifier
from helpers import read_data_from_minio, read_data

In [2]:
# local data directory: the "data" folder located in the parent of the current working directory
# (the trailing "/" is kept because file names are appended directly to this string)
data_path = os.path.abspath(os.pardir) + "/data/"
data_path

'/Users/abdessamadbaahmed/Desktop/livrable_mp_data/data/'

In [3]:
# initialize minioClient; the endpoint and the access/secret keys are taken from the
# MINIO_ENDPOINT / MINIO_ACCESS_KEY / MINIO_SECRET_KEY environment variables when set.
# NOTE(review): the fallback literals below look like live credentials committed to the
# notebook - rotate them and remove the fallbacks once the environment variables are provisioned.
minio_client = Minio(os.environ.get("MINIO_ENDPOINT", '20.224.70.229:9000'),
                    access_key=os.environ.get("MINIO_ACCESS_KEY", 'abdessamadbaahmed'),
                    secret_key=os.environ.get("MINIO_SECRET_KEY", 'baahmedabdessamad'),
                    secure=False)  # plain HTTP, as in the original configuration

# list all buckets
buckets = minio_client.list_buckets()
buckets

[Bucket('nba-investment-data')]

In [4]:
# read the preprocessed dataset from the minio bucket, falling back to the local copy
try:
    df = read_data_from_minio(minio_client, "nba-investment-data", "nba_logreg_preprocessed.csv")
except Exception as minio_error:
    # The remote read is best-effort: report why it failed instead of hiding it,
    # then load the same file from the local data directory.
    print(f"Could not read from MinIO ({minio_error!r}); loading the local copy instead.")
    df = read_data(f"{data_path}nba_logreg_preprocessed.csv")

# A bare `df.head()` inside the except block is never rendered by Jupyter,
# so the preview is displayed explicitly, whichever source was used.
display(df.head())

Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,36.0,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,1.6,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0
1,35.0,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,2.6,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0
2,74.0,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,0.9,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0
3,58.0,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,0.9,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1
4,48.0,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,1.3,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1


In [5]:
def feature_importance_logistic_regression(features, target, metric="f1", penalty="l2"):
    """
    Plot the coefficients of a cross-validated, penalized logistic regression.

    The features are min-max scaled, then a LogisticRegressionCV (liblinear solver,
    10-fold cross-validation, 11 values of C log-spaced between 1e-5 and 1e5) is
    fitted with `metric` as its scoring. The fitted coefficients are displayed,
    sorted in ascending order, as a plotly bar chart.

    :param features: the features of the dataset; its `.columns` label the bars
    :param target: the target of the dataset
    :param metric: the scoring passed to LogisticRegressionCV (default: "f1")
    :param penalty: the penalty passed to LogisticRegressionCV (default: "l2")

    :return: None; the bar chart is displayed with fig.show()
    """

    # Grid of inverse regularization strengths explored by the cross-validation
    c_grid = np.logspace(-5, 5, 11)
    classifier = LogisticRegressionCV(penalty=penalty, Cs=c_grid, scoring=metric,
                                      solver="liblinear", cv=10, refit=True)

    # Scaling and classification are chained so both are fitted by a single call
    scaled_classifier = make_pipeline(MinMaxScaler(), classifier)
    scaled_classifier.fit(features, target)

    # One row per feature: flattened coefficient and the matching column name,
    # ordered from the most negative to the most positive coefficient
    coefficients = (
        pd.DataFrame({"Importance": scaled_classifier[-1].coef_.reshape(-1)})
        .assign(Feature=features.columns)
        .sort_values(by="Importance")
    )

    chart_title = f"Feature Selection by {penalty.upper()} Penalized Logistic Regression (maximizing {metric})"
    chart = px.bar(coefficients, x="Feature", y="Importance", title=chart_title)
    chart.show()

In [6]:
# L2-penalized logistic regression, with recall as the cross-validation scoring
feature_importance_logistic_regression(
    features=df.drop(columns="TARGET_5Yrs"),
    target=df["TARGET_5Yrs"],
    metric="recall",
    penalty="l2",
)

In [7]:
def feature_importance_tree_model(features, target, tree_model):
    """
    Fit a tree-based model and plot its feature importances.

    :param features: the features of the dataset; its `.columns` label the bars
    :param target: the target of the dataset
    :param tree_model: an estimator exposing fit() and, after fitting,
        feature_importances_; the object passed in is fitted in place

    :return: None; the bar chart is displayed with fig.show()
    """
    # The estimator supplied by the caller is fitted directly (no copy is made)
    tree_model.fit(features, target)

    # The class name of the estimator is used in the chart title
    model_name = type(tree_model).__name__

    # One row per feature, ordered from the least to the most important
    ranked = pd.DataFrame(
        {"Feature": features.columns, "Importance": tree_model.feature_importances_}
    ).sort_values(by="Importance")

    chart = px.bar(ranked, x="Feature", y="Importance",
                   title=f"Feature Selection by default {model_name}")
    chart.show()

In [8]:
# Feature importances from an XGBoost classifier left at its default settings
feature_importance_tree_model(
    features=df.drop(columns="TARGET_5Yrs"),
    target=df["TARGET_5Yrs"],
    tree_model=xgb.XGBClassifier(),
)

In [9]:
# Feature importances from a random forest with default hyper-parameters.
# The seed is fixed so the chart is identical on every Restart & Run All;
# without it the forest is re-randomized and the importances change between runs.
feature_importance_tree_model(df.drop(["TARGET_5Yrs"], axis=1), df["TARGET_5Yrs"], RandomForestClassifier(random_state=42))