Training a Random Forest Model for Feature Importance Plots under EDA

In [18]:
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os
import pandas as pd


# Train a RandomForestClassifier on the per-player stats table and persist it
# with joblib so later cells can reuse the fitted model.

# Load the dataframe; the path is relative to the directory the notebook
# is run from.
df = pd.read_csv("../data/processed/indiv_stats_avg.csv")


# Define features and target variable.
# The dropped columns are the identifier/label columns (per the original note:
# non-numeric columns plus the target); everything else is used as a feature.
X = df.drop(columns=["playerName", "season_year", "teamTricode", "allStar"])
y = df["allStar"]


# Split dataset into train and test sets (80/20, fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Train a Random Forest Classifier.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


# Evaluate the model on the held-out split. The score is kept in `accuracy`
# for inspection; this cell does not print it.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)


# Save the trained model.
# The directory that is created must be the same one the model is written to:
# previously "models" was created in the current directory while the file was
# dumped into "../models", which fails when "../models" does not exist yet.
model_dir = os.path.join("..", "models")
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, "allstar_model_new.pkl")
joblib.dump(model, model_path)


['../models/allstar_model_new.pkl']