In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
from IPython.display import display
 
# ---- Data loading -----------------------------------------------------------
# Historical seasons are used for fitting; the 2024 file is what gets scored later.
training_data_path = "2021plus_data_v1.csv"
prediction_data_path = "2024_data_v4.csv"

training_data = pd.read_csv(training_data_path)
prediction_data = pd.read_csv(prediction_data_path)

# ---- Label encoding ---------------------------------------------------------
# Each encoder is kept under its own name so the integer codes can be mapped
# back to the original labels later (POSTSEASON is the prediction target).
label_encoder_conf = LabelEncoder()
label_encoder_postseason = LabelEncoder()
for column, encoder in (
    ('CONF', label_encoder_conf),
    ('POSTSEASON', label_encoder_postseason),
):
    training_data[column] = encoder.fit_transform(training_data[column])

# ---- Feature matrix / target ------------------------------------------------
# Identifier columns and the target itself are excluded from the features.
X = training_data.drop(columns=['Season', 'TeamName', 'POSTSEASON'])
y = training_data['POSTSEASON']

# ---- Hold-out split (80% train / 20% test, fixed seed) ------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ---- Standardisation --------------------------------------------------------
# The scaler is fitted on the training split only and then reused on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---- Baseline random forest on every available feature ------------------------
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# ---- Hold-out evaluation ----------------------------------------------------
y_pred = rf_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy}")
 
# Restrict the model to the features that exist in BOTH the training data and
# the 2024 prediction data, so the same columns are available at prediction time.
common_features = prediction_data.columns.intersection(X.columns).tolist()
if not common_features:
    # Fail early with a clear message instead of a cryptic scaler error.
    raise ValueError("Training and prediction data share no feature columns.")

X_train_common = X_train[common_features]
X_test_common = X_test[common_features]

# Re-fit the scaler on the reduced feature set (training split only).
scaler_common = StandardScaler()
X_train_common_scaled = scaler_common.fit_transform(X_train_common)
X_test_common_scaled = scaler_common.transform(X_test_common)

# Retrain the RandomForest model on the common features.
rf_model_common = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model_common.fit(X_train_common_scaled, y_train)

# Score the model that is actually used for the 2024 predictions; the accuracy
# printed for the all-features model does not describe this one.
accuracy_common = accuracy_score(y_test, rf_model_common.predict(X_test_common_scaled))
print(f"Common-feature model accuracy: {accuracy_common}")

# Prepare the 2024 data with the same columns and the same fitted scaler.
prediction_data_processed = prediction_data[common_features]
prediction_data_scaled = scaler_common.transform(prediction_data_processed)

# Per-class probabilities for every 2024 team.
initial_predictions_common = rf_model_common.predict_proba(prediction_data_scaled)

# predict_proba columns follow rf_model_common.classes_ (the encoded labels seen
# in y_train), which can be a strict subset of the encoder's classes if a rare
# stage ended up only in the test split. Map the model's own classes back to
# their string labels so every column is named correctly.
predictions_common_df = pd.DataFrame(
    initial_predictions_common,
    columns=label_encoder_postseason.inverse_transform(rf_model_common.classes_),
)
# Attach team names by position rather than by index alignment, so a
# non-default index on prediction_data cannot introduce NaNs or misalign rows.
predictions_common_df['TeamName'] = prediction_data['TeamName'].to_numpy()
 
def adjust_predictions_to_meet_constraints(predictions_df, constraints):
    """Assign each team to at most one stage while honouring per-stage quotas.

    Stages are processed in the iteration order of ``constraints``. For each
    stage, the ``count`` not-yet-assigned teams with the highest value in that
    stage's column are selected, so earlier stages get first pick.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        Must contain a ``'TeamName'`` column plus one numeric column per stage
        named in ``constraints``. The frame is not modified.
    constraints : dict
        Maps stage name -> maximum number of teams to assign to that stage.

    Returns
    -------
    pandas.DataFrame
        Columns ``['TeamName', 'PredictedStage']`` with a fresh 0..n-1 index.
        Rows are grouped by stage in ``constraints`` order and, within a stage,
        ordered from highest to lowest score. If fewer teams remain than a
        stage's quota, that stage simply receives fewer rows.

    Raises
    ------
    KeyError
        If ``'TeamName'`` or a stage named in ``constraints`` is not a column
        of ``predictions_df``.
    """
    result_columns = ['TeamName', 'PredictedStage']
    stage_frames = []
    assigned_teams = set()

    for stage, count in constraints.items():
        # Only teams that have not been placed in an earlier stage are eligible.
        available = predictions_df[~predictions_df['TeamName'].isin(assigned_teams)]

        # .assign() returns a new frame, avoiding the SettingWithCopyWarning
        # that column assignment on a sliced frame would raise.
        picks = available.nlargest(count, stage)[['TeamName']].assign(PredictedStage=stage)
        if picks.empty:
            continue

        stage_frames.append(picks)
        assigned_teams.update(picks['TeamName'])

    if not stage_frames:
        # pd.concat refuses an empty list; return an empty, correctly shaped frame.
        return pd.DataFrame(columns=result_columns)

    # Frames were collected in constraint order and nlargest already ranks each
    # stage, so a single concat yields the final ordering with no re-sort
    # (a non-stable sort could shuffle the ranking inside a stage).
    return pd.concat(stage_frames, ignore_index=True)
 
# Number of teams allotted to each stage; R68 is deliberately not included.
updated_constraints = {
    'Champions': 1,
    '2ND': 1,
    'F4': 2,
    'E8': 4,
    'S16': 8,
    'R32': 16,
    'R64': 32,
}

# Turn the per-stage probabilities into one stage per team, within the quotas above.
final_adjusted_predictions = adjust_predictions_to_meet_constraints(
    predictions_df=predictions_common_df,
    constraints=updated_constraints,
)

# Render the resulting table in the notebook.
display(final_adjusted_predictions)

# Persist the table to disk without the positional index column.
final_adjusted_predictions.to_csv("final_predictions.csv", index=False)

Model accuracy: 0.4146341463414634


Unnamed: 0,TeamName,PredictedStage
0,Connecticut,Champions
1,Arizona,2ND
2,Florida,F4
3,Houston,F4
4,Iowa St.,E8
...,...,...
59,Morehead St.,R64
60,South Dakota St.,R64
61,BYU,R64
62,Utah St.,R64
