In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load the data
clean_data = pd.read_csv('CleanData2.csv')
augmented_results = pd.read_csv('FinalDataSavedTrial.csv')

# Preprocess and merge data
augmented_results = augmented_results[augmented_results['OUTCOME'] != 'D/D']  # Remove 'D/D' outcomes
merged_data_f1 = augmented_results.merge(clean_data, left_on='Fighter1', right_on='Fighter Name')
merged_data = merged_data_f1.merge(clean_data, left_on='Fighter2', right_on='Fighter Name', suffixes=('_1', '_2'))

# Define features for the model
features = [
    'Age', 'Height', 'Reach', 'Sig. Strikes Landed/min', 'Striking Accuracy (%)', 
    'Sig. Strikes Absorbed/min', 'Striking Defense (%)', 'Takedown Avg/15min', 
    'Takedown Accuracy (%)', 'Takedown Defense (%)', 'Sub Attempt Avg/15min'
]
selected_features = [f"{feat}_1" for feat in features] + [f"{feat}_2" for feat in features]

# Prepare dataset for training
X = merged_data[selected_features]
y = merged_data['OUTCOME']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode target labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Apply polynomial feature transformation
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Define and fit models
models = {
    'random_forest': RandomForestClassifier(n_estimators=300, random_state=42),
    'logistic_regression': LogisticRegression(),
    'gaussian_nb': GaussianNB(),
    'mlp_classifier': MLPClassifier(max_iter=300, random_state=42)
}

# Train models and store predictions
models_encoded_predictions = {}
for name, model in models.items():
    model.fit(X_train_poly, y_train_encoded)
    pred_encoded = model.predict(X_test_poly)
    models_encoded_predictions[name] = pred_encoded
    print(f"{name} accuracy: {accuracy_score(y_test_encoded, pred_encoded):.4f}")

# Blending predictions using weighted averages
weights = {
    'random_forest': 0.3,
    'logistic_regression': 0.25,
    'gaussian_nb': 0.2,
    'mlp_classifier': 0.25
}

# Calculate blended accuracy
final_encoded_prediction = np.zeros_like(list(models_encoded_predictions.values())[0], dtype=float)
for name, weight in weights.items():
    final_encoded_prediction += weight * (models_encoded_predictions[name] == y_test_encoded).astype(int)
final_encoded_prediction = np.where(final_encoded_prediction / sum(weights.values()) >= 0.5, 1, 0)
blended_encoded_accuracy = accuracy_score(y_test_encoded, final_encoded_prediction)
print(f"Blended Model Accuracy: {blended_encoded_accuracy:.4f}")


random_forest accuracy: 0.3636
logistic_regression accuracy: 0.5455
gaussian_nb accuracy: 0.5455
mlp_classifier accuracy: 0.3636
Blended Model Accuracy: 0.6364


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Load the data
clean_data = pd.read_csv('CleanData2.csv')
augmented_results = pd.read_csv('FinalDataSavedTrial.csv')

# Feature Engineering: Adding difference-based features
augmented_results = augmented_results.merge(clean_data.add_suffix('_1'), left_on='Fighter1', right_on='Fighter Name_1')
augmented_results = augmented_results.merge(clean_data.add_suffix('_2'), left_on='Fighter2', right_on='Fighter Name_2')

# Example of a new feature: Age difference between fighters
augmented_results['age_difference'] = augmented_results['Age_1'] - augmented_results['Age_2']

# Define features and labels
features = ['age_difference', 'Height_1', 'Height_2', 'Reach_1', 'Reach_2', 'Sig. Strikes Landed/min_1', 'Sig. Strikes Landed/min_2']
X = augmented_results[features]
y = augmented_results['OUTCOME'].apply(lambda x: 1 if x.split('/')[0] == 'W' else 0)

# Data scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Model: XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb.fit(X_train, y_train)
predictions = xgb.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, predictions)

# Hyperparameter Tuning
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}
grid_search = GridSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
best_accuracy = accuracy_score(y_test, best_model.predict(X_test))
print("Best Accuracy:", best_accuracy)


Best Accuracy: 0.7272727272727273


In [3]:
import joblib
joblib.dump(xgb, 'xgb_fight_model.joblib')
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']