In [29]:
import pandas as pd

# Load the data
data_fighters = pd.read_csv('FinalCleanedData.csv')
data_outcomes = pd.read_csv('Outcome.csv')

# Display the first few rows of each dataset to understand their structure
data_fighters.head()


Unnamed: 0.1,Unnamed: 0,Fighter Name,Age,Height,Reach,Overall Record,Record in Last 5,Sig. Strikes Landed/min,Striking Accuracy (%),Sig. Strikes Absorbed/min,...,KO Wins,KO Losses,Sub Wins,Sub Losses,Dec Wins,Dec Losses,DQ Wins,DQ Losses,Odds,Category
0,0,Aoriqileng,30,67,69.3,25-11-0,03-02,5.31,0.5,5.56,...,9,1,0,3,16,7,0,0,-108.0,Bantamweight Bout
1,1,Aoriqileng,30,67,69.3,25-11-0,03-02,5.31,0.5,5.56,...,9,1,0,3,16,7,0,0,195.0,Bantamweight Bout
2,2,Charalampos Grigoriou,31,67,67.0,08-03-0,03-02,13.0,0.65,4.0,...,6,1,0,0,2,2,0,0,-158.0,Bantamweight Bout
3,3,Aiemann Zahabi,36,68,68.0,10-02-0,03-02,3.04,0.43,3.4,...,5,1,3,0,2,1,0,0,500.0,Bantamweight Bout
4,4,Ricky Simon,31,66,70.0,20-04-0,04-01,3.01,0.44,3.23,...,6,2,4,1,10,1,0,0,-175.0,Bantamweight Bout


In [30]:
data_outcomes.head()

Unnamed: 0.1,Unnamed: 0,Fighter 1,Fighter 2,Outcome
0,0,Aoriqileng,Johnny Munoz,W/L
1,1,Charalampos Grigoriou,Chad Anheliger,L/W
2,2,Aiemann Zahabi,Javid Basharat,W/L
3,3,Ricky Simon,Mario Bautista,L/W
4,4,Casey O'Neill,Ariane Lipski,W/L


In [31]:
# Update the function to better handle "No Contest" notations in the records
def parse_record_better(record):
    parts = record.split('-')
    wins = int(parts[0])
    losses = int(parts[1].split(' ')[0])  # Modify to split and take the first part only
    ties = int(parts[2].split(' ')[0]) if len(parts) > 2 else 0
    return wins, losses, ties

data_fighters['Wins'], data_fighters['Losses'], data_fighters['Ties'] = zip(*data_fighters['Overall Record'].apply(parse_record_better))

# Drop the original 'Overall Record' column
data_fighters.drop(columns=['Overall Record'], inplace=True)

# Check for any missing values
missing_values = data_fighters.isnull().sum()

# Display the transformed data and missing values information
data_fighters.head(), missing_values[missing_values > 0]


(   Unnamed: 0           Fighter Name  Age  Height  Reach Record in Last 5  \
 0           0             Aoriqileng   30      67   69.3            03-02   
 1           1             Aoriqileng   30      67   69.3            03-02   
 2           2  Charalampos Grigoriou   31      67   67.0            03-02   
 3           3         Aiemann Zahabi   36      68   68.0            03-02   
 4           4            Ricky Simon   31      66   70.0            04-01   
 
    Sig. Strikes Landed/min  Striking Accuracy (%)  Sig. Strikes Absorbed/min  \
 0                     5.31                   0.50                       5.56   
 1                     5.31                   0.50                       5.56   
 2                    13.00                   0.65                       4.00   
 3                     3.04                   0.43                       3.40   
 4                     3.01                   0.44                       3.23   
 
    Striking Defense (%)  ...  Sub Losses 

In [32]:
# Fill missing values for 'Reach' with the median of the column
data_fighters['Reach'].fillna(data_fighters['Reach'].median(), inplace=True)

# Drop rows with missing 'Category' values
data_fighters.dropna(subset=['Category'], inplace=True)

# Re-check for any remaining missing values
remaining_missing_values = data_fighters.isnull().sum()

# Display the updated dataset and any remaining missing values
data_fighters.head(), remaining_missing_values[remaining_missing_values > 0]


(   Unnamed: 0           Fighter Name  Age  Height  Reach Record in Last 5  \
 0           0             Aoriqileng   30      67   69.3            03-02   
 1           1             Aoriqileng   30      67   69.3            03-02   
 2           2  Charalampos Grigoriou   31      67   67.0            03-02   
 3           3         Aiemann Zahabi   36      68   68.0            03-02   
 4           4            Ricky Simon   31      66   70.0            04-01   
 
    Sig. Strikes Landed/min  Striking Accuracy (%)  Sig. Strikes Absorbed/min  \
 0                     5.31                   0.50                       5.56   
 1                     5.31                   0.50                       5.56   
 2                    13.00                   0.65                       4.00   
 3                     3.04                   0.43                       3.40   
 4                     3.01                   0.44                       3.23   
 
    Striking Defense (%)  ...  Sub Losses 

In [34]:
# Create additional features
data_fighters['Win-Loss Ratio'] = data_fighters['Wins'] / (data_fighters['Losses'] + 1)  # Avoid division by zero
data_fighters['Strike Differential'] = data_fighters['Sig. Strikes Landed/min'] - data_fighters['Sig. Strikes Absorbed/min']
data_fighters['Experience'] = data_fighters['Wins'] + data_fighters['Losses'] + data_fighters['Ties']

try:
# Remove percentage signs and convert to numeric in 'Striking Defense (%)'
    data_fighters['Striking Defense (%)'] = data_fighters['Striking Defense (%)'].str.replace('%', '').astype(float)
except:
    pass
# Merge the fighter data with outcomes
# Create a merged dataset using the fighter names to join on fight outcomes
data_merged = pd.merge(data_outcomes, data_fighters, left_on='Fighter 1', right_on='Fighter Name', how='inner')
data_merged = pd.merge(data_merged, data_fighters, left_on='Fighter 2', right_on='Fighter Name', suffixes=('_1', '_2'), how='inner')

# Display the first few rows of the merged dataset and some of its columns to verify
data_merged.head(), data_merged.columns.tolist()


(   Unnamed: 0_x              Fighter 1       Fighter 2 Outcome  Unnamed: 0_y  \
 0             1  Charalampos Grigoriou  Chad Anheliger     L/W             2   
 1             1  Charalampos Grigoriou  Chad Anheliger     L/W             2   
 2             1  Charalampos Grigoriou  Chad Anheliger     L/W             2   
 3             1  Charalampos Grigoriou  Chad Anheliger     L/W             2   
 4             2         Aiemann Zahabi  Javid Basharat     W/L             3   
 
           Fighter Name_1  Age_1  Height_1  Reach_1 Record in Last 5_1  ...  \
 0  Charalampos Grigoriou     31        67     67.0              03-02  ...   
 1  Charalampos Grigoriou     31        67     67.0              03-02  ...   
 2  Charalampos Grigoriou     31        67     67.0              03-02  ...   
 3  Charalampos Grigoriou     31        67     67.0              03-02  ...   
 4         Aiemann Zahabi     36        68     68.0              03-02  ...   
 
    DQ Wins_2  DQ Losses_2  Odds_2  

In [35]:
# Check for any other columns containing percentage signs
percentage_columns = [col for col in data_merged.columns if data_merged[col].dtype == 'object' and data_merged[col].str.contains('%').any()]

# Convert percentage columns to float
for col in percentage_columns:
    data_merged[col] = data_merged[col].str.replace('%', '').astype(float) / 100

# Verify the conversion by displaying the updated data types and the first few rows
updated_types = data_merged[percentage_columns].dtypes
data_merged[percentage_columns].head(), updated_types


(Empty DataFrame
 Columns: []
 Index: [0, 1, 2, 3, 4],
 Series([], dtype: object))

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Select relevant features for both fighters
feature_columns = [
    'Age', 'Height', 'Reach', 'Wins', 'Losses', 'Ties', 'Win-Loss Ratio',
    'Sig. Strikes Landed/min', 'Sig. Strikes Absorbed/min', 'Strike Differential',
    'Striking Accuracy (%)', 'Striking Defense (%)', 'Experience', 'Odds'
]

# Build feature column names for Fighter 1 and Fighter 2
features_1 = [f"{col}_1" for col in feature_columns]
features_2 = [f"{col}_2" for col in feature_columns]
all_features = features_1 + features_2

# Define the target variable
target = 'Outcome'

# Filter the dataset for selected features and the target
X = data_merged[all_features]
y = data_merged[target]

# Encoding the Outcome as 0 or 1, assuming 'W/L' means Fighter 1 wins and Fighter 2 loses
y = y.apply(lambda x: 1 if x == 'W/L' else 0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

accuracy


0.9432314410480349

In [37]:
import joblib

# Save the RandomForest model
model_filename = 'fighter_prediction_model.pkl'
joblib.dump(rf_classifier, model_filename)

# Save the StandardScaler
scaler_filename = 'fighter_prediction_scaler.pkl'
joblib.dump(scaler, scaler_filename)

model_filename, scaler_filename


('fighter_prediction_model.pkl', 'fighter_prediction_scaler.pkl')