In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import torch.nn as nn 
import torch
from torch.utils.data import Dataset, DataLoader

In [16]:
# Quick look at the raw training file: (rows, columns) before any preprocessing.
df = pd.read_csv(filepath_or_buffer='train.csv')

df.shape

(177, 18)

In [25]:
def load_and_process_data(filepath, draw_label='Draw'):
    """Load a fights CSV and convert it into a feature matrix and outcome labels.

    Processing steps, in order:

    1. Read the CSV and treat literal ``'null'`` strings as missing values.
    2. Coerce the numeric columns to numbers (unparseable cells become NaN)
       and fill missing entries with the column mean computed on this file.
    3. Label-encode ``f_boxer_result`` and ``fightEnd``.
    4. Encode the outcome from ``winner``: 0 when it names ``f_boxer``, 1 when
       it names ``s_boxer``, 2 otherwise. Names are compared after stripping
       surrounding whitespace and folding case, so formatting differences do
       not silently turn a win into a "draw".
    5. Drop the boxer-name and target columns and return NumPy arrays.

    Args:
        filepath: Path (or any input accepted by ``pd.read_csv``) of the CSV.
        draw_label: Value of ``winner`` that denotes a draw. It only decides
            which unmatched rows are reported in the warning; every unmatched
            row is still encoded as 2. The default assumes draws are spelled
            'Draw' in the data - TODO confirm.

    Returns:
        Tuple ``(X, y)``: ``X`` holds every remaining column after encoding,
        ``y`` holds the integer outcome class (0, 1 or 2) per row.

    Warns:
        UserWarning: if some rows have a ``winner`` that matches neither boxer
            nor ``draw_label`` (missing values included).

    NOTE(review): the imputer and label encoders are fitted on whichever file
    is passed in, so separate calls for train and validation use different
    means and possibly different category codes - confirm this is intended.
    NOTE(review): ``f_boxer_result`` and ``fightEnd`` stay in ``X``; judging by
    their names they may describe the fight outcome - verify they are
    legitimate inputs for predicting the winner.
    """
    import warnings  # local import: not provided by the notebook's import cell

    def _clean(names):
        # Normalise string cells only; missing values pass through untouched
        # so they never compare equal to a real name.
        return names.map(
            lambda value: value.strip().casefold() if isinstance(value, str) else value
        )

    # Load data and replace 'null' strings with NaN (without in-place mutation)
    df = pd.read_csv(filepath)
    df = df.replace('null', np.nan)

    # Convert numeric columns to appropriate data types
    numeric_columns = [
        'f_boxer_age', 'f_boxer_height', 'f_boxer_reach',
        'f_boxer_won', 'f_boxer_lost', 'f_boxer_KOs',
        's_boxer_age', 's_boxer_height', 's_boxer_reach',
        's_boxer_won', 's_boxer_lost', 's_boxer_KOs',
        'matchRounds'
    ]
    df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

    # Handle missing values with the per-column mean
    imputer = SimpleImputer(strategy='mean')
    df[numeric_columns] = imputer.fit_transform(df[numeric_columns])

    # Encode categorical variables (missing values become the string 'nan')
    for col in ('f_boxer_result', 'fightEnd'):
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    # Encode target variable with fixed classes:
    # 0 = first boxer wins, 1 = second boxer wins, 2 = anything else (draw)
    winner = _clean(df['winner'])
    first_won = winner.eq(_clean(df['f_boxer'])).fillna(False).to_numpy(dtype=bool)
    second_won = winner.eq(_clean(df['s_boxer'])).fillna(False).to_numpy(dtype=bool)
    is_draw = winner.eq(str(draw_label).strip().casefold()).fillna(False).to_numpy(dtype=bool)

    # Surface rows that would otherwise be labelled "draw" without saying so.
    unmatched = int((~(first_won | second_won | is_draw)).sum())
    if unmatched:
        warnings.warn(
            f"{unmatched} row(s) in {filepath!r} have a 'winner' that matches "
            f"neither boxer nor {draw_label!r}; they are encoded as class 2.",
            UserWarning,
            stacklevel=2,
        )

    # np.select takes the first matching condition, mirroring an if/elif chain.
    df['winner_encoded'] = np.select(
        [first_won, second_won], [0, 1], default=2
    ).astype(np.int64)

    # Drop boxers' names
    df = df.drop(['f_boxer', 's_boxer'], axis=1)

    # Features and target
    X = df.drop(['winner', 'winner_encoded'], axis=1).values
    y = df['winner_encoded'].values

    return X, y

In [26]:
# Build the training feature matrix and encoded outcome labels.
x_train, y_train = load_and_process_data(filepath='train.csv')

In [33]:
# Tally how many training rows fall into each encoded outcome class.
unique, counts = np.unique(y_train, return_counts=True)
print(unique, counts)
train_class_distribution = {label: total for label, total in zip(unique, counts)}
print("Training Class Distribution:", train_class_distribution)

[0 1 2] [168   4   5]
Training Class Distribution: {np.int64(0): np.int64(168), np.int64(1): np.int64(4), np.int64(2): np.int64(5)}


In [34]:
# Apply the same preprocessing to the validation split and check its label balance.
x_val, y_val = load_and_process_data(filepath='validation.csv')
unique, counts = np.unique(y_val, return_counts=True)
print(unique, counts)

[0 2] [ 2 18]


In [23]:
# NOTE(review): X, y and target_le are not assigned in any cell visible here,
# and this cell's execution count (23) is lower than that of the cells above.
# It looks like leftover state from an earlier version of the preprocessing
# and would presumably raise NameError after Restart Kernel -> Run All.
# Confirm, then remove it or rewrite it against x_train / y_train.
X.shape
y.shape
target_le.classes_

array(['Albert Onolunose', 'Antonio Duarte', 'Austin Trout',
       'Breyon Gorham', 'Brian Carlos Castano', 'Charles Baylor',
       'Chris Norrad', 'Corrie Sanders', 'Craig McEwan', 'Draw',
       'Emmanuel Medina', 'Erik Rafael Esquivel', 'Erislandy Lara',
       'Geovany Bruzon', 'Gilberto Ramirez', 'Hugo Lomeli',
       'Imran Haddabah', 'Jarrett Hurd', 'Jermall Charlo',
       'Joe Smith Jr.', 'Jordan Panthen', 'Josesito Lopez',
       'Juan Carlos Raygosa', 'Kubrat Pulev', 'Lamon Brewster',
       'Lawrence King', 'Louisbert Altidor', 'Marco Antonio Rubio',
       'Nagy Aguilera', 'Omar Ulises Huerta', 'Oscar Alan Perez',
       'Rafal Wolczecki', 'Rigoberto Alvarez', 'Rolando Soto',
       'Ruben Padilla', 'Santana Draper', 'Santiago Ramos',
       'Saul Alvarez', 'Serhii Bohachuk', 'Steve Geffrard', 'Todd Manuel',
       'Travis Scott', 'Tyson Fury', 'Victor Manuel Palacios',
       'Wladimir Klitschko'], dtype=object)