In [53]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import torch.nn as nn 
import torch
from torch.utils.data import Dataset, DataLoader

In [54]:
df = pd.read_csv('train.csv')

df.shape

(177, 18)

In [60]:
def load_and_process_data(filepath):
    # Load data
    df = pd.read_csv(filepath)
    
    # Replace 'null' strings with NaN
    df.replace('null', np.nan, inplace=True)

    # Encode target variable with fixed classes
    def encode_winner(row):
        if row['f_boxer_result'] == "won":
            return 0  # First boxer wins
        elif row['f_boxer_result'] == "lost":
            return 1  # Second boxer wins
        else:
            return 2  # Draw

    df['winner_encoded'] = df.apply(encode_winner, axis=1)
    print(df['winner_encoded'])
    
    # Convert numeric columns to appropriate data types
    numeric_columns = [
        'f_boxer_age', 'f_boxer_height', 'f_boxer_reach',
        'f_boxer_won', 'f_boxer_lost', 'f_boxer_KOs',
        's_boxer_age', 's_boxer_height', 's_boxer_reach',
        's_boxer_won', 's_boxer_lost', 's_boxer_KOs',
        'matchRounds'
    ]
    df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')
    
    # Handle missing values
    imputer = SimpleImputer(strategy='mean')
    df[numeric_columns] = imputer.fit_transform(df[numeric_columns])
    
    # Encode categorical variables
    label_encoders = {}
    categorical_columns = ['f_boxer_result', 'fightEnd']
    for col in categorical_columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le
    

    # Drop boxers' names
    df = df.drop(['f_boxer', 's_boxer'], axis=1)

    
    # Features and target
    X = df.drop(['winner', 'winner_encoded'], axis=1).values
    y = df['winner_encoded'].values
    
    return X, y

In [61]:
x_train, y_train = load_and_process_data('train.csv')

0      0
1      0
2      0
3      0
4      0
      ..
172    0
173    0
174    0
175    0
176    0
Name: winner_encoded, Length: 177, dtype: int64


In [62]:
unique, counts = np.unique(y_train, return_counts=True)
print(unique, counts)
train_class_distribution = dict(zip(unique, counts))
print("Training Class Distribution:", train_class_distribution)

[0 1 2] [159   4  14]
Training Class Distribution: {np.int64(0): np.int64(159), np.int64(1): np.int64(4), np.int64(2): np.int64(14)}


In [63]:
x_val, y_val = load_and_process_data('validation.csv')
unique, counts = np.unique(y_val, return_counts=True)
print(unique, counts)

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     1
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
Name: winner_encoded, dtype: int64
[0 1] [19  1]
