In [69]:
import os
# Walk the Kaggle input mount and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    paths_in_dir = [os.path.join(dirname, filename) for filename in filenames]
    for path in paths_in_dir:
        print(path)
        
        
import numpy as np
import pandas as pd
import polars as pl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
#import kaggle_evaluation.mcts_inference_server
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score

# Directory holding the competition files; defined once so the four reads
# below cannot drift apart.
DATA_DIR = "/kaggle/input/um-game-playing-strength-of-mcts-variants"

# low_memory=False asks pandas to parse each file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
df = pd.read_csv(f"{DATA_DIR}/train.csv", low_memory=False)
concepts = pd.read_csv(f"{DATA_DIR}/concepts.csv", low_memory=False)
test = pd.read_csv(f"{DATA_DIR}/test.csv", low_memory=False)
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv", low_memory=False)

/kaggle/input/um-game-playing-strength-of-mcts-variants/sample_submission.csv
/kaggle/input/um-game-playing-strength-of-mcts-variants/train.csv
/kaggle/input/um-game-playing-strength-of-mcts-variants/test.csv
/kaggle/input/um-game-playing-strength-of-mcts-variants/concepts.csv
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/mcts_gateway.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/__init__.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/mcts_inference_server.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/core/templates.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/core/base_gateway.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/core/relay.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/core/kaggle_evaluation.proto
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evalua

In [71]:
def preprocess_data(df):
    """
    Cast integer columns to float and drop columns that contain only NaN.

    The frame is modified IN PLACE and the same object is returned, so
    ``preprocess_data(frame)`` also changes ``frame`` in the caller's scope.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after the in-place changes.

    Notes
    -----
    Which columns are dropped depends only on the frame passed in, so two
    frames cleaned separately can end up with different column sets.
    """
    # Convert integer columns to float
    int_columns = df.select_dtypes(include='int').columns
    df[int_columns] = df[int_columns].astype(float)

    # A column is dropped when every one of its values is missing; the test is
    # the same for numeric and non-numeric columns, so one pass covers both.
    entirely_nan = df.isna().all(axis=0).to_numpy()
    columns_to_drop = list(df.columns[entirely_nan])
    for label in columns_to_drop:
        print(f"Column '{label}' is entirely NaN, dropping it.")

    # Drop the columns that are entirely NaN
    df.drop(columns=columns_to_drop, inplace=True)

    return df

# Turn categorical (non-numeric) columns into integer codes
def encode_categories(df, encoders=None):
    """
    Replace every non-numeric column of ``df`` with integer label codes.

    The frame is modified in place and the same object is returned. Missing
    values are not filled here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose non-numeric columns should be encoded.
    encoders : dict, optional
        Mapping of column label -> fitted ``LabelEncoder``. When given, a
        column whose label is already in the mapping is encoded with the
        stored encoder (``transform`` only); any other non-numeric column is
        fitted with a new encoder, which is then stored in the mapping.
        Passing the same dict for the training frame and later for the test
        frame keeps the integer codes consistent between the two.
        When omitted, every column is fitted from scratch on each call, so
        codes produced by separate calls are not comparable.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with encoded columns.

    Raises
    ------
    ValueError
        Raised by ``LabelEncoder.transform`` when a stored encoder meets a
        value it did not see while being fitted.
    """
    for label, content in df.items():
        if pd.api.types.is_numeric_dtype(content):
            continue
        if encoders is not None and label in encoders:
            # Reuse the mapping learned earlier so codes mean the same thing.
            df[label] = encoders[label].transform(df[label])
        else:
            encoder = LabelEncoder()
            df[label] = encoder.fit_transform(df[label])
            if encoders is not None:
                encoders[label] = encoder
    return df

# *Normalization and standardization*
def scale_data(df):
    """
    Return a standardized copy of ``df`` built with a fresh ``StandardScaler``.

    The scaler is fitted on every column of the frame that is passed in and
    is discarded afterwards, so it cannot be reused on another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to standardize; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same column labels and the same index as ``df``.
    """
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df)

    # Restore both axis labels on the scaled values. Keeping the index matters
    # when ``df`` does not have a default 0..n-1 index: without it the rows of
    # the result would no longer line up with the rows of the input.
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

# Split a frame into training/validation features and targets
def split(df, target='utility_agent1', drop_columns=('Id',), test_size=0.2, random_state=42):
    """
    Separate features from the target and make a train/validation split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target column and every column in
        ``drop_columns``.
    target : str, default 'utility_agent1'
        Column used as the prediction target; it is removed from the features.
    drop_columns : iterable of str, default ('Id',)
        Additional columns to exclude from the features. Columns that will
        not be available at prediction time can be listed here.
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    random_state : int, default 42
        Passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val)``.

    Notes
    -----
    With the defaults, every column except ``utility_agent1`` and ``Id`` is a
    feature. NOTE(review): that includes ``num_wins_agent1``,
    ``num_draws_agent1`` and ``num_losses_agent1``, which this notebook later
    has to fabricate for the test frame - confirm they should be features.
    """
    X = df.drop(columns=[target, *drop_columns])  # Features
    y = df[target]                                # Target

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Print the shapes to verify the split
    print(f"Training data shape: {X_train.shape}, Validation data shape: {X_val.shape}")
    print(f"Training target shape: {y_train.shape}, Validation target shape: {y_val.shape}")

    return X_train, X_val, y_train, y_val


# Clean the training frame, then label-encode its non-numeric columns.
df = encode_categories(preprocess_data(df))

# Standardized copy of the processed frame, kept as an alternative input.
df_scaled = scale_data(df)

# The split below uses the unscaled frame.
X_train, X_val, y_train, y_val = split(df)
# Alternative (disabled): split(df_scaled) would use the standardized copy instead.


Training data shape: (186587, 794), Validation data shape: (46647, 794)
Training target shape: (186587,), Validation target shape: (46647,)


In [72]:
# (rows, columns) of the training frame after preprocessing and encoding.
df.shape

(233234, 796)

In [73]:
# (rows, columns) of the test frame as loaded from test.csv.
test.shape

(3, 810)

In [74]:
# Modeling
# Train a random forest with the hyperparameters recorded as the best found.
best_params = dict(
    n_estimators=20,
    min_samples_split=18,
    min_samples_leaf=1,
    max_samples=10000,
    max_features=None,
    max_depth=None,
)

# Unpack the tuned settings directly; n_jobs=-1 uses all cores and the fixed
# random_state keeps the fit repeatable.
best_rf_model = RandomForestRegressor(n_jobs=-1, random_state=42, **best_params)

# Fit the model to the training data
best_rf_model.fit(X_train, y_train)





## Training Evaluation
def rmsle(y_test, y_preds):
    """
    Root mean squared log error, computed after flooring both inputs at 0.

    Negative entries in either argument are replaced by 0 before the error is
    computed, so differences between negative values do not contribute.
    NOTE(review): predictions are later clipped to [-1, 1], which suggests the
    target can be negative - confirm this metric is meaningful for it.

    Parameters
    ----------
    y_test : array-like
        True values.
    y_preds : array-like
        Predicted values.

    Returns
    -------
    float
        Square root of ``mean_squared_log_error`` on the floored inputs.
    """
    floored_true = np.maximum(y_test, 0)
    floored_pred = np.maximum(y_preds, 0)
    squared_log_error = mean_squared_log_error(floored_true, floored_pred)
    return np.sqrt(squared_log_error)

def show_scores(model):
    """
    Score ``model`` on the notebook's training and validation splits.

    Reads the module-level ``X_train``, ``X_val``, ``y_train`` and ``y_val``.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``.

    Returns
    -------
    dict
        MAE, RMSLE and R2 for each split, keyed as "Training <metric>" and
        "Valid <metric>".
    """
    # Pair each split's true values with the model's predictions for it.
    train_pair = (y_train, model.predict(X_train))
    valid_pair = (y_val, model.predict(X_val))

    return {
        "Training MAE": mean_absolute_error(*train_pair),
        "Valid MAE": mean_absolute_error(*valid_pair),
        "Training RMSLE": rmsle(*train_pair),
        "Valid RMSLE": rmsle(*valid_pair),
        "Training R2": r2_score(*train_pair),
        "Valid R2": r2_score(*valid_pair),
    }

# Display train/validation metrics for the fitted random forest.
show_scores(best_rf_model)

{'Training MAE': 0.0006497250222094305,
 'Valid MAE': 0.0006807145440770274,
 'Training RMSLE': 0.0038681119386722454,
 'Valid RMSLE': 0.004422532266769469,
 'Training R2': 0.9998573848346392,
 'Valid R2': 0.9998425866787627}

# Submission

*Now that my model is trained:*

> I will preprocess the test data,

> then add to it the label columns that are unavailable in the test file,

> then make predictions on it.

In [75]:
# The model was fitted with three outcome columns that the test file does not
# provide:
#   - num_draws_agent1
#   - num_losses_agent1
#   - num_wins_agent1
# They are recreated here as constant columns holding the training medians.
# NOTE(review): because these columns are constant for every test row, they
# carry no per-row information at prediction time - confirm that training on
# them is intended.

# Calculate the median values for the missing labels
median_wins = df['num_wins_agent1'].median()
median_draws = df['num_draws_agent1'].median()
median_losses = df['num_losses_agent1'].median()

# Create the columns and fill with median values from the training data
test['num_wins_agent1'] = median_wins
test['num_draws_agent1'] = median_draws
test['num_losses_agent1'] = median_losses

# Process the test data. preprocess_data changes `test` in place and returns
# that same frame, so the encoder is applied to its result.
X_test = preprocess_data(test)
X_test = encode_categories(X_test)

# Extract the 'Id' column from X_test and store it
Id = X_test['Id']

# Drop the 'Id' column from X_test
X_test = X_test.drop(columns=['Id'])

# All-NaN columns are dropped per frame, so the test columns are not
# guaranteed to match the training columns. Fail early with a clear message,
# then select the training columns in training order.
missing_features = [col for col in X_train.columns if col not in X_test.columns]
if missing_features:
    raise ValueError(
        f"Test data is missing {len(missing_features)} training feature(s), "
        f"e.g. {missing_features[:5]}"
    )
X_test = X_test[X_train.columns]

Column 'Behaviour' is entirely NaN, dropping it.
Column 'StateRepetition' is entirely NaN, dropping it.
Column 'Duration' is entirely NaN, dropping it.
Column 'Complexity' is entirely NaN, dropping it.
Column 'BoardCoverage' is entirely NaN, dropping it.
Column 'GameOutcome' is entirely NaN, dropping it.
Column 'StateEvaluation' is entirely NaN, dropping it.
Column 'Clarity' is entirely NaN, dropping it.
Column 'Decisiveness' is entirely NaN, dropping it.
Column 'Drama' is entirely NaN, dropping it.
Column 'MoveEvaluation' is entirely NaN, dropping it.
Column 'StateEvaluationDifference' is entirely NaN, dropping it.
Column 'BoardSitesOccupied' is entirely NaN, dropping it.
Column 'BranchingFactor' is entirely NaN, dropping it.
Column 'DecisionFactor' is entirely NaN, dropping it.
Column 'MoveDistance' is entirely NaN, dropping it.
Column 'PieceNumber' is entirely NaN, dropping it.
Column 'ScoreDifference' is entirely NaN, dropping it.


In [85]:
# Both frames must have the same number of columns for the model to predict.
X_test.shape, X_train.shape

((3, 794), (186587, 794))

In [88]:
# Load the sample_submission.csv file provided by Kaggle
sample_submission = pd.read_csv('/kaggle/input/um-game-playing-strength-of-mcts-variants/sample_submission.csv')

# Generate predictions
y_pred = best_rf_model.predict(X_test)

# Limit the predictions between -1 and 1 to meet the competition requirements
y_pred = np.clip(y_pred, -1.0, 1.0)

# Replace the 'utility_agent1' column with the predictions.
# The values are assigned by position, so this assumes the rows of
# sample_submission are in the same order as the rows of X_test.
sample_submission['utility_agent1'] = y_pred

# Export the predictions to a CSV file for submission
submission_path = 'sample_submission.csv'
sample_submission.to_csv(submission_path, index=False)

# Report the file that was actually written (the old message named a
# different file, 'submission.parquet').
print(f"Submission file successfully exported with the name '{submission_path}'")

Submission file successfully exported with the name 'submission.parquet'


In [89]:
# Display the submission frame that was just written.
sample_submission

Unnamed: 0,Id,utility_agent1
0,233234,-0.003333
1,233235,-0.003333
2,233236,-0.003333
