# Random Forest


In [26]:
import multiprocessing
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import randint
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
# The experimental flag has to be imported before IterativeImputer itself.
from sklearn.experimental import enable_iterative_imputer  # Enable IterativeImputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

# Read the semicolon-separated training and test files
data = pd.read_csv('data/training_data.csv', delimiter=';')
test_data = pd.read_csv('data/test_data_no_target.csv', delimiter=';')

# In both frames (training first, then test), every column after the first
# that was parsed as text gets its "," replaced by "." and is cast to float.
for frame in (data, test_data):
    for column in frame.columns[1:]:
        if frame[column].dtype == 'object':
            frame[column] = frame[column].str.replace(",", ".").astype(float)

# One-hot encode 'Group': the encoder is fitted on the training frame only
# and then reused, unchanged, on the test frame.
encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
data_encoded = encoder.fit_transform(data[["Group"]])
test_data_encoded = encoder.transform(test_data[["Group"]])

# Indicator columns go in front; the raw 'Group' column is removed
data = pd.concat([data_encoded, data], axis=1).drop(columns="Group")
test_data = pd.concat([test_data_encoded, test_data], axis=1).drop(columns="Group")

# Columns handed to the numerical pipeline: everything except the first
# column and the last two.
# NOTE(review): after the concat above, column 0 is the first one-hot
# indicator, so this slice starts at the second indicator column — confirm
# that skipping exactly one indicator is intended.
numerical_col = data.columns[1:-2]

# Custom transformer for MICE imputation
class CustomMICEImputer(BaseEstimator, TransformerMixin):
    """Impute each column with an IterativeImputer restricted to correlated columns.

    For every column, the ``top_n_features`` other columns with the highest
    absolute correlation are selected, and a dedicated ``IterativeImputer``
    (``BayesianRidge`` estimator) is fitted on those columns plus the target
    column.

    Parameters
    ----------
    top_n_features : int, default=3
        Number of most-correlated columns used as predictors per column.
    """

    def __init__(self, top_n_features=3):
        self.top_n_features = top_n_features
        self.imputers = {}

    def fit(self, X, y=None):
        """Fit one imputer per column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data; must expose ``.columns`` and ``.corr()``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.columns = X.columns
        # The correlation matrix is computed once here and shared by every
        # column below instead of being recomputed per column.
        self.feature_correlations = self._calculate_feature_correlations(X)
        # Drop imputers from any previous fit so a refit starts clean.
        self.imputers = {}

        for col in self.columns:
            top_features = self._get_top_n_correlated_features(
                X, col, self.top_n_features, correlations=self.feature_correlations
            )
            imputer = IterativeImputer(estimator=BayesianRidge(), max_iter=15, random_state=0)
            # The target column is placed last so transform() can slice it out.
            imputer.fit(X[top_features + [col]])
            self.imputers[col] = (imputer, top_features)

        return self

    def transform(self, X):
        """Return a copy of ``X`` with every fitted column imputed.

        Columns are processed in the order seen during ``fit`` and written
        back into the working copy, so a column imputed earlier is already
        filled when it serves as a predictor for a later column.
        """
        X_transformed = X.copy()
        for col in self.columns:
            imputer, top_features = self.imputers[col]
            # Keep only the last output column, i.e. the imputed target column.
            X_transformed[[col]] = imputer.transform(X_transformed[top_features + [col]])[:, -1:]
        return X_transformed

    def _calculate_feature_correlations(self, X):
        """Return the absolute pairwise correlation matrix of ``X``."""
        return X.corr().abs()

    def _get_top_n_correlated_features(self, X, feature, n, correlations=None):
        """Return the names of the ``n`` columns most correlated with ``feature``.

        Parameters
        ----------
        X : pandas.DataFrame
            Data used to compute correlations when ``correlations`` is None.
        feature : label
            Column whose predictors are being selected; it is excluded from
            the result.
        n : int
            Number of column names to return.
        correlations : pandas.DataFrame, optional
            Precomputed absolute correlation matrix. When omitted, it is
            computed from ``X`` (the original behaviour).
        """
        if correlations is None:
            correlations = self._calculate_feature_correlations(X)
        ranked = correlations[feature].drop(feature)
        return ranked.nlargest(n).index.tolist()

# Numerical preprocessing: correlation-guided MICE imputation, then
# standardisation.
numerical_transformer = Pipeline([
    ('imputer', CustomMICEImputer(3)),
    ('scaler', StandardScaler()),
])

# The numerical pipeline is applied to the columns listed in numerical_col
preprocessor = ColumnTransformer([('num', numerical_transformer, numerical_col)])

# Full model: preprocessing followed by a random forest with a fixed seed
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Cost-sensitive scorer used by the hyper-parameter search
def custom_scorer(clf, X, y):
    """Return the negated mean misclassification cost of ``clf`` on ``(X, y)``.

    Labels are shifted by +1 to index a 3x3 cost table, so the true and
    predicted labels are expected to be -1, 0 or 1. A correct prediction
    costs 0, an error by one class costs 1 and an error by two classes
    costs 2.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``
    X : features passed to ``clf.predict``
    y : iterable of true labels

    Returns
    -------
    Negative average cost (higher is better, as the search maximises scores).
    """
    predicted = clf.predict(X)
    penalties = np.array([[0, 1, 2], [1, 0, 1], [2, 1, 0]])

    # Accumulate the cost of each (true, predicted) pair directly; this equals
    # summing a confusion matrix weighted element-wise by the cost table.
    total_cost = np.float64(0.0)
    for actual, guess in zip(y, predicted):
        total_cost += penalties[int(actual) + 1][int(guess) + 1]

    return -(total_cost / len(y))  # Negative because higher is better for RandomizedSearchCV

# Parameter distributions sampled by RandomizedSearchCV
param_dist = {
    'classifier__n_estimators': randint(150, 200),
    'classifier__criterion': ['gini'],
    'classifier__max_depth': [20, 25, 27, 30],
    'classifier__min_samples_split': [2],
    'classifier__min_samples_leaf': randint(10, 20),
    'classifier__max_features': ['log2'],
    'classifier__bootstrap': [True]
}

# Leave one core free, but never request fewer than one worker:
# os.cpu_count() can return None, and n_jobs=0 is rejected by joblib.
n_workers = max(1, (os.cpu_count() or 1) - 1)

# Instantiate RandomizedSearchCV (100 sampled candidates, 3-fold CV, custom cost scorer)
random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=100,
                                   scoring=custom_scorer, cv=3, verbose=1, random_state=1,
                                   n_jobs=n_workers, error_score='raise')

# Prepare training data: 'Class' is the target; 'Perform' is dropped from the features
X_train = data.drop(['Class', 'Perform'], axis=1)
y_train = data['Class']

# Fit the RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Access the best model and its parameters
best_model = random_search.best_estimator_
best_params = random_search.best_params_
best_score = random_search.best_score_

# Predict on test data
test_predictions = best_model.predict(test_data)

# Save predictions to a text file, one integer label per line
np.savetxt('submission.txt', test_predictions, fmt='%d')

print(f"Best parameters: {best_params}")
print(f"Best score: {best_score}")
print("Predictions saved to 'submission.txt'")


Fitting 3 folds for each of 100 candidates, totalling 300 fits




Best parameters: {'classifier__bootstrap': True, 'classifier__criterion': 'gini', 'classifier__max_depth': 30, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 18, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 199}
Best score: -0.8754983083978719
Predictions saved to 'submission.txt'


# CatBoost

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from scipy.stats import randint, uniform
from sklearn.model_selection import train_test_split
from sklearn import set_config
from catboost import CatBoostClassifier, CatBoostRegressor

# Read the semicolon-separated training and test files
data = pd.read_csv('data/training_data.csv', delimiter=';')
test_data = pd.read_csv('data/test_data_no_target.csv', delimiter=';')

# In both frames (training first, then test), every column after the first
# that was parsed as text gets its "," replaced by "." and is cast to float.
for frame in (data, test_data):
    for column in frame.columns[1:]:
        if frame[column].dtype == 'object':
            frame[column] = frame[column].str.replace(",", ".").astype(float)

# Features are all columns except the last two; the classification target is 'Class'
X = data.iloc[:, :-2]
y = data["Class"]

# The first column is treated as categorical, the rest of the features as numerical
numerical_col = data.columns[1:-2]
categorical_col = [data.columns[0]]

class GroupImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in numerical columns with a per-group statistic.

    Parameters
    ----------
    group_col : label
        Column whose values define the groups.
    strategy : {'mean', 'median', 'mode'}
        Statistic computed per group and used as the fill value.
    numerical_cols : sequence of labels
        Columns to impute.
    """

    # Strategies understood by fit().
    _STRATEGIES = ('mean', 'median', 'mode')

    def __init__(self, group_col, strategy, numerical_cols):
        self.group_col = group_col
        self.strategy = strategy
        self.numerical_cols = numerical_cols
        self.imputer_dict_ = {}

    def fit(self, X, y=None):
        """Compute the fill values of every group found in ``X``.

        The fitted mapping is rebuilt from scratch on each call, so groups
        seen only in an earlier fit are not carried over.

        Raises
        ------
        ValueError
            If ``strategy`` is not one of 'mean', 'median' or 'mode'. The
            check runs before any group is processed, so it also triggers
            when ``X`` contains no groups.
        """
        if self.strategy not in self._STRATEGIES:
            raise ValueError(f"Unknown strategy {self.strategy}")

        self.imputer_dict_ = {
            group: self._group_statistic(group_data[self.numerical_cols])
            for group, group_data in X.groupby(self.group_col)
        }
        return self

    def _group_statistic(self, frame):
        """Return the per-column fill values of ``frame`` for the configured strategy."""
        if self.strategy == 'mean':
            return frame.mean()
        if self.strategy == 'median':
            return frame.median()
        # 'mode': mode() may return several rows; the first row is used.
        return frame.mode().iloc[0]

    def transform(self, X):
        """Return a copy of ``X`` with missing numerical values filled per group.

        Groups that were not seen during ``fit`` have no stored fill values
        and are left unchanged.
        """
        X_transformed = X.copy()
        for group, group_data in X_transformed.groupby(self.group_col):
            imputer_values = self.imputer_dict_.get(group)
            if imputer_values is not None:
                for col in self.numerical_cols:
                    X_transformed.loc[group_data.index, col] = group_data[col].fillna(imputer_values[col])
        return X_transformed

class CustomNumericalTransformer(BaseEstimator, TransformerMixin):
    """Group-wise imputation followed by standard scaling of numerical columns.

    Parameters
    ----------
    group_col : label
        Column defining the groups used for imputation.
    numerical_cols : sequence of labels
        Columns that are imputed and scaled.
    strategy : str
        Imputation statistic forwarded to ``GroupImputer``.
    """

    def __init__(self, group_col, numerical_cols, strategy):
        self.group_col = group_col
        self.numerical_cols = numerical_cols
        self.strategy = strategy
        self.group_imputer = GroupImputer(group_col=group_col, strategy=strategy, numerical_cols=numerical_cols)
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        """Fit the group imputer, then fit the scaler on the imputed data.

        Both sub-estimators are rebuilt from the current parameter values,
        so changes made through ``set_params`` after construction take
        effect and no state from a previous fit is reused.
        """
        self.group_imputer = GroupImputer(
            group_col=self.group_col,
            strategy=self.strategy,
            numerical_cols=self.numerical_cols,
        )
        self.scaler = StandardScaler()

        self.group_imputer.fit(X, y)
        # Learn the scaling statistics from the imputed values, i.e. from the
        # same kind of data that transform() feeds to the scaler.
        imputed = self.group_imputer.transform(X)
        self.scaler.fit(imputed[self.numerical_cols])
        return self

    def transform(self, X):
        """Return an imputed copy of ``X`` whose numerical columns are scaled."""
        # group_imputer.transform works on a copy, so the caller's frame is
        # not modified by the assignment below.
        X = self.group_imputer.transform(X)
        X[self.numerical_cols] = self.scaler.transform(X[self.numerical_cols])
        return X

# Median imputation followed by standardisation for the numerical columns
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

preprocessor = ColumnTransformer(
    [('num', numerical_transformer, numerical_col)],
    remainder='passthrough',  # To keep the categorical columns
)

class GroupAwareTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that delegates to a ``CustomNumericalTransformer``.

    The three constructor arguments are stored as parameters and forwarded
    to the wrapped transformer, which is rebuilt whenever ``set_params`` is
    called.
    """

    def __init__(self, group_col, numerical_cols, strategy):
        self.group_col = group_col
        self.numerical_cols = numerical_cols
        self.strategy = strategy
        self.numerical_transformer = CustomNumericalTransformer(
            group_col, numerical_cols, strategy
        )

    def fit(self, X, y=None):
        """Fit the wrapped transformer on ``X`` and return ``self``."""
        self.numerical_transformer.fit(X, y)
        return self

    def transform(self, X):
        """Return ``X`` as transformed by the wrapped transformer."""
        return self.numerical_transformer.transform(X)

    def get_params(self, deep=True):
        """Return the three constructor parameters; ``deep`` is accepted but not used."""
        param_names = ("group_col", "numerical_cols", "strategy")
        return {name: getattr(self, name) for name in param_names}

    def set_params(self, **params):
        """Set the given attributes, then rebuild the wrapped transformer from them."""
        for name in params:
            setattr(self, name, params[name])
        self.numerical_transformer = CustomNumericalTransformer(
            self.group_col, self.numerical_cols, self.strategy
        )
        return self

# Classification pipeline: group-aware median imputation and scaling, then a
# CatBoost classifier that treats column index 0 as categorical
# (presumably the 'Group' column — confirm against the transformer output).
pipeline_classifier = Pipeline([
    (
        'preprocessor',
        GroupAwareTransformer(group_col='Group', numerical_cols=numerical_col, strategy='median'),
    ),
    (
        'classifier',
        CatBoostClassifier(random_state=42, verbose=False, cat_features=[0]),
    ),
])

def custom_predict_proba(probas):
    """Turn rows of three class probabilities into labels -1, 0 or 1.

    For each row, position 0 is read as the probability of class -1 and
    position 2 as the probability of class 1. A row gets -1 when position 0
    is at least 0.5, otherwise 1 when position 2 is at least 0.5, otherwise 0.

    Returns
    -------
    numpy.ndarray
        One label per input row.
    """
    def pick_label(row):
        # Class -1 is checked first, so it wins a 0.5 / 0.5 tie with class 1.
        if row[0] >= 0.5:
            return -1
        return 1 if row[2] >= 0.5 else 0

    return np.array([pick_label(row) for row in probas])

def custom_scorer(clf, X, y):
    """Return the negated mean misclassification cost of ``clf`` on ``(X, y)``.

    Predicted labels are obtained by passing ``clf.predict_proba(X)`` through
    ``custom_predict_proba``. Labels are shifted by +1 to index a 3x3 cost
    table, so they are expected to be -1, 0 or 1: a correct prediction costs
    0, an error by one class costs 1 and an error by two classes costs 2.
    """
    labels = custom_predict_proba(clf.predict_proba(X))
    penalties = np.array([[0, 1, 2], [1, 0, 1], [2, 1, 0]])

    # Accumulate the cost of each (true, predicted) pair directly; this equals
    # summing a confusion matrix weighted element-wise by the cost table.
    total_cost = np.float64(0.0)
    for actual, guess in zip(y, labels):
        total_cost += penalties[int(actual) + 1][int(guess) + 1]

    return -(total_cost / len(y))  # Negative because higher is better for RandomizedSearchCV

# Candidate values shared by the classifier and the regressor search
_catboost_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8, 10],
    'iterations': [100, 200, 300],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'min_data_in_leaf': [1, 3, 5, 7, 10],
    'max_bin': [128, 256, 512],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'bootstrap_type': ['Bernoulli'],
    'early_stopping_rounds': [20, 30, 40, 50],
}

# Settings common to both randomized searches
_search_options = dict(n_iter=10, cv=5, verbose=1, random_state=42, n_jobs=-1, error_score='raise')

# --- Classifier: searched with the cost-based scorer, target 'Class' ---
param_dist_classifier = {
    f'classifier__{name}': list(values) for name, values in _catboost_grid.items()
}

random_search_classifier = RandomizedSearchCV(
    pipeline_classifier,
    param_distributions=param_dist_classifier,
    scoring=custom_scorer,
    **_search_options,
)
random_search_classifier.fit(X, y)

best_model_classifier = random_search_classifier.best_estimator_
best_params_classifier = random_search_classifier.best_params_
best_score_classifier = random_search_classifier.best_score_

# --- Regressor: same preprocessing, searched on negative MSE, target 'Perform' ---
pipeline_regressor = Pipeline([
    (
        'preprocessor',
        GroupAwareTransformer(group_col='Group', numerical_cols=numerical_col, strategy='median'),
    ),
    (
        'regressor',
        CatBoostRegressor(random_state=42, verbose=False, cat_features=[0]),
    ),
])

param_dist_regressor = {
    f'regressor__{name}': list(values) for name, values in _catboost_grid.items()
}

random_search_regressor = RandomizedSearchCV(
    pipeline_regressor,
    param_distributions=param_dist_regressor,
    scoring='neg_mean_squared_error',
    **_search_options,
)
random_search_regressor.fit(X, data['Perform'])

best_model_regressor = random_search_regressor.best_estimator_
best_params_regressor = random_search_regressor.best_params_
best_score_regressor = random_search_regressor.best_score_

# --- Test-set predictions, written without index or header ---
test_probabilities = best_model_classifier.predict_proba(test_data)
df_classifier = pd.DataFrame(custom_predict_proba(test_probabilities), columns=['Predictions'])

test_regression = best_model_regressor.predict(test_data)
df_regressor = pd.DataFrame(test_regression, columns=['Predictions'])

df_classifier.to_csv('data/submit_classifier.csv', index=False, header=False)
df_regressor.to_csv('data/submit_regressor.csv', index=False, header=False)

# --- Report ---
print("Classifier Predictions:")
print(df_classifier.head())
print("\nRegressor Predictions:")
print(df_regressor.head())
print("\nBest Parameters (Classifier):", best_params_classifier)
print("Best Score (Classifier):", best_score_classifier)
print("\nBest Parameters (Regressor):", best_params_regressor)
print("Best Score (Regressor):", best_score_regressor)