In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
import xgboost as xgb

In [2]:
# Load the raw training table; cells may still contain stray non-numeric
# characters at this point and are cleaned in a later cell.
train = pd.read_csv(
    filepath_or_buffer="/kaggle/input/mic-contest-mining-mines/Train.csv"
)

In [3]:
def clean_numeric(cell_value):
    """Reduce a raw cell to a string made only of digits and at most one dot.

    The value is stringified and stripped, every character that is neither a
    digit nor ``'.'`` is discarded, and if more than one dot survives only the
    first one is kept.

    Note that signs and exponent markers are discarded as well, so ``"-3.1"``
    yields ``"3.1"`` and ``"1e-05"`` yields ``"105"``.

    Parameters
    ----------
    cell_value : object
        Any value; it is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text, possibly empty when nothing numeric was found.
    """
    text = str(cell_value).strip()
    kept = ''.join(ch for ch in text if ch.isdigit() or ch == '.')

    # Split around the first dot; any further dots can only live in the tail.
    head, dot, tail = kept.partition('.')
    if '.' in tail:
        kept = head + dot + tail.replace('.', '')
    return kept

In [4]:
def clean_and_convert_to_float(df):
    """Coerce every non-'Label' column of ``df`` to a numeric dtype, in place.

    Columns that pandas already parsed as numeric are left untouched. Sending
    them through the string-based ``clean_numeric`` would corrupt any value
    whose ``str()`` form is not just digits and a dot: ``-0.5`` would lose its
    sign and ``1e-05`` would turn into ``105``.

    Every other column is cleaned cell by cell with ``clean_numeric`` and then
    converted with ``pd.to_numeric(errors='coerce')``, so cells that still
    cannot be parsed become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so callers can reassign it.
    """
    for col in df.columns:
        if col == 'Label':
            continue

        column = df[col]
        # Boolean columns are excluded from the shortcut so they keep going
        # through the text-cleaning path exactly as before.
        already_numeric = (
            pd.api.types.is_numeric_dtype(column)
            and not pd.api.types.is_bool_dtype(column)
        )
        if already_numeric:
            continue

        df[col] = pd.to_numeric(column.apply(clean_numeric), errors='coerce')
    return df

In [5]:
# Convert every feature column of the training table to numeric values.
train = train.pipe(clean_and_convert_to_float)

In [6]:
# Features: everything except the '??' column and the target.
X_train = train.drop(columns=['??', 'Label'])

# Target: encode the class letters as integers ('R' -> 0, 'M' -> 1).
# Any other label value would map to NaN.
Y_train = train.loc[:, 'Label'].map({'R': 0, 'M': 1})

In [7]:
# Learn per-feature mean/std on the training features, then standardise them.
# The fitted scaler is reused later for the test features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [8]:
# Candidate values for the randomized hyper-parameter search.
param_dist = dict(
    n_estimators=[100, 200, 300, 500],
    max_depth=[3, 4, 5, 6, 7],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.2, 0.5],
    min_child_weight=[1, 3, 5],
    scale_pos_weight=[1, 2, 5],
)

In [9]:
# Base classifier; the remaining hyper-parameters are filled in by the search.
model = xgb.XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
)

In [10]:
# 5-fold stratified cross-validation (no shuffling, so the folds are fixed).
cv = StratifiedKFold(n_splits=5)

# Sample 20 candidate configurations from param_dist and score each by
# cross-validated accuracy. random_state pins which candidates are drawn;
# without it every run evaluates a different set of configurations, so the
# reported best parameters and score cannot be reproduced.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    scoring='accuracy',
    n_iter=20,
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

random_search.fit(X_train, Y_train)

print(f'Best parameters: {random_search.best_params_}')
print(f'Best cross-validation accuracy: {random_search.best_score_:.2f}')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters: {'subsample': 1.0, 'scale_pos_weight': 1, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.8}
Best cross-validation accuracy: 0.72


In [11]:
# Keep a handle on the estimator selected by the randomized search; it is
# used below to predict on the test set.
best_model = random_search.best_estimator_

In [12]:
# Load the test table and run it through the same numeric cleaning as the
# training data.
test = (
    pd.read_csv('/kaggle/input/mic-contest-mining-mines/Test.csv')
    .pipe(clean_and_convert_to_float)
)

In [13]:
# Select the feature columns S1..S30 in order and scale them with the scaler
# that was fitted on the training features.
X_test = test[[f'S{i}' for i in range(1, 31)]]
X_test = scaler.transform(X_test)

# Predict integer class ids with the tuned model.
Y_test = best_model.predict(X_test)

# Translate the integer class ids back to the original letter labels.
label_mapping = {0: 'R', 1: 'M'}
Y_test = np.vectorize(label_mapping.__getitem__)(Y_test)

In [14]:
# Attach the predicted labels and keep only the columns written to the
# submission file.
test['Label'] = Y_test
csv = test.loc[:, ['Id', "Label"]]
csv.to_csv("t4.csv", index=False)

print("Predictions saved to t4.csv")

Predictions saved to t4.csv
