In [22]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score
import xgboost as xgb
from matplotlib import pyplot as plt
import seaborn as sns

In [23]:
df = pd.read_csv('EGFR_3.csv')

In [24]:
df['response'] = LabelEncoder().fit_transform(df['response'])

In [25]:
from imblearn.over_sampling import SMOTE

In [26]:
X = df.drop('response', axis=1)
y = df['response']

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 4: Initialize SMOTE
smote = SMOTE(random_state=42)

# Step 5: Fit and transform the training data
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Step 6: Create a new DataFrame for the resampled data
resampled_data = pd.DataFrame(X_resampled, columns=X.columns)
resampled_data['Response'] = y_resampled

# Step 7: Combine the original training data with the new resampled data
# Drop the original minority class instances from the training set before concatenation
X_train_combined = pd.concat([X_train[y_train == 0], resampled_data], ignore_index=True)
y_train_combined = pd.concat([y_train[y_train == 0], pd.Series(y_resampled)], ignore_index=True)

# Create a new DataFrame with combined features and target
final_train_data = pd.DataFrame(X_train_combined, columns=X.columns)
final_train_data['Response'] = y_train_combined

# Optional: Check the shape and class distribution of the new training set
print("Original training set shape:", X_train.shape, y_train.shape)
print("Resampled training set shape:", final_train_data.shape)
print("\nResampled class distribution:")
print(final_train_data['Response'].value_counts())

Original training set shape: (160, 22) (160,)
Resampled training set shape: (391, 23)

Resampled class distribution:
Response
0    109
2     94
3     94
1     94
Name: count, dtype: int64


In [28]:
model = xgb.XGBClassifier(random_state=42)

In [39]:
model.fit(df.drop('response', axis=1), df['response'])

In [40]:
y_prediction = model.predict(X_test)

In [41]:
accuracy_score(y_test, y_prediction)

1.0

In [42]:
print(classification_report(y_test, y_prediction))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00        24
           3       1.00      1.00      1.00         7

    accuracy                           1.00        41
   macro avg       1.00      1.00      1.00        41
weighted avg       1.00      1.00      1.00        41



In [43]:
from sklearn.model_selection import GridSearchCV

In [44]:
params = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 2, 3]
}

In [45]:
grid_model = GridSearchCV(model, param_grid=params, cv=3, n_jobs=-1, verbose=1)

In [46]:
grid_model.fit(final_train_data.drop('Response', axis=1), final_train_data['Response'])

Fitting 3 folds for each of 243 candidates, totalling 729 fits


In [47]:
grid_model.best_params_

{'gamma': 0,
 'learning_rate': 0.01,
 'max_depth': 4,
 'min_child_weight': 1,
 'n_estimators': 300}

In [48]:
grid_model.best_score_

0.9642787238207086