In [11]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import  GridSearchCV
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [12]:


# Read the three pre-cleaned splits (train, test, validation) in that order.
train_set, test_set, val_set = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/smoking/trainClean.csv',
        '/kaggle/input/smoking/testClean.csv',
        '/kaggle/input/smoking/valClean.csv',
    )
)

In [13]:
# For every split: all columns except the last are features, the last column is the target.
X_train, y_train = train_set.iloc[:, :-1], train_set.iloc[:, -1]
X_test, y_test = test_set.iloc[:, :-1], test_set.iloc[:, -1]
X_val, y_val = val_set.iloc[:, :-1], val_set.iloc[:, -1]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

# Random-forest hyperparameter search with SMOTE oversampling.
#
# Uses X_train/y_train, X_val/y_val and X_test/y_test produced by the
# loading and splitting cells earlier in the notebook (the CSVs are not
# re-read here).

# Define the RandomForestClassifier and the SMOTE sampler
rf_classifier = RandomForestClassifier(random_state=42)
smote = SMOTE(random_state=42)

# SMOTE must live INSIDE the cross-validated estimator. Resampling the whole
# training set up front puts synthetic points, interpolated from training-fold
# rows, into the CV validation folds, which leaks information and inflates the
# scores used for model selection. The imblearn Pipeline applies the sampler
# during fit only; predict() passes data straight to the classifier.
rf_pipeline = Pipeline(steps=[
    ('smote', smote),
    ('rf', rf_classifier),
])

# Define the parameter grid to search ('rf__' routes each key to the forest step)
param_grid = {
    'rf__n_estimators': [50, 100, 150, 200],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10, 20],
    'rf__min_samples_leaf': [1, 2, 4, 8],
}

# Scorers tracked during cross-validation; 'f1_score' drives the refit below
scoring = {'precision': make_scorer(precision_score),
           'recall': make_scorer(recall_score),
           'f1_score': make_scorer(f1_score)}

# Create the StratifiedKFold cross-validator
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_pipeline, param_grid=param_grid, scoring=scoring,
                           cv=cv, refit='f1_score', verbose=2, n_jobs=-1)

# Fit on the ORIGINAL training data; oversampling happens per fold inside the pipeline
grid_search.fit(X_train, y_train)

# Get the best hyperparameters (keys carry the 'rf__' prefix)
best_params = grid_search.best_params_

# Evaluate the best model on the validation set
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)

# Evaluate performance metrics on the validation set
precision = precision_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred)

# Print results on the validation set
print("Best Hyperparameters:", best_params)
print("Precision on Validation Set:", precision)
print("Recall on Validation Set:", recall)
print("F1 Score on Validation Set:", f1)

# Calculate and print accuracy on the validation set
accuracy_val = accuracy_score(y_val, y_val_pred)
print("Accuracy on Validation Set:", accuracy_val)

# Print confusion matrix on the validation set
conf_matrix_val = confusion_matrix(y_val, y_val_pred)
print("\nConfusion Matrix on Validation Set:")
print(conf_matrix_val)

# Evaluate the best model on the test set
y_test_pred = best_model.predict(X_test)

# Evaluate performance metrics on the test set
precision_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)

# Print results on the test set
print("\nResults on Test Set:")
print("Precision on Test Set:", precision_test)
print("Recall on Test Set:", recall_test)
print("F1 Score on Test Set:", f1_test)

# Calculate and print accuracy on the test set
accuracy_test = accuracy_score(y_test, y_test_pred)
print("Accuracy on Test Set:", accuracy_test)

# Print confusion matrix on the test set
conf_matrix_test = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix on Test Set:")
print(conf_matrix_test)


In [14]:
# Count the rows labelled 0 and 1 in the training target.
num_negative_class = (y_train == 0).sum()
num_positive_class = (y_train == 1).sum()

# Negatives per positive; offered below as one scale_pos_weight candidate.
class_ratio = num_negative_class / num_positive_class

print(f'Class Ratio: {class_ratio}')

Class Ratio: 1.7253914988814318


In [15]:

# XGBoost hyperparameter search.
#
# Uses X_train/y_train, X_val/y_val, X_test/y_test from the split cell and
# class_ratio from the class-ratio cell.

# Create an XGBoost classifier (seeded, consistent with the random-forest cell)
xgb_classifier = xgb.XGBClassifier(random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'max_depth': [3, 5, 7, 15],
    # learning_rate (eta) is documented on the range [0, 1]; the previous
    # candidate of 10 only produced divergent models and wasted a quarter of
    # the grid. 0.3 is the XGBoost default.
    'learning_rate': [0.3, 0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 200, 250],
    'scale_pos_weight': [class_ratio, 2, 3, 5]
}

# Create the GridSearchCV object with 5-fold cross-validation
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model (refit on the full training set by GridSearchCV)
best_model = grid_search.best_estimator_

# Evaluate the best model on the validation set
y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)

# Print confusion matrix and classification report for validation set
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
print("Confusion Matrix (Validation):\n", val_conf_matrix)

val_classification_report = classification_report(y_val, y_val_pred)
print("Classification Report (Validation):\n", val_classification_report)

# Evaluate the best model on the test set
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

# Print confusion matrix and classification report for test set
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix (Test):\n", test_conf_matrix)

test_classification_report = classification_report(y_test, y_test_pred)
print("Classification Report (Test):\n", test_classification_report)

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 250, 'scale_pos_weight': 2}
Validation Accuracy: 0.8126645918123054
Confusion Matrix (Validation):
 [[4313  970]
 [ 595 2476]]
Classification Report (Validation):
               precision    recall  f1-score   support

           0       0.88      0.82      0.85      5283
           1       0.72      0.81      0.76      3071

    accuracy                           0.81      8354
   macro avg       0.80      0.81      0.80      8354
weighted avg       0.82      0.81      0.81      8354

Test Accuracy: 0.8089537945894183
Confusion Matrix (Test):
 [[4314  960]
 [ 636 2444]]
Classification Report (Test):
               precision    recall  f1-score   support

           0       0.87      0.82      0.84      5274
           1       0.72      0.79      0.75      3080

    accuracy                           0.81      8354
   macro avg       0.79      0.81      0.80      8354
weighted avg       0.81      0.81      0.