In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import NearMiss
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import classification_report
from collections import Counter
import numpy as np

# Load dataset
dataset = pd.read_csv('/content/drive/My Drive/labelled_data.csv')

# Fill missing values
dataset.update(dataset[['NSCLC', 'SCLC']].fillna(0))

# Selecting features and labels
features = dataset.iloc[:, [2, 3, 5]]
label_sclc = dataset['SCLC']
label_nsclc = dataset['NSCLC']

# Applying NearMiss Algorithm
near_miss = NearMiss()
print(f'SCLC Original dataset shape: {Counter(label_sclc)}')
features_sclc, target_sclc = near_miss.fit_resample(features, label_sclc)
print(f'SCLC Resampled dataset shape: {Counter(target_sclc)}')

print(f'NSCLC Original dataset shape: {Counter(label_nsclc)}')
features_nsclc, target_nsclc = near_miss.fit_resample(features, label_nsclc)
print(f'NSCLC Resampled dataset shape: {Counter(target_nsclc)}')

# Splitting data into training and testing sets
x_train_sclc, x_test_sclc, y_train_sclc, y_test_sclc = train_test_split(features_sclc, target_sclc, test_size=0.2, random_state=42)
x_train_nsclc, x_test_nsclc, y_train_nsclc, y_test_nsclc = train_test_split(features_nsclc, target_nsclc, test_size=0.2, random_state=42)

# Standardizing features
scaler_sclc = StandardScaler()
x_train_sclc = scaler_sclc.fit_transform(x_train_sclc)
x_test_sclc = scaler_sclc.transform(x_test_sclc)

scaler_nsclc = StandardScaler()
x_train_nsclc = scaler_nsclc.fit_transform(x_train_nsclc)
x_test_nsclc = scaler_nsclc.transform(x_test_nsclc)

SCLC Original dataset shape: Counter({0.0: 18857, 1.0: 921})
SCLC Resampled dataset shape: Counter({0.0: 921, 1.0: 921})
NSCLC Original dataset shape: Counter({0.0: 19087, 1.0: 691})
NSCLC Resampled dataset shape: Counter({0.0: 691, 1.0: 691})


In [8]:
# Hyperparameter grid setup for Random Forest
hyperparameters = {
    'n_estimators': [int(x) for x in np.linspace(start=200, stop=2000, num=10)],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [int(x) for x in np.linspace(10, 110, num=11)] + [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Randomized Search for Hyperparameter Tuning for SCLC
rf_classifier_sclc = RandomForestClassifier()
random_search_sclc = RandomizedSearchCV(
    estimator=rf_classifier_sclc, param_distributions=hyperparameters, n_iter=100, cv=3, verbose=2, n_jobs=-1
)
random_search_sclc.fit(x_train_sclc, y_train_sclc)
optimal_params_sclc = random_search_sclc.best_params_

# RandomForest with Tuned Parameters for SCLC
rf_optimized_sclc = RandomForestClassifier(**optimal_params_sclc)
rf_optimized_sclc.fit(x_train_sclc, y_train_sclc)
predictions_sclc = rf_optimized_sclc.predict(x_test_sclc)
print("\n=== Classification Report for SCLC ===")
print(classification_report(y_test_sclc, predictions_sclc))

# Randomized Search for Hyperparameter Tuning for NSCLC
rf_classifier_nsclc = RandomForestClassifier()
random_search_nsclc = RandomizedSearchCV(
    estimator=rf_classifier_nsclc, param_distributions=hyperparameters, n_iter=100, cv=3, verbose=2, n_jobs=-1
)
random_search_nsclc.fit(x_train_nsclc, y_train_nsclc)
optimal_params_nsclc = random_search_nsclc.best_params_

# RandomForest with Tuned Parameters for NSCLC
rf_optimized_nsclc = RandomForestClassifier(**optimal_params_nsclc)
rf_optimized_nsclc.fit(x_train_nsclc, y_train_nsclc)
predictions_nsclc = rf_optimized_nsclc.predict(x_test_nsclc)
print("\n=== Classification Report for NSCLC ===")
print(classification_report(y_test_nsclc, predictions_nsclc))

Fitting 3 folds for each of 100 candidates, totalling 300 fits

=== Classification Report for SCLC ===
              precision    recall  f1-score   support

         0.0       0.82      0.85      0.83       188
         1.0       0.84      0.80      0.82       181

    accuracy                           0.83       369
   macro avg       0.83      0.83      0.83       369
weighted avg       0.83      0.83      0.83       369

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  warn(
  warn(



=== Classification Report for NSCLC ===
              precision    recall  f1-score   support

         0.0       0.79      0.90      0.84       142
         1.0       0.88      0.74      0.80       135

    accuracy                           0.82       277
   macro avg       0.83      0.82      0.82       277
weighted avg       0.83      0.82      0.82       277

