<a href="https://colab.research.google.com/github/aadhya2811/hackathon/blob/main/lung_cancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub

# Download the latest version of the lung-cancer CT scan dataset through
# kagglehub; the value returned by dataset_download is kept in `path` and is
# used further down as the root directory that is walked for images.
path = kagglehub.dataset_download("borhanitrash/lung-cancer-ct-scan-dataset")

# Echo the download location so it is visible in the notebook output
print("Path to dataset files:",path)

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.models import load_model
import cv2
import json
import numpy as np
import pandas as pd
import os
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input
from skimage import filters, segmentation, morphology
from skimage.feature import canny
from skimage.morphology import remove_small_objects
from scipy import ndimage
from skimage.feature import graycomatrix, graycoprops
import tensorflow as tf
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
from tensorflow.keras.applications import ResNet50
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from joblib import Parallel, delayed
from tensorflow.keras.applications.resnet50 import preprocess_input
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from skimage.measure import label, regionprops
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load pre-trained EfficientNetB0 for feature extraction: ImageNet weights,
# classification head removed (include_top=False), 224x224 3-channel input
base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Root directory that is walked for input images; here it is the kagglehub
# download location stored in `path` (point it elsewhere to use a local copy)
dataset_path = path
# Directory that receives the per-cluster image folders; created if missing
output_dir = "classified_images"
os.makedirs(output_dir, exist_ok=True)

def preprocess_image(image_path):
    """Load a grayscale image, enhance its contrast, and smooth it.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The single-channel image after CLAHE contrast enhancement
        (clip limit 2.0, 8x8 tiles) followed by a 5x5 Gaussian blur.

    Raises:
        ValueError: If OpenCV cannot read or decode ``image_path``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread reports a missing or undecodable file by returning None
    # rather than raising; fail here with the offending path instead of
    # letting clahe.apply crash with an opaque OpenCV error.
    if image is None:
        raise ValueError(f"Could not read image: {image_path}")

    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    clahe_image = clahe.apply(image)
    blurred_image = cv2.GaussianBlur(clahe_image, (5, 5), 0)
    return blurred_image

def extract_features(image):
    """Compute three handcrafted descriptors for one grayscale image.

    Returns a list ``[contrast, homogeneity, area]``: the first two are GLCM
    texture properties (distance 1, angle 0, 256 grey levels) and ``area`` is
    the number of pixels whose value is greater than zero, as a Python int.
    """
    cooccurrence = graycomatrix(
        image,
        distances=[1],
        angles=[0],
        levels=256,
        symmetric=True,
        normed=True,
    )
    # A single distance/angle pair yields a 1x1 result per property
    texture = [
        graycoprops(cooccurrence, prop)[0, 0]
        for prop in ('contrast', 'homogeneity')
    ]
    positive_pixels = int(np.count_nonzero(image > 0))
    return [*texture, positive_pixels]

def extract_cnn_features(images):
    """Extract flattened EfficientNetB0 feature vectors for grayscale images.

    Args:
        images: Iterable of single-channel images (anything accepted by
            ``cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)``).

    Returns:
        A 2-D array with one row per image, holding the flattened output of
        the module-level ``base_model``.
    """
    # Import under an explicit alias: the module-level name
    # ``preprocess_input`` is imported a second time from the ResNet50 module
    # further down the import list, so the bare name no longer refers to the
    # EfficientNet preprocessing that ``base_model`` was trained with.
    from tensorflow.keras.applications.efficientnet import (
        preprocess_input as efficientnet_preprocess_input,
    )

    # Convert to 3 channels and resize in one pass; float32 because
    # EfficientNet expects float pixel values in the 0-255 range.
    batch = np.array(
        [
            cv2.resize(cv2.cvtColor(img, cv2.COLOR_GRAY2RGB), (224, 224))
            for img in images
        ],
        dtype=np.float32,
    )
    batch = efficientnet_preprocess_input(batch)

    cnn_features = base_model.predict(batch)
    # Collapse the spatial/channel axes so each image becomes one row
    return cnn_features.reshape(cnn_features.shape[0], -1)

def apply_tsne(features):
    """Project feature vectors to two dimensions with t-SNE.

    Args:
        features: Feature matrix with one row per sample.

    Returns:
        The embedding produced by ``TSNE.fit_transform`` with
        ``n_components=2`` and a fixed ``random_state`` of 42.
    """
    n_samples = len(features)
    # scikit-learn requires perplexity < n_samples, and its default of 30
    # makes small image sets fail. Cap it for small inputs; the value is
    # unchanged (30) whenever there are more than 30 samples.
    perplexity = min(30.0, n_samples - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    return tsne.fit_transform(features)

def cluster_and_save_images(images, features, image_paths, n_clusters=3):
    """Cluster images with K-Means and write each one into its cluster folder.

    Folders are named ``cluster_<k>`` and are created under the module-level
    ``output_dir``.

    Args:
        images: Sequence of images, index-aligned with ``image_paths``.
        features: Feature matrix with one row per image, used for clustering.
        image_paths: Source paths; only the base name is reused for output.
        n_clusters: Number of K-Means clusters and output folders
            (defaults to 3, the previously hard-coded value).
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(features)

    # One folder per cluster id, created up front
    cluster_dirs = {
        cluster_id: os.path.join(output_dir, f"cluster_{cluster_id}")
        for cluster_id in range(n_clusters)
    }
    for d in cluster_dirs.values():
        os.makedirs(d, exist_ok=True)

    # Save images in their respective folders
    written_paths = set()
    for i, image_path in enumerate(image_paths):
        target_dir = cluster_dirs[int(labels[i])]
        filename = os.path.basename(image_path)
        save_path = os.path.join(target_dir, filename)

        # Inputs gathered from different directories can share a base name;
        # add a numeric suffix rather than overwrite an image that was
        # already written during this call.
        stem, ext = os.path.splitext(filename)
        duplicate = 1
        while save_path in written_paths:
            save_path = os.path.join(target_dir, f"{stem}_{duplicate}{ext}")
            duplicate += 1
        written_paths.add(save_path)

        # cv2.imwrite reports failure through its return value, not an
        # exception; surface it instead of silently losing the image.
        if not cv2.imwrite(save_path, images[i]):
            print(f"Warning: could not write {save_path}")

    print(f"Images successfully classified and saved into {output_dir}")

# Load and process dataset
all_features, all_images, image_paths = [], [], []

# Per-directory cap on the number of images read (presumably to keep the run
# short) and the file extensions treated as images.
MAX_IMAGES_PER_DIR = 10
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

for root, dirs, files in os.walk(dataset_path):
    # Sort in place so the traversal, and therefore the selected subset, is
    # the same on every run (os.walk order is otherwise arbitrary).
    dirs.sort()
    # Filter by extension BEFORE applying the cap, so non-image files do not
    # use up slots among the first MAX_IMAGES_PER_DIR entries.
    image_files = sorted(f for f in files if f.lower().endswith(IMAGE_EXTENSIONS))
    for file in image_files[:MAX_IMAGES_PER_DIR]:
        image_path = os.path.join(root, file)
        preprocessed_image = preprocess_image(image_path)
        resized_image = cv2.resize(preprocessed_image, (224, 224))
        # At this point extract_features is the single-image version defined
        # above (the name is redefined for batches later in the file).
        handcrafted_features = extract_features(resized_image)

        all_features.append(handcrafted_features)
        all_images.append(resized_image)
        image_paths.append(image_path)

# Fail early with a clear message instead of an obscure model/t-SNE error
if not all_images:
    raise RuntimeError(f"No images found under {dataset_path}")

# Convert to NumPy arrays
all_images = np.array(all_images)
cnn_features = extract_cnn_features(all_images)
handcrafted_features = np.array(all_features)

# Combine CNN and handcrafted features (CNN columns first)
combined_features = np.hstack((cnn_features, handcrafted_features))

# Apply t-SNE for dimensionality reduction
tsne_features = apply_tsne(combined_features)

# Perform clustering on the 2-D embedding and save images
cluster_and_save_images(all_images, tsne_features, image_paths)


def load_data(dataset_path):
    """Load clustered images and their integer cluster labels from disk.

    Every sub-directory of ``dataset_path`` named ``cluster_<k>`` (``k`` a
    non-negative integer) is scanned; each readable image in it is loaded as
    grayscale, resized to 224x224, and labelled ``k``. Other entries and
    files OpenCV cannot decode are skipped.

    Args:
        dataset_path: Directory holding the ``cluster_<k>`` folders.

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays, index-aligned.
    """
    images = []
    labels = []
    prefix = 'cluster_'

    # Sorted listings make the sample order reproducible across runs
    for cluster_dir in sorted(os.listdir(dataset_path)):
        cluster_path = os.path.join(dataset_path, cluster_dir)
        suffix = cluster_dir[len(prefix):]
        is_cluster_dir = (
            os.path.isdir(cluster_path)
            and cluster_dir.startswith(prefix)
            and suffix.isdecimal()
        )
        # Ignore stray files and folders that are not cluster output instead
        # of failing on a label lookup.
        if not is_cluster_dir:
            continue
        label = int(suffix)

        for image_file in sorted(os.listdir(cluster_path)):
            image_path = os.path.join(cluster_path, image_file)
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            # cv2.imread returns None for files it cannot decode
            if image is None:
                continue
            images.append(cv2.resize(image, (224, 224)))
            labels.append(label)

    return np.array(images), np.array(labels)

# Load data: rebind dataset_path to the folder that holds the per-cluster
# image directories and read the images back together with their cluster
# ids, which serve as the class labels for the classifiers below
dataset_path = "classified_images"
images, labels = load_data(dataset_path)

# Single-image feature extractor, kept under its own name so that the
# batch-level extract_features can call it without a naming conflict
def extract_single_image_features(image):
    """Return ``[contrast, homogeneity, area]`` for one grayscale image.

    Contrast and homogeneity come from a normalised, symmetric GLCM
    (distance 1, angle 0, 256 levels); area counts the pixels above zero.
    """
    glcm_options = dict(distances=[1], angles=[0], levels=256, symmetric=True, normed=True)
    matrix = graycomatrix(image, **glcm_options)

    def _scalar_property(name):
        # One distance and one angle give a 1x1 array; unwrap it
        return graycoprops(matrix, name)[0, 0]

    contrast_value = _scalar_property('contrast')
    homogeneity_value = _scalar_property('homogeneity')
    positive_pixels = int(np.count_nonzero(image > 0))
    return [contrast_value, homogeneity_value, positive_pixels]

def extract_features(images):
    """Build the handcrafted feature array for a batch of images.

    Applies extract_single_image_features to each image in order and returns
    the results as a NumPy array with one row per image. This definition
    replaces the single-image ``extract_features`` defined earlier in the file.
    """
    return np.array([extract_single_image_features(img) for img in images])

# Extract handcrafted features (batch version of extract_features defined
# just above; one row of [contrast, homogeneity, area] per image)
handcrafted_features = extract_features(images)

# Extract CNN features with extract_cnn_features, defined earlier in this file
cnn_features = extract_cnn_features(images)

# Combine features column-wise: CNN columns first, then the handcrafted ones
combined_features = np.hstack((cnn_features, handcrafted_features))

# Hold out 20% of the samples for evaluation; the fixed random_state makes
# the split repeatable for a given sample order.
# NOTE(review): the split is not stratified, so small clusters may be
# under-represented in the test set - consider stratify=labels.
X_train, X_test, y_train, y_test = train_test_split(combined_features, labels, test_size=0.2, random_state=42)
# Support-vector machine with a linear kernel, fitted on the training split
svm_classifier = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Held-out predictions and metrics for the SVM
y_pred_svm = svm_classifier.predict(X_test)
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))

# Random forest of 100 trees, fitted on the same training split
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Held-out predictions and metrics for the random forest
y_pred_rf = rf_classifier.predict(X_test)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

import joblib

# Save the trained models to the current working directory
joblib.dump(svm_classifier, 'svm_classifier.pkl')
joblib.dump(rf_classifier, 'random_forest_classifier.pkl')

# Download them to your local system. google.colab only exists inside a Colab
# runtime, so guard the import: elsewhere the models are already saved on
# disk and the script should not crash at this last step.
try:
    from google.colab import files
except ImportError:
    print("google.colab not available; models saved locally as "
          "svm_classifier.pkl and random_forest_classifier.pkl")
else:
    files.download('svm_classifier.pkl')
    files.download('random_forest_classifier.pkl')



Downloading from https://www.kaggle.com/api/v1/datasets/download/borhanitrash/lung-cancer-ct-scan-dataset?dataset_version_number=1...


100%|██████████| 119M/119M [00:02<00:00, 59.8MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/borhanitrash/lung-cancer-ct-scan-dataset/versions/1
Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 5s/step
Images successfully classified and saved into classified_images
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step
SVM Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.71      0.83         7
           1       1.00      1.00      1.00         4
           2       0.82      1.00      0.90         9

    accuracy                           0.90        20
   macro avg       0.94      0.90      0.91        20
weighted avg       0.92      0.90      0.90        20

SVM Accuracy: 0.9
Random Forest Classification Report:
              precision    recall  f1-score 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>