# **Feature Engineering**

In [None]:
import os
import numpy as np
import pandas as pd
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing import image
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import os
# Disable XLA device registration through TensorFlow's flag environment variable.
# NOTE(review): this is assigned after the keras imports above have already run;
# confirm TensorFlow still reads TF_XLA_FLAGS at this point.
os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_xla_devices=false"

# Load the pre-trained VGG16 model (ImageNet weights, without the top
# classification layer). extract_image_features() reads this module-level name.
model = VGG16(weights='imagenet', include_top=False)

# Updated feature extraction function
def extract_image_features(image_path):
    """Return the flattened VGG16 feature vector for one image file.

    The image is loaded at 224x224, run through ``preprocess_input`` and the
    module-level ``model`` (looked up at call time), and the prediction is
    flattened to 1-D.

    Args:
        image_path: Path of the image file to encode.

    Returns:
        A 1-D numpy array. When the file does not exist, or when loading or
        prediction fails, an all-zero vector of length 7 * 7 * 512 is
        returned so that every row keeps the same shape.

    Warns:
        RuntimeWarning: when loading or prediction raises. The zero-vector
        fallback is kept, but the failure is no longer invisible.
    """
    import warnings  # local import keeps this block self-contained

    # Length of the flattened VGG16 output for a 224x224 input.
    feature_len = 7 * 7 * 512

    try:
        if not os.path.exists(image_path):
            return np.zeros((feature_len,))
        img = image.load_img(image_path, target_size=(224, 224))
        batch = np.expand_dims(image.img_to_array(img), axis=0)
        batch = preprocess_input(batch)
        features = model.predict(batch, verbose=0)
    except Exception as exc:
        # Best-effort by design: one bad image must not abort the whole
        # DataFrame.apply, but the caller should know zeros were substituted.
        warnings.warn(
            f"Feature extraction failed for {image_path!r}: {exc!r}; "
            "returning a zero vector",
            RuntimeWarning,
            stacklevel=2,
        )
        return np.zeros((feature_len,))

    # Flatten so every image yields the same 1-D shape.
    return features.flatten()

In [None]:
# Read the metadata table and point at the folder holding the fundus images.
df = pd.read_csv("/kaggle/input/ocular-disease-recognition-odir5k/full_df.csv")
dataset_dir = "/kaggle/input/ocular-disease-recognition-odir5k/preprocessed_images/"


def _features_for_column(column):
    """Run extract_image_features on every file name stored in df[column]."""
    return df[column].apply(
        lambda file_name: extract_image_features(os.path.join(dataset_dir, file_name))
    )


# One feature vector per row, for each eye.
df['Left-Fundus Features'] = _features_for_column('Left-Fundus')
df['Right-Fundus Features'] = _features_for_column('Right-Fundus')

print("Working....")

# Stack the per-row vectors into 2-D arrays (one row per record).
left_features = np.array(df['Left-Fundus Features'].tolist())
right_features = np.array(df['Right-Fundus Features'].tolist())

In [None]:
# Apply PCA: reduce each eye's feature matrix to 50 components.
# random_state is fixed because, for inputs this large, scikit-learn's
# automatic solver choice may select the randomized SVD, whose output
# otherwise changes from run to run.
# Each eye gets its own estimator so both fitted projections stay available;
# note that the two projections are fitted independently and therefore do not
# share a basis.
left_pca = PCA(n_components=50, random_state=42)
right_pca = PCA(n_components=50, random_state=42)
left_fundus_pca = left_pca.fit_transform(left_features)
right_fundus_pca = right_pca.fit_transform(right_features)

# `pca` previously ended up fitted on the right-eye features; keep that name.
pca = right_pca

# Add PCA features back to the DataFrame (one 50-element array per row).
df['Left-Fundus PCA'] = list(left_fundus_pca)
df['Right-Fundus PCA'] = list(right_fundus_pca)

# Drop the file-name columns and the raw feature columns.
df_final = df.drop(['Left-Fundus', 'Right-Fundus', 'Left-Fundus Features', 'Right-Fundus Features'], axis=1)

print(df_final.head())

In [None]:
# Show which columns the DataFrame currently holds.
column_index = df.columns
print(column_index)


In [None]:
# Count the rows whose 'Patient Age' is missing.
missing_age_count = df['Patient Age'].isna().sum()
print(missing_age_count)


In [None]:
# Check for missing values in 'Patient Age'
print(df['Patient Age'].isnull().sum())

# Fill missing ages with the column mean. The result is assigned back to the
# column: calling fillna(inplace=True) on a column selection is chained
# assignment, which pandas warns about and which stops updating `df` under
# Copy-on-Write.
df['Patient Age'] = df['Patient Age'].fillna(df['Patient Age'].mean())

# Define age groups once, after imputation, so no row is binned from a
# missing age. The last bin is open-ended to match its '61+' label, and
# include_lowest=True keeps age 0 inside '0-20'.
bins = [0, 20, 40, 60, float("inf")]
labels = ['0-20', '21-40', '41-60', '61+']
df['Age Group'] = pd.cut(df['Patient Age'], bins=bins, labels=labels, include_lowest=True)

# Encode 'Patient Sex' as 'Gender' ('Male' = 0, 'Female' = 1); any other
# value maps to NaN.
df['Gender'] = df['Patient Sex'].map({'Male': 0, 'Female': 1})



In [None]:
# Tabulate how often each 'target' value occurs within every
# (Age Group, Gender) combination; combinations that never occur become 0.
# NOTE(review): other cells use column 'C' as the cataract flag - confirm
# that 'target' is the column intended here.
target_by_group = df.groupby(['Age Group', 'Gender'])['target']
target_counts = target_by_group.value_counts()
cataract_by_age_gender = target_counts.unstack().fillna(0)

# Print the result
print(cataract_by_age_gender)


In [None]:
# Demographic and PCA overview plots, all drawn from df_final
# (columns read: 'Patient Age', 'Patient Sex', 'Left-Fundus PCA', 'Right-Fundus PCA').

# --- Age histogram with a KDE overlay ---
patient_ages = df_final['Patient Age']
plt.figure(figsize=(8, 5))
sns.histplot(patient_ages, bins=20, kde=True, color='blue')
plt.title("Age Distribution")
plt.xlabel("Age")
plt.ylabel("Frequency")
plt.show()

# --- Gender bar chart ('Male' -> 0, 'Female' -> 1) ---
sex_codes = {'Male': 0, 'Female': 1}
df_final['Gender'] = df_final['Patient Sex'].map(sex_codes)
gender_counts = df_final['Gender'].value_counts()
plt.figure(figsize=(6, 4))
sns.barplot(x=gender_counts.index, y=gender_counts.values, palette="muted")
plt.title("Gender Distribution")
plt.xlabel("Gender")
plt.ylabel("Count")
plt.show()

# --- First PCA component of the left eye against the right eye ---
left_first_component = df_final['Left-Fundus PCA'].map(lambda components: components[0])
right_first_component = df_final['Right-Fundus PCA'].map(lambda components: components[0])
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x=left_first_component,
    y=right_first_component,
    hue=df_final['Gender'],
    palette="viridis",
    alpha=0.7,
)
plt.title("Scatter Plot of PCA Features")
plt.xlabel("Left Fundus PCA Component 1")
plt.ylabel("Right Fundus PCA Component 1")
plt.show()

In [None]:
# Apply PCA: reduce each eye's feature matrix to 50 components.
# random_state is fixed because, for inputs this large, scikit-learn's
# automatic solver choice may select the randomized SVD, whose output
# otherwise changes from run to run.
# Each eye gets its own estimator so both fitted projections stay available;
# the two projections are fitted independently and do not share a basis.
left_pca = PCA(n_components=50, random_state=42)
right_pca = PCA(n_components=50, random_state=42)
left_fundus_pca = left_pca.fit_transform(left_features)
right_fundus_pca = right_pca.fit_transform(right_features)

# `pca` previously ended up fitted on the right-eye features; keep that name.
pca = right_pca

# Add PCA features back to the DataFrame (one 50-element array per row).
df['Left-Fundus PCA'] = list(left_fundus_pca)
df['Right-Fundus PCA'] = list(right_fundus_pca)

# Drop the file-name columns and the raw feature columns.
df_final = df.drop(['Left-Fundus', 'Right-Fundus', 'Left-Fundus Features', 'Right-Fundus Features'], axis=1)

print(df_final.head())

# Demographic and PCA overview plots, all drawn from df_final
# (columns read: 'Patient Age', 'Gender', 'Left-Fundus PCA', 'Right-Fundus PCA').

# --- Age histogram with a KDE overlay ---
patient_ages = df_final['Patient Age']
plt.figure(figsize=(8, 5))
sns.histplot(patient_ages, bins=20, kde=True, color='blue')
plt.title("Age Distribution")
plt.xlabel("Age")
plt.ylabel("Frequency")
plt.show()

# --- Gender bar chart (uses the 'Gender' column already present) ---
gender_counts = df_final['Gender'].value_counts()
plt.figure(figsize=(6, 4))
sns.barplot(x=gender_counts.index, y=gender_counts.values, palette="muted")
plt.title("Gender Distribution")
plt.xlabel("Gender")
plt.ylabel("Count")
plt.show()

# --- First PCA component of the left eye against the right eye ---
left_first_component = df_final['Left-Fundus PCA'].map(lambda components: components[0])
right_first_component = df_final['Right-Fundus PCA'].map(lambda components: components[0])
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x=left_first_component,
    y=right_first_component,
    hue=df_final['Gender'],
    palette="viridis",
    alpha=0.7,
)
plt.title("Scatter Plot of PCA Features")
plt.xlabel("Left Fundus PCA Component 1")
plt.ylabel("Right Fundus PCA Component 1")
plt.show()


# **Classification**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk every file under /kaggle/input. Nothing is printed or stored: the path
# is built and immediately discarded.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # The print that would list each file is disabled; only the join remains.
        os.path.join(dirname, filename)

In [None]:
# Reload the metadata table and preview its first three rows.
metadata_csv = "/kaggle/input/ocular-disease-recognition-odir5k/full_df.csv"
df = pd.read_csv(metadata_csv)
df.head(n=3)

In [None]:
def has_cataract(text):
    """Return 1 if the diagnostic keyword text mentions "cataract", else 0.

    Args:
        text: Diagnostic keyword string for one eye. Non-string values (for
            example the NaN pandas uses for an empty cell, or None) are
            treated as "no cataract" instead of raising TypeError.

    Returns:
        1 when "cataract" occurs anywhere in the text, ignoring case;
        0 otherwise.
    """
    if not isinstance(text, str):
        return 0
    return int("cataract" in text.casefold())

In [None]:
# Flag, per eye, whether the diagnostic keywords mention a cataract.
left_keywords = df["Left-Diagnostic Keywords"]
right_keywords = df["Right-Diagnostic Keywords"]
df["left_cataract"] = left_keywords.apply(has_cataract)
df["right_cataract"] = right_keywords.apply(has_cataract)

In [None]:
# Flag, per eye, whether the diagnostic keywords mention a cataract.
df["left_cataract"] = df["Left-Diagnostic Keywords"].apply(has_cataract)
df["right_cataract"] = df["Right-Diagnostic Keywords"].apply(has_cataract)

# Cataract images: rows with C == 1 whose keywords for that eye mention it.
row_has_c = df.C == 1
left_cataract = df.loc[row_has_c & (df.left_cataract == 1), "Left-Fundus"].values
right_cataract = df.loc[row_has_c & (df.right_cataract == 1), "Right-Fundus"].values

print("Number of images in left cataract: {}".format(len(left_cataract)))
print("Number of images in right cataract: {}".format(len(right_cataract)))

# Normal images: rows with C == 0 whose keywords for that eye are exactly
# "normal fundus"; 250 file names are sampled per eye with a fixed seed.
row_no_c = df.C == 0
left_normal_pool = df.loc[row_no_c & (df["Left-Diagnostic Keywords"] == "normal fundus"), "Left-Fundus"]
right_normal_pool = df.loc[row_no_c & (df["Right-Diagnostic Keywords"] == "normal fundus"), "Right-Fundus"]
left_normal = left_normal_pool.sample(250, random_state=42).values
right_normal = right_normal_pool.sample(250, random_state=42).values
right_normal[:15]

In [None]:
# Pool the left- and right-eye file names of each class and report the sizes.
cataract = np.concatenate([left_cataract, right_cataract])
normal = np.concatenate([left_normal, right_normal])
print(len(cataract), len(normal))

In [None]:
from tensorflow.keras.preprocessing.image import load_img,img_to_array
# Folder that create_dataset() joins each image file name onto.
dataset_dir = "/kaggle/input/ocular-disease-recognition-odir5k/preprocessed_images/"
# Side length, in pixels, that create_dataset() resizes every image to.
image_size=224
# NOTE(review): `labels` is not read anywhere later in the visible file.
labels = []
# Module-level list that create_dataset() appends [image, label] pairs to.
dataset = []
def create_dataset(image_category,label):
    """Load, resize and label a batch of images into the shared ``dataset``.

    Each file name is joined onto the module-level ``dataset_dir``, read with
    OpenCV, resized to ``image_size`` x ``image_size`` and appended to the
    module-level ``dataset`` list as ``[image_array, label_array]``. The whole
    list (including entries from earlier calls) is shuffled before returning.

    Args:
        image_category: Iterable of image file names.
        label: Class label stored with every image of this batch.

    Returns:
        The module-level ``dataset`` list (the same object on every call).

    Warns:
        RuntimeWarning: once per call, when one or more images could not be
        read or resized and were skipped.
    """
    import warnings  # local import keeps this block self-contained

    skipped = 0
    for img in tqdm(image_category):
        image_path = os.path.join(dataset_dir,img)
        try:
            image = cv2.imread(image_path,cv2.IMREAD_COLOR)
            if image is None:
                # cv2.imread signals an unreadable file by returning None.
                skipped += 1
                continue
            image = cv2.resize(image,(image_size,image_size))
        except cv2.error:
            # Only OpenCV failures are skipped; a bare `except:` would also
            # swallow KeyboardInterrupt and make the loop impossible to stop.
            skipped += 1
            continue

        dataset.append([np.array(image),np.array(label)])

    if skipped:
        warnings.warn(
            f"create_dataset skipped {skipped} unreadable image(s) for label {label!r}",
            RuntimeWarning,
            stacklevel=2,
        )
    random.shuffle(dataset)
    return dataset

# Load the cataract images (label 1) first, then the normal images (label 0);
# both calls extend the same shared list.
for _file_names, _class_id in ((cataract, 1), (normal, 0)):
    dataset = create_dataset(_file_names, _class_id)

# Total number of [image, label] pairs collected.
len(dataset)

In [None]:
# Show ten randomly chosen dataset images in a 2x5 grid, labelled by class.
plt.figure(figsize=(12,7))
for i in range(10):
    # The picked array is bound to `sample_image`, not `image`: rebinding the
    # module-level `image` would shadow the keras `image` module that
    # extract_image_features() relies on.
    sample = random.randrange(len(dataset))
    sample_image = dataset[sample][0]
    category = dataset[sample][1]
    label = "Normal" if category == 0 else "Cataract"
    plt.subplot(2,5,i+1)
    # The arrays come from cv2.imread (BGR channel order) while imshow
    # expects RGB, so convert before display to get true colours.
    plt.imshow(cv2.cvtColor(sample_image, cv2.COLOR_BGR2RGB))
    plt.xlabel(label)
plt.tight_layout()

In [None]:
from keras.applications import vgg19
from keras.models import Model
#from keras import optimizers
from scipy.optimize import fmin_l_bfgs_b
#from keras.applications.vgg19 import VGG19
#vgg19_weights = '../input/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5'
#vgg19 = VGG19(include_top = False, weights=vgg19_weights)
# Print the entries of the ../input directory.
input_entries = os.listdir("../input")
print(input_entries)

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping
from tensorflow.keras.layers import Flatten,Dense

# Stack the [image, label] pairs into an image tensor and a label vector.
x = np.array([i[0] for i in dataset]).reshape(-1,image_size,image_size,3)
y = np.array([i[1] for i in dataset])

# Hold out 20% as the test set. The split is seeded and stratified so it is
# reproducible and keeps the class ratio.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)

# Carve a validation set out of the training data. Checkpointing and early
# stopping monitor this set, so the test set is never used for model
# selection and the final test score stays unbiased.
x_fit,x_val,y_fit,y_val = train_test_split(x_train,y_train,test_size=0.2,random_state=42,stratify=y_train)

# VGG19 convolutional base, loaded from a local weights file
# (weights="imagenet" would fetch the weights by name instead).
# NOTE(review): images are fed as loaded by cv2, without
# vgg19.preprocess_input - confirm this is intended.
vgg = VGG19(weights='/kaggle/input/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5',include_top = False,input_shape=(image_size,image_size,3))

# Freeze the base so only the new classification head is trained.
for layer in vgg.layers:
    layer.trainable = False

# NOTE: this rebinds the module-level name `model`.
model = Sequential()
model.add(vgg)
model.add(Flatten())
model.add(Dense(1,activation="sigmoid"))

model.summary()

model.compile(optimizer="adam",loss="binary_crossentropy",metrics=["accuracy"])

# Save the best model by validation accuracy; stop after 5 epochs without
# improvement and restore the best weights, so the model evaluated below is
# the best one rather than whatever the last epoch produced.
checkpoint = ModelCheckpoint("vgg19.keras",monitor="val_accuracy",verbose=1,save_best_only=True,
                             save_weights_only=False,save_freq='epoch', mode='max')
earlystop = EarlyStopping(monitor="val_accuracy",patience=5,verbose=1,restore_best_weights=True)

history = model.fit(x_fit,y_fit,batch_size=32,epochs=15,validation_data=(x_val,y_val),
                    verbose=1,callbacks=[checkpoint,earlystop])

# Final score on the untouched test set.
loss,accuracy = model.evaluate(x_test,y_test)
print("loss:",loss)
print("Accuracy:",accuracy)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import numpy as np

# Score the trained network on the held-out images.
# The model emits one probability per image; values above the threshold are
# mapped to class 1 and the rest to class 0.
decision_threshold = 0.5
y_pred_probs = model.predict(x_test)
y_pred = (y_pred_probs > decision_threshold).astype("int32")

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Per-class precision / recall / F1
report = classification_report(y_test, y_pred)
print(report)

# Confusion matrix with raw counts and normalised values
from mlxtend.plotting import plot_confusion_matrix
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(conf_mat=cm, figsize=(8,7), class_names=["Normal", "Cataract"], show_normed=True)

In [None]:
import matplotlib.pyplot as plt

# Set style
plt.style.use("ggplot")

# Create figure. The curve is drawn on the whole figure: selecting only the
# left slot of a 1x2 grid here left the right half of the figure blank.
fig = plt.figure(figsize=(12, 6))

# Get number of epochs actually run from the training history
# (early stopping can end training before the requested epoch count).
epochs = range(1, len(history.history["accuracy"]) + 1)

# Plot training vs. validation accuracy per epoch
plt.plot(epochs, history.history["accuracy"], "go-")
plt.plot(epochs, history.history["val_accuracy"], "ro-")
plt.title("Model Accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend(["Train", "Validation"], loc="upper left")

plt.show()

In [None]:
# Plot training vs. validation loss per epoch on its own full-size figure.
# Selecting the right slot of a 1x2 grid on a fresh figure drew the curve in
# half of a default-size canvas and left the other half blank.
plt.figure(figsize=(12, 6))
plt.plot(epochs,history.history["loss"],"go-")
plt.plot(epochs,history.history["val_loss"],"ro-")
plt.title("Model Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend(["Train","val"],loc = "upper left")
plt.show()

In [None]:
# Show ten randomly chosen test images with their actual and predicted class.
plt.figure(figsize=(12,7))
for i in range(10):
    sample = random.randrange(len(x_test))
    # Bound to `sample_image`, not `image`: rebinding the module-level
    # `image` would shadow the keras `image` module that
    # extract_image_features() relies on.
    sample_image = x_test[sample]
    category = y_test[sample]
    pred_category = y_pred[sample]

    label = "Normal" if category == 0 else "Cataract"
    pred_label = "Normal" if pred_category == 0 else "Cataract"

    plt.subplot(2,5,i+1)
    # The arrays originate from cv2.imread (BGR channel order) while imshow
    # expects RGB, so convert before display to get true colours.
    plt.imshow(cv2.cvtColor(sample_image, cv2.COLOR_BGR2RGB))
    plt.xlabel("Actual:{}\nPrediction:{}".format(label,pred_label))
plt.tight_layout()

In [None]:
import numpy as np
import pandas as pd
import cv2
import random
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier
import xgboost as xgb

# Preprocessing steps (image loading and dataset creation as in previous code)

# Read the metadata table from the Kaggle input directory.
metadata_csv = "/kaggle/input/ocular-disease-recognition-odir5k/full_df.csv"
df = pd.read_csv(metadata_csv)

# Function to detect cataract presence
def has_cataract(text):
    """Return 1 if the diagnostic keyword text mentions "cataract", else 0.

    Args:
        text: Diagnostic keyword string for one eye. Non-string values (for
            example the NaN pandas uses for an empty cell, or None) are
            treated as "no cataract" instead of raising TypeError.

    Returns:
        1 when "cataract" occurs anywhere in the text, ignoring case;
        0 otherwise.
    """
    if not isinstance(text, str):
        return 0
    return int("cataract" in text.casefold())

# Flag, per eye, whether the diagnostic keywords mention a cataract.
df["left_cataract"] = df["Left-Diagnostic Keywords"].apply(has_cataract)
df["right_cataract"] = df["Right-Diagnostic Keywords"].apply(has_cataract)

# Cataract images: rows with C == 1 whose keywords for that eye mention it.
row_has_c = df.C == 1
left_cataract = df.loc[row_has_c & (df.left_cataract == 1), "Left-Fundus"].values
right_cataract = df.loc[row_has_c & (df.right_cataract == 1), "Right-Fundus"].values

# Normal images: rows with C == 0 whose keywords for that eye are exactly
# "normal fundus"; 250 file names are sampled per eye with a fixed seed.
row_no_c = df.C == 0
left_normal_pool = df.loc[row_no_c & (df["Left-Diagnostic Keywords"] == "normal fundus"), "Left-Fundus"]
right_normal_pool = df.loc[row_no_c & (df["Right-Diagnostic Keywords"] == "normal fundus"), "Right-Fundus"]
left_normal = left_normal_pool.sample(250, random_state=42).values
right_normal = right_normal_pool.sample(250, random_state=42).values

# Pool both eyes per class.
cataract = np.concatenate([left_cataract, right_cataract])
normal = np.concatenate([left_normal, right_normal])

# Image folder, target side length in pixels, and a fresh (empty) dataset list.
dataset_dir = "/kaggle/input/ocular-disease-recognition-odir5k/preprocessed_images/"
image_size = 224
dataset = []

# Function to create dataset
def create_dataset(image_category, label):
    """Load, resize and label a batch of images into the shared ``dataset``.

    Each file name is joined onto the module-level ``dataset_dir``, read with
    OpenCV, resized to ``image_size`` x ``image_size`` and appended to the
    module-level ``dataset`` list as ``[image_array, label_array]``. The whole
    list (including entries from earlier calls) is shuffled before returning.

    Args:
        image_category: Iterable of image file names.
        label: Class label stored with every image of this batch.

    Returns:
        The module-level ``dataset`` list (the same object on every call).

    Warns:
        RuntimeWarning: once per call, when one or more images could not be
        read or resized and were skipped.
    """
    import warnings  # local import keeps this block self-contained

    skipped = 0
    for img in tqdm(image_category):
        image_path = os.path.join(dataset_dir, img)
        try:
            image = cv2.imread(image_path, cv2.IMREAD_COLOR)
            if image is None:
                # cv2.imread signals an unreadable file by returning None.
                skipped += 1
                continue
            image = cv2.resize(image, (image_size, image_size))
        except cv2.error:
            # Only OpenCV failures are skipped; a bare `except:` would also
            # swallow KeyboardInterrupt and make the loop impossible to stop.
            skipped += 1
            continue
        dataset.append([np.array(image), np.array(label)])

    if skipped:
        warnings.warn(
            f"create_dataset skipped {skipped} unreadable image(s) for label {label!r}",
            RuntimeWarning,
            stacklevel=2,
        )
    random.shuffle(dataset)
    return dataset

# Load the cataract images (label 1) first, then the normal images (label 0);
# both calls extend the same shared list.
for _file_names, _class_id in ((cataract, 1), (normal, 0)):
    dataset = create_dataset(_file_names, _class_id)

# Split the [image, label] pairs into an image tensor and a label vector.
x = np.array([pair[0] for pair in dataset]).reshape(-1, image_size, image_size, 3)
y = np.array([pair[1] for pair in dataset])

# The classical estimators need one flat feature row per image.
x_flattened = x.reshape(len(x), -1)

# Seeded 80/20 train-test split.
x_train, x_test, y_train, y_test = train_test_split(
    x_flattened,
    y,
    test_size=0.2,
    random_state=42,
)

# Estimators to compare, keyed by display name (insertion order is the
# order in which they will be trained).
classifiers = {}
classifiers["SVM"] = SVC(kernel='linear', probability=True)
classifiers["Decision Tree"] = DecisionTreeClassifier()
classifiers["Naive Bayes"] = GaussianNB()
classifiers["CatBoost"] = CatBoostClassifier(iterations=100, learning_rate=0.05, depth=5, verbose=0)
classifiers["XGBoost"] = xgb.XGBClassifier(n_estimators=100, learning_rate=0.05, max_depth=5)

# One initially empty result column per reported quantity.
metrics = {
    column: []
    for column in ("Model", "Accuracy", "Precision", "Recall", "F1-Score")
}

In [None]:
# Train, predict, and evaluate each classical model.
# The loop variable is `clf`, not `model`: iterating with `model` would
# overwrite the module-level `model` defined by earlier cells.
for model_name, clf in classifiers.items():
    print(f"\nTraining {model_name}...")
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Calculate metrics on the held-out split
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Append one row of results for the comparison table
    metrics["Model"].append(model_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)
    metrics["F1-Score"].append(f1)

    # Display classification report
    print(f"{model_name} Classification Report:\n")
    print(classification_report(y_test, y_pred))

    # Plot confusion matrix (raw counts plus normalised values)
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plot_confusion_matrix(conf_mat=cm, figsize=(6, 6), class_names=["Normal", "Cataract"], show_normed=True)
    plt.title(f"{model_name} Confusion Matrix")
    plt.show()

# Convert metrics to DataFrame for comparison
metrics_df = pd.DataFrame(metrics)
print("\nComparative Analysis of Models:\n")
print(metrics_df)

# Plot the metrics side by side, one bar group per model
metrics_df.set_index("Model", inplace=True)
metrics_df.plot(kind="bar", figsize=(12, 8))
plt.title("Model Performance Comparison")
plt.ylabel("Score")
plt.xticks(rotation=45)
plt.legend(loc="upper right")
plt.show()