<a href="https://colab.research.google.com/github/boteny02/prostate_me/blob/main/Copy_of_Prostate_cancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PSA DATA FOR PROSTATE CANCER PREDICTION

# Importing Necessary libraries

In [None]:
import pandas as pd
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay # Import ConfusionMatrixDisplay
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Loading Dataset From Drive

In [None]:
# Location of the PSA lab-results workbook inside the mounted Google Drive.
Path = "/content/drive/MyDrive/Prostate cancer/extracted Values of Lab Test Results PSA.xlsx"

In [None]:
from google.colab import drive
# Attach Google Drive so the workbook referenced by `Path` becomes readable.
drive.mount("/content/drive")

In [None]:
# Load the PSA workbook into a DataFrame.
df = pd.read_excel(Path)

# Preview the first rows. Ending the cell on the expression (instead of print)
# lets the notebook render the frame as a table rather than wrapped plain text.
df.head()

In [None]:
# Show the column labels of the loaded sheet; later cells reference
# 'Age', 'Test Value' and 'With In Normal' by name.
df.columns


In [None]:
# A notebook cell only echoes its LAST expression, so a bare `df.describe()`
# followed by another expression is computed and then thrown away.
# `display` (provided by the IPython kernel) renders the summary explicitly.
display(df.describe())

# Inspect the 'Age' column.
df['Age']

In [None]:
from itertools import count
# Convert the 'Test Value' column to numbers.
#
# Lab results can be reported as censored strings such as '>100' or '<0.1'.
# Passing those straight to pd.to_numeric(errors='coerce') turns them into NaN,
# and the extreme readings would then be mean-imputed later on. Instead, strip
# the comparison prefix and keep the numeric bound (e.g. '>100' -> 100.0).
def _strip_censoring(value):
    """Remove leading '<', '>' and '=' markers from a string lab result.

    Non-string values (numbers, NaN) are returned unchanged.
    """
    if isinstance(value, str):
        return value.strip().lstrip('<>=').strip()
    return value


# Anything that still cannot be parsed becomes NaN here; a later cell fills
# NaN with the column mean.
df['Test Value'] = pd.to_numeric(df['Test Value'].map(_strip_censoring), errors='coerce')

# Rows whose PSA value is strictly above 100.
xdf = df[df['Test Value'] > 100]

print(df.head(10))


In [None]:
# Replace NaN in 'Test Value' with the column mean.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: in-place mutation through `df[col]` is chained assignment,
# which raises a FutureWarning in pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is enabled.
df['Test Value'] = df['Test Value'].fillna(df['Test Value'].mean())


In [None]:
# Features: every column except the label column "With In Normal".
X = df.drop("With In Normal", axis=1)
# Target: 1 where the label is exactly "Yes", 0 for any other value.
y = (df["With In Normal"] == "Yes").astype("int64")

In [None]:
# Min-Max scale the feature matrix; the scaler is fitted on all rows of X.
scaler = MinMaxScaler()
X_scaled = scaler.fit(X).transform(X)

# Train Test Split of the Dataset

In [None]:
# Split the raw features into training and test sets (80/20, fixed seed).
# The split is done BEFORE scaling so that the scaler's min/max come from the
# training rows only; fitting it on the full dataset would leak information
# about the test set into the features the models are trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the Min-Max scaler on the training rows, then apply the same mapping to
# the held-out rows (test values may fall slightly outside the training range).
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Definition of the Base Models and the Meta-Model

In [None]:
# Level-0 learners of the stack: a random forest and an XGBoost classifier,
# both seeded for reproducibility.
rf_estimator = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_estimator = XGBClassifier(eval_metric='logloss', random_state=42)
base_models = [('rf', rf_estimator), ('xgb', xgb_estimator)]

# Level-1 learner that combines the base models' outputs.
meta_model = LogisticRegression()

# Training the Stacked Model

In [None]:
# Build the stacking ensemble (10-fold CV for the meta-model's training
# features) and fit it on the training split.
stacking_options = dict(estimators=base_models, final_estimator=meta_model, cv=10)
stacked_model = StackingClassifier(**stacking_options)
stacked_model.fit(X_train, y_train)

# Evaluation of Each Base Model

In [None]:
# Evaluate each base model separately on the held-out test split.
for name, model in base_models:
    # Fit this estimator directly; the instances listed in base_models are
    # fitted here so the cell does not depend on any other cell having done so.
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    acc = accuracy_score(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    print("Confusion Matrix:")
    print(cm)
    print(f"Model {name} Accuracy: {acc*100:.2f}%")

    # Plot the matrix and title it with the model name; without a title the
    # per-model figures are indistinguishable in the cell output.
    dis = ConfusionMatrixDisplay(confusion_matrix=cm)
    dis.plot(cmap="Accent")
    dis.ax_.set_title(f"Confusion matrix: {name}")

In [None]:
# prompt: classification report

import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

# Evaluate each base model separately: accuracy, confusion matrix and the
# per-class precision / recall / F1 report on the held-out test split.
for name, model in base_models:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    acc = accuracy_score(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    print("Confusion Matrix:")
    print(cm)
    print(f"Model {name} Accuracy: {acc*100:.2f}%")
    print(classification_report(y_test, preds))

    # Title the figure with the model name so the plots can be told apart, and
    # show it immediately so it appears right after this model's printed
    # metrics instead of all figures being flushed at the end of the cell.
    dis = ConfusionMatrixDisplay(confusion_matrix=cm)
    dis.plot(cmap="Accent")
    dis.ax_.set_title(f"Confusion matrix: {name}")
    plt.show()


# Evaluation of the Stacked Model

In [None]:
# Predict the held-out labels with the fitted stacking ensemble.
y_pred = stacked_model.predict(X_test)

# Share of test rows the ensemble labelled correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Stacked Ensemble Model Accuracy: {accuracy*100:.2f}%")

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the stacked model's test-set predictions
# (rows: true labels, columns: predicted labels).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:", cm, sep="\n")

# Render the matrix as a heat map.
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap="Blues")

In [None]:
# prompt: averaging

# Mean test-set accuracy across the base models.
base_model_accuracies = []
for name, model in base_models:
    # fit() returns the estimator itself, so fitting and predicting can be chained.
    preds = model.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, preds)
    base_model_accuracies += [acc]

average_accuracy = sum(base_model_accuracies) / len(base_model_accuracies)
print(f"Average accuracy of base models: {average_accuracy*100:.2f}%")


In [None]:
# prompt: majority voting

import numpy as np

def majority_voting(y_pred_rf, y_pred_xgb, y_pred_stacked):
  """
  Performs majority voting among three prediction arrays.

  Each sample's result is the label predicted by most of the three models.
  Labels may be of any sortable type (non-negative or negative integers,
  strings, ...). When all three models disagree, the smallest label wins.

  Args:
      y_pred_rf: Predictions from the Random Forest model.
      y_pred_xgb: Predictions from the XGBoost model.
      y_pred_stacked: Predictions from the stacked model.
      All three must be 1-D sequences of the same length.

  Returns:
      A NumPy array with one majority-vote label per sample (empty if the
      inputs are empty).
  """

  # One row per sample, one column per model.
  votes = np.column_stack((y_pred_rf, y_pred_xgb, y_pred_stacked))

  # Nothing to vote on: argmax over zero rows would raise.
  if votes.shape[0] == 0:
    return np.empty(0, dtype=votes.dtype)

  # Map every label to its index in the sorted list of distinct labels, so the
  # tally works for arbitrary label values (np.bincount only accepts
  # non-negative integers). The reshape covers NumPy versions that return the
  # inverse indices flattened.
  labels, codes = np.unique(votes, return_inverse=True)
  codes = np.reshape(codes, votes.shape)

  # tallies[i, k] = how many of the three models predicted labels[k] for sample i.
  tallies = (codes[:, :, np.newaxis] == np.arange(labels.size)).sum(axis=1)

  # argmax returns the first maximum, i.e. the smallest label on ties.
  return labels[tallies.argmax(axis=1)]


# Collect test-set predictions from each voter.
# The Random Forest and XGBoost votes come from the base estimators that
# StackingClassifier fitted on the training data (exposed by name through
# `named_estimators_`). Using `stacked_model.predict` for all three inputs
# would make the "vote" identical to the stacked model's own prediction.
fitted_base_models = stacked_model.named_estimators_
y_pred_rf = fitted_base_models['rf'].predict(X_test)
y_pred_xgb = fitted_base_models['xgb'].predict(X_test)
y_pred_stacked = stacked_model.predict(X_test)


# Perform majority voting
majority_predictions = majority_voting(y_pred_rf, y_pred_xgb, y_pred_stacked)

# Evaluate the majority voting results
accuracy = accuracy_score(y_test, majority_predictions)
print(f"Majority Voting Accuracy: {accuracy*100:.2f}%")

cm = confusion_matrix(y_test, majority_predictions)
print("Confusion Matrix:")
print(cm)

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap="Blues")

