<a href="https://colab.research.google.com/github/arushi2601/Predictive_Analytics/blob/main/PA_HW3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Step 1: Preprocessing

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from scipy import stats

# Load dataset
data = pd.read_csv('Breast_Cancer_dataset.csv')

# 'Survival Months' is the target of this cell and 'Status' is the other outcome
# column; the feature-selection cell drops both from its feature matrix. Neither
# may be part of the scaled/PCA features, otherwise X would contain the answer.
TARGET_COLUMN = 'Survival Months'
OUTCOME_COLUMNS = ['Status', 'Survival Months']

# Separate numeric and categorical columns
numeric_data = data.select_dtypes(include=[np.number])
nonnumeric_data = data.select_dtypes(exclude=[np.number])

# Step 1: Handle Missing Values
# Impute numeric columns with median
imputer_missing_numeric = SimpleImputer(strategy="median")
numeric_data_imputed = pd.DataFrame(imputer_missing_numeric.fit_transform(numeric_data), columns=numeric_data.columns)

# Impute categorical columns with most frequent value
imputer_missing_nonnumeric = SimpleImputer(strategy="most_frequent")
nonnumeric_data_imputed = pd.DataFrame(imputer_missing_nonnumeric.fit_transform(nonnumeric_data), columns=nonnumeric_data.columns)

# Replace every category by an integer code (one independent encoder per column)
for column in nonnumeric_data_imputed.columns:
    nonnumeric_data_imputed[column] = LabelEncoder().fit_transform(nonnumeric_data_imputed[column])

# Merge numeric and categorical data back
data_imputed = pd.concat([numeric_data_imputed, nonnumeric_data_imputed], axis=1)

# Step 2: Outlier Detection and Removal
# Z-scores are computed on the originally numeric columns only: the integer codes
# written by LabelEncoder are arbitrary ids, so their distance from the mean is
# meaningless and would discard rows just for belonging to a rare category.
z_scores = np.abs(stats.zscore(data_imputed[numeric_data.columns]))
threshold = 3
# reset_index keeps the surviving rows label-aligned with the scaler/PCA output,
# which is rebuilt below with a fresh 0..n-1 index.
data_no_outliers = data_imputed[(z_scores < threshold).all(axis=1)].reset_index(drop=True)

# Step 3: Feature Scaling (features only -- outcome columns are excluded)
features_no_outliers = data_no_outliers.drop(columns=OUTCOME_COLUMNS)
scaler = StandardScaler()
data_standardized = pd.DataFrame(scaler.fit_transform(features_no_outliers), columns=features_no_outliers.columns)

# Step 4: Dimensionality Reduction
# A float n_components keeps as many components as needed to explain 95% of the variance.
pca = PCA(n_components=0.95)
data_reduced = pd.DataFrame(pca.fit_transform(data_standardized))

# Split data into features and target variable (row i of X belongs to row i of y)
X = data_reduced
y = data_no_outliers[TARGET_COLUMN]


In [4]:
# Step 2: Feature selection and baseline comparison of five classifiers

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from collections import defaultdict

# Load the dataset
# `data` is the raw DataFrame read from 'Breast_Cancer_dataset.csv' in the preprocessing cell.

# Preprocessing and encoding
data_cleaned = data.dropna()  # Drop rows with missing values
X = data_cleaned.drop(columns=['Status', 'Survival Months'])
y = data_cleaned['Status']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
X_encoded = pd.get_dummies(X, drop_first=True)

# Split data into training and test sets BEFORE selecting features, so that the
# test labels never influence which columns are kept. The seed and row count are
# unchanged, so the row partition is the same as when splitting afterwards.
X_train_full, X_test_full, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

# Feature selection (top 10 features by ANOVA F-score), fitted on the training rows only.
# k is capped so the cell still runs if fewer than 10 encoded columns exist.
selector = SelectKBest(f_classif, k=min(10, X_encoded.shape[1]))
X_train = selector.fit_transform(X_train_full, y_train)
X_test = selector.transform(X_test_full)
# All rows reduced to the selected columns (kept for inspection / later reuse).
X_selected = selector.transform(X_encoded)
selected_feature_names = X_encoded.columns[selector.get_support()]

# Per-model metrics, filled in as model_performance[model_name][metric_name]
model_performance = defaultdict(dict)

# 1: K-Nearest Neighbors (KNN)
class KNNClassifier:
    """Minimal k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Labels must be non-negative integers (e.g. LabelEncoder output) because the
    vote is tallied with ``np.bincount``.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples that vote on each prediction.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k=3):
        if k < 1:
            # k == 0 would vote over an empty set; a negative k would silently
            # slice "all but the last |k|" neighbours.
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X_train, y_train):
        """Memorise the training samples; no model is actually estimated.

        Inputs are converted to NumPy arrays so that neighbours are always looked
        up by position: a pandas Series with a non-default index would otherwise
        be indexed by label, and boolean features could not be subtracted.
        """
        self.X_train = np.asarray(X_train, dtype=float)
        self.y_train = np.asarray(y_train)
        if len(self.X_train) != len(self.y_train):
            raise ValueError(
                f"X_train has {len(self.X_train)} rows but y_train has {len(self.y_train)} labels"
            )

    def predict(self, X_test):
        """Return a list with the majority label of the k nearest training rows for each row of X_test."""
        predictions = []
        # np.asarray guarantees row-wise iteration (iterating a DataFrame yields column names).
        for x in np.asarray(X_test, dtype=float):
            distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
            k_indices = np.argsort(distances)[:self.k]
            # bincount counts each label; argmax breaks ties in favour of the smallest label.
            predictions.append(np.bincount(self.y_train[k_indices]).argmax())
        return predictions


def evaluate_model(name, model, X_tr, y_tr, X_te, y_te):
    """Fit `model` on the training split and score it on the test split.

    The accuracy and the text classification report are stored under
    ``model_performance[name]`` (module-level dict); class names for the report
    come from the module-level ``label_encoder``.

    Returns the test-set predictions produced by ``model.predict``.
    """
    model.fit(X_tr, y_tr)
    predictions = model.predict(X_te)
    model_performance[name]["accuracy"] = accuracy_score(y_te, predictions)
    model_performance[name]["classification_report"] = classification_report(
        y_te, predictions, target_names=label_encoder.classes_
    )
    return predictions


knn = KNNClassifier(k=3)
knn_predictions = evaluate_model("KNN", knn, X_train, y_train, X_test, y_test)
#Pros: Easy to understand and implement; good for smaller datasets or those with simple structures.
#Cons: Computationally expensive on large datasets; sensitive to feature scaling and irrelevant features.
#Main Hyperparameters:
#k: Number of nearest neighbors to consider (typically chosen through cross-validation).
#distance_metric: Often Euclidean, though Manhattan distance is also common
#  (the KNNClassifier used here only exposes k).

# 2: Naive Bayes
nb = GaussianNB()
nb_predictions = evaluate_model("Naive Bayes", nb, X_train, y_train, X_test, y_test)

#Pros: Fast, efficient, and performs well on smaller datasets; works well with high-dimensional data.
#Cons: Assumes feature independence, which is rarely true in practice; less effective with complex data relationships.
#Main Hyperparameters:
#var_smoothing: Portion of the largest feature variance that is added to all variances for numerical stability.

# 3: Decision Tree
# NOTE(review): originally labelled "C4.5". scikit-learn's DecisionTreeClassifier
# is an optimised CART implementation, not C4.5; passing criterion="entropy"
# would give information-gain splits, the closest option scikit-learn offers.
dt = DecisionTreeClassifier(random_state=42)
dt_predictions = evaluate_model("Decision Tree", dt, X_train, y_train, X_test, y_test)
#Pros: Simple to interpret, especially with small trees; captures non-linear relationships well.
#Cons: Prone to overfitting without pruning; sensitive to small changes in data.
#Main Hyperparameters:
#max_depth: Maximum depth of the tree (limits tree size to prevent overfitting).
#min_samples_split: Minimum number of samples required to split a node.
#criterion: Function to measure split quality (e.g., Gini or entropy).


# 4: Random Forest
rf = RandomForestClassifier(random_state=42)
rf_predictions = evaluate_model("Random Forest", rf, X_train, y_train, X_test, y_test)
#Pros: Resistant to overfitting; can capture complex interactions between features.
#Cons: Less interpretable due to the large number of trees; slower training and prediction times compared to single models.
#Main Hyperparameters:
#n_estimators: Number of decision trees in the ensemble.
#max_depth: Maximum depth of each individual tree.
#min_samples_split: Minimum samples required to split an internal node.

# 5: Gradient Boosting
gb = GradientBoostingClassifier(random_state=42)
gb_predictions = evaluate_model("Gradient Boosting", gb, X_train, y_train, X_test, y_test)
#Pros: High performance, particularly on structured/tabular data; capable of capturing complex relationships.
#Cons: Prone to overfitting with insufficient regularization; requires careful tuning of hyperparameters.
#Main Hyperparameters:
#n_estimators: Number of trees added to the model (more trees generally improve performance up to a point).
#learning_rate: Shrinks the contribution of each tree; it trades off against n_estimators (smaller values usually need more trees).
#max_depth: Limits depth of each tree, reducing model complexity.

# Print the stored accuracy and classification report of every evaluated model,
# in the order the models were added to model_performance.
for model_name, metrics in model_performance.items():
    print(
        f"\nModel: {model_name}",
        f"Accuracy: {metrics['accuracy']}",
        f"Classification Report:\n{metrics['classification_report']}",
        sep="\n",
    )




Model: KNN
Accuracy: 0.8253275109170306
Classification Report:
              precision    recall  f1-score   support

       Alive       0.85      0.96      0.90       385
        Dead       0.37      0.14      0.20        73

    accuracy                           0.83       458
   macro avg       0.61      0.55      0.55       458
weighted avg       0.78      0.83      0.79       458


Model: Naive Bayes
Accuracy: 0.8187772925764192
Classification Report:
              precision    recall  f1-score   support

       Alive       0.90      0.89      0.89       385
        Dead       0.43      0.45      0.44        73

    accuracy                           0.82       458
   macro avg       0.66      0.67      0.67       458
weighted avg       0.82      0.82      0.82       458


Model: Decision Tree
Accuracy: 0.7969432314410481
Classification Report:
              precision    recall  f1-score   support

       Alive       0.86      0.90      0.88       385
        Dead       0.31    

In [5]:
#Step 3: Hyperparameter tuning for Random Forest and Gradient Boosting
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Candidate values explored for each ensemble (every combination is tried)
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)
gb_param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
)

# NOTE(review): both searches optimise plain accuracy; the baseline reports show
# 385 "Alive" vs 73 "Dead" test rows, so a class-balanced metric may be worth
# considering -- confirm against the assignment requirements.

# Random Forest: exhaustive 5-fold cross-validated search on the training split
rf = RandomForestClassifier(random_state=42)
rf_grid_search = GridSearchCV(rf, rf_param_grid, scoring='accuracy', n_jobs=-1, cv=5)
rf_grid_search.fit(X_train, y_train)
# Winning combination and its mean cross-validated accuracy
rf_best_params, rf_best_score = rf_grid_search.best_params_, rf_grid_search.best_score_

# Gradient Boosting: same search protocol with its own grid
gb = GradientBoostingClassifier(random_state=42)
gb_grid_search = GridSearchCV(gb, gb_param_grid, scoring='accuracy', n_jobs=-1, cv=5)
gb_grid_search.fit(X_train, y_train)
gb_best_params, gb_best_score = gb_grid_search.best_params_, gb_grid_search.best_score_

# Report the selected settings and scores for both models
print("Best Random Forest Parameters:", rf_best_params)
print("Best Random Forest Cross-Validation Accuracy:", rf_best_score)

print("Best Gradient Boosting Parameters:", gb_best_params)
print("Best Gradient Boosting Cross-Validation Accuracy:", gb_best_score)


Best Random Forest Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}
Best Random Forest Cross-Validation Accuracy: 0.8476228763717039
Best Gradient Boosting Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}
Best Gradient Boosting Cross-Validation Accuracy: 0.8443486547252126


In [15]:
#Step 4: Results

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Refit each ensemble on the training split with the hyperparameters chosen in
# step 3, then score it once on the held-out test split.

# Random Forest
best_rf = RandomForestClassifier(random_state=42, **rf_best_params)
best_rf.fit(X_train, y_train)
rf_best_predictions = best_rf.predict(X_test)
rf_best_accuracy = accuracy_score(y_true=y_test, y_pred=rf_best_predictions)
rf_conf_matrix = confusion_matrix(y_true=y_test, y_pred=rf_best_predictions)
rf_class_report = classification_report(
    y_true=y_test,
    y_pred=rf_best_predictions,
    target_names=label_encoder.classes_,
)

# Gradient Boosting
best_gb = GradientBoostingClassifier(random_state=42, **gb_best_params)
best_gb.fit(X_train, y_train)
gb_best_predictions = best_gb.predict(X_test)
gb_best_accuracy = accuracy_score(y_true=y_test, y_pred=gb_best_predictions)
gb_conf_matrix = confusion_matrix(y_true=y_test, y_pred=gb_best_predictions)
gb_class_report = classification_report(
    y_true=y_test,
    y_pred=gb_best_predictions,
    target_names=label_encoder.classes_,
)

# Headline accuracies
print(
    "\nFinal Model Performance Summary:",
    f"Random Forest Best Accuracy: {rf_best_accuracy}",
    f"Gradient Boosting Best Accuracy: {gb_best_accuracy}",
    sep="\n",
)

# Per-model detail: confusion matrix (rows = true class, columns = predicted class)
# followed by the precision/recall report
print("\nConfusion Matrix for Random Forest:", rf_conf_matrix, sep="\n")
print("\nClassification Report for Random Forest:", rf_class_report, sep="\n")

print("\nConfusion Matrix for Gradient Boosting:", gb_conf_matrix, sep="\n")
print("\nClassification Report for Gradient Boosting:", gb_class_report, sep="\n")

# Feature importances, labelled with the selected column names, largest first
print("\nImportant features used from Random Forest:")
rf_importance_by_feature = pd.Series(data=best_rf.feature_importances_, index=selected_feature_names)
feature_importances_rf = rf_importance_by_feature.sort_values(ascending=False)
print(feature_importances_rf)

print("\nImportant Features used from Gradient Boosting:")
gb_importance_by_feature = pd.Series(data=best_gb.feature_importances_, index=selected_feature_names)
feature_importances_gb = gb_importance_by_feature.sort_values(ascending=False)
print(feature_importances_gb)




Final Model Performance Summary:
Random Forest Best Accuracy: 0.8384279475982532
Gradient Boosting Best Accuracy: 0.851528384279476

Confusion Matrix for Random Forest:
[[374  11]
 [ 63  10]]

Classification Report for Random Forest:
              precision    recall  f1-score   support

       Alive       0.86      0.97      0.91       385
        Dead       0.48      0.14      0.21        73

    accuracy                           0.84       458
   macro avg       0.67      0.55      0.56       458
weighted avg       0.80      0.84      0.80       458


Confusion Matrix for Gradient Boosting:
[[384   1]
 [ 67   6]]

Classification Report for Gradient Boosting:
              precision    recall  f1-score   support

       Alive       0.85      1.00      0.92       385
        Dead       0.86      0.08      0.15        73

    accuracy                           0.85       458
   macro avg       0.85      0.54      0.53       458
weighted avg       0.85      0.85      0.80       458


