https://docs.google.com/document/d/1ebHbuh1LYHeSom_eWGgCTAFnttJSVFRkqNXKVwiXh7Q/edit?usp=sharing

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from collections import Counter

In [2]:
class KNNClassifier:
    def __init__(self, k=3, weighted=True, distance_metric='euclidean'):
        """
        Initialize KNN classifier
        
        Parameters:
        -----------
        k : int
            Number of neighbors to consider
        distance_metric : str
            Distance metric to use ('euclidean' or 'manhattan')
        """
        self.k = k
        self.weighted = weighted
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None
    
    def fit(self, X, y):
        """
        Fit the model using training data
        
        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training data
        y : array-like of shape (n_samples,)
            Target values
        """
        self.X_train = np.array(X)
        self.y_train = np.array(y)
        
    def _calculate_distance(self, x1, x2):
        """
        Calculate distance between two points
        
        Parameters:
        -----------
        x1, x2 : array-like
            Points to calculate distance between
        
        Returns:
        --------
        float : Distance between points
        """
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((x1 - x2) ** 2))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(x1 - x2))
        else:
            raise ValueError("Unsupported distance metric")
    
    def predict(self, X):
        """
        Predict class labels for samples in X
        
        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict
        
        Returns:
        --------
        array : Predicted class labels
        """
        X = np.array(X)
        predictions = []
        
        for x in X:
            # Calculate distances to all training samples
            distances = [self._calculate_distance(x, x_train) 
                        for x_train in self.X_train]
            
            # Get indices of k nearest neighbors
            k_nearest_indices = np.argsort(distances)[:self.k]
            
            # Get labels of k nearest neighbors
            k_nearest_labels = self.y_train[k_nearest_indices]
            
            # Majority vote
            most_common = Counter(k_nearest_labels).most_common(1)
            predictions.append(most_common[0][0])
            
        return np.array(predictions)
    
    def score(self, X, y):
        """
        Calculate accuracy score
        
        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Test samples
        y : array-like of shape (n_samples,)
            True labels for X
        
        Returns:
        --------
        float : Accuracy score
        """
        predictions = self.predict(X)
        return np.mean(predictions == y)

In [3]:
def preprocess_data(df):
    # Create target variable (1 for Dead, 0 for Alive)
    df['Target'] = (df['Status'] == 'Dead').astype(int)
    
    # Select relevant features
    features = ['Age', 'Race', 'Marital Status', 'T Stage ', 'N Stage', 
                'Grade', 'A Stage', 'Tumor Size', 'Estrogen Status', 
                'Progesterone Status', 'Regional Node Examined', 
                'Reginol Node Positive', 'Survival Months']
    
    # Handle missing values
    numeric_features = ['Age', 'Tumor Size', 'Regional Node Examined', 
                       'Reginol Node Positive', 'Survival Months']
    categorical_features = ['Race', 'Marital Status', 'T Stage ', 'N Stage', 
                          'Grade', 'A Stage', 'Estrogen Status', 'Progesterone Status']
    
    # Create imputers
    num_imputer = SimpleImputer(strategy='median')
    cat_imputer = SimpleImputer(strategy='most_frequent')
    
    # Apply imputation
    df[numeric_features] = num_imputer.fit_transform(df[numeric_features])
    df[categorical_features] = cat_imputer.fit_transform(df[categorical_features])
    
    # Encode categorical variables
    le = LabelEncoder()
    for col in categorical_features:
        df[col] = le.fit_transform(df[col])
    
    # Scale numeric features
    scaler = StandardScaler()
    df[numeric_features] = scaler.fit_transform(df[numeric_features])
    
    X = df[features]
    y = df['Target']
    
    # Apply PCA since we have many features after encoding
    pca = PCA(n_components=0.95)  # Keep 95% of variance
    X_pca = pca.fit_transform(X)
    
    print(f"Reduced dimensions from {X.shape[1]} to {X_pca.shape[1]} features")
    print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
    
    return X_pca, y

In [4]:
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    results = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred)
    }
    
    return results

In [5]:
def main():
    # Load data
    df = pd.read_csv('Breast_Cancer_dataset.csv')
    
    # Preprocess data
    X, y = preprocess_data(df)
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize models
    models = {
        'Naive Bayes': (GaussianNB(), "Simple probabilistic classifier based on Bayes' theorem"),
        'Decision Tree': (DecisionTreeClassifier(random_state=42), "Tree-based model with clear decision paths"),
        'Random Forest': (RandomForestClassifier(random_state=42), "Ensemble of decision trees"),
        'Gradient Boosting': (GradientBoostingClassifier(random_state=42), "Sequential ensemble learning"),
        'Neural Network': (MLPClassifier(random_state=42), "Multi-layer perceptron neural network"),
        'KNN': (KNNClassifier(k=5, weighted=True), "Custom k-Nearest Neighbors with distance weighting")
    }
    
    # Train and evaluate models
    results = []
    for name, (model, description) in models.items():
        result = train_and_evaluate_model(model, X_train, X_test, y_train, y_test, name)
        results.append(result)
    
    # Hyperparameter tuning for Random Forest, Neural Network, and KNN
    rf_params = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10]
    }
    
    nn_params = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01]
    }
    
    knn_params = {
        'k': [3, 5, 7, 9],
        'weighted': [True, False]
    }
    
    # Perform hyperparameter tuning
    rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5)
    nn_grid = RandomizedSearchCV(MLPClassifier(random_state=42), nn_params, n_iter=10, cv=5)
    
    # Manual grid search for KNN since it's a custom class
    best_knn_score = 0
    best_knn_params = {}
    
    for k in knn_params['k']:
        for weighted in knn_params['weighted']:
            knn = KNNClassifier(k=k, weighted=weighted)
            knn.fit(X_train, y_train)
            y_pred = knn.predict(X_test)
            score = accuracy_score(y_test, y_pred)
            
            if score > best_knn_score:
                best_knn_score = score
                best_knn_params = {'k': k, 'weighted': weighted}
    
    rf_grid.fit(X_train, y_train)
    nn_grid.fit(X_train, y_train)
    
    # Print results
    print("\nModel Performance:")
    results_df = pd.DataFrame(results)
    print(results_df.round(3))
    
    print("\nBest Random Forest Parameters:")
    print(rf_grid.best_params_)
    print(f"Best Random Forest Score: {rf_grid.best_score_:.3f}")
    
    print("\nBest Neural Network Parameters:")
    print(nn_grid.best_params_)
    print(f"Best Neural Network Score: {nn_grid.best_score_:.3f}")
    
    print("\nBest KNN Parameters:")
    print(best_knn_params)
    print(f"Best KNN Score: {best_knn_score:.3f}")

In [6]:
if __name__ == "__main__":
    main()

Reduced dimensions from 13 to 9 features
Explained variance ratio: [0.24694035 0.14706519 0.13474281 0.12199825 0.11927216 0.08462663
 0.05003909 0.03537286 0.02136303]





Model Performance:
               Model  Accuracy  Precision  Recall  F1 Score
0        Naive Bayes     0.873      0.612   0.408     0.490
1      Decision Tree     0.861      0.536   0.500     0.517
2      Random Forest     0.899      0.810   0.425     0.557
3  Gradient Boosting     0.901      0.794   0.450     0.574
4     Neural Network     0.896      0.790   0.408     0.538
5                KNN     0.882      0.698   0.367     0.481

Best Random Forest Parameters:
{'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 200}
Best Random Forest Score: 0.886

Best Neural Network Parameters:
{'hidden_layer_sizes': (50,), 'alpha': 0.0001, 'activation': 'tanh'}
Best Neural Network Score: 0.895

Best KNN Parameters:
{'k': 5, 'weighted': True}
Best KNN Score: 0.882


