In [1]:
#Question 1: Loading and Preprocessing

In [2]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the breast-cancer dataset bundled with scikit-learn and build a
# DataFrame with one column per feature plus the class labels in `target`.
cancer = load_breast_cancer()
df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names).assign(target=cancer.target)

# Report how many missing values each column contains
print(df.isna().sum())

# Separate the labels, then standardize every feature column.
# NOTE(review): the scaler is fit on all rows here; if a hold-out split is made
# afterwards, the held-out rows have already influenced the scaling statistics.
y = df['target']
scaler = StandardScaler()
scaler.fit(df.drop(columns='target'))
X_scaled = scaler.transform(df.drop(columns='target'))


mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64


In [3]:
#Question 2: Classification Algorithm Implementation

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train-test split (80/20, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# X_scaled was standardized with the mean/std of *every* row, so the held-out
# rows have already influenced the features. Refit a scaler on the training
# rows only and apply it to both parts: standardizing again cancels the earlier
# per-feature rescaling, so the result equals scaling the raw features with
# training-set statistics alone. The cell stays re-runnable because the split
# above is always recomputed from X_scaled.
split_scaler = StandardScaler().fit(X_train)
X_train = split_scaler.transform(X_train)
X_test = split_scaler.transform(X_test)

# Define models (seeded where the estimator is stochastic)
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'k-NN': KNeighborsClassifier()
}

# One record per model; the DataFrame is built once after the loop.
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # precision/recall/F1 use scikit-learn's default positive label (1).
    # NOTE(review): confirm which diagnosis label 1 encodes before
    # interpreting recall clinically.
    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred)
    })

results_df = pd.DataFrame(results)
# Display the comparison table, best F1 first (results_df itself stays unsorted)
results_df.sort_values(by='F1 Score', ascending=False)


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.973684,0.972222,0.985915,0.979021
3,SVM,0.973684,0.972222,0.985915,0.979021
2,Random Forest,0.964912,0.958904,0.985915,0.972222
1,Decision Tree,0.947368,0.957746,0.957746,0.957746
4,k-NN,0.947368,0.957746,0.957746,0.957746


In [5]:
# Question 3: Model Comparison 

	Model	Accuracy	Precision	Recall	F1 Score
0	Logistic Regression	0.973684	0.972222	0.985915	0.979021
3	SVM	0.973684	0.972222	0.985915	0.979021
2	Random Forest	0.964912	0.958904	0.985915	0.972222
1	Decision Tree	0.947368	0.957746	0.957746	0.957746
4	k-NN	0.947368	0.957746	0.957746	0.957746
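## Best Performing Models: Logistic Regression and SVM (tied)
Both gave the highest F1 Score (0.979) and the highest accuracy (0.974). Random Forest came next (F1 Score 0.972): it matched their recall (0.986) with slightly lower precision, and it remains a robust model that handles non-linear data well.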

## Worst Performing Models: Decision Tree and k-NN (tied)
Both had the lowest F1 Score (0.958) and the lowest accuracy (0.947). For the Decision Tree this is likely due to overfitting and lack of regularization.

## Question 4: Conclusion

In this classification task, I used five different models to predict breast cancer diagnosis. After preprocessing and training the models, **Logistic Regression** and the **SVM** tied for the best results (accuracy 0.974, F1 Score 0.979), with the **Random Forest Classifier** close behind (F1 Score 0.972).

The **Decision Tree Classifier** and **k-NN** had the weakest performance (F1 Score 0.958 each); for the Decision Tree this is likely because it overfits the training data. This assignment helped me understand how different classification models work and how important it is to scale data and compare results with proper metrics.
