In [2]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/11.0 MB 2.2 MB/s eta 0:00:05
   - -------------------------------------- 0.3/11.0 MB 4.2 MB/s eta 0:00:03
   - -------------------------------------- 0.4/11.0 MB 3.4 MB/s eta 0:00:04
   -- ------------------------------------- 0.7/11.0 MB 3.9 MB/s eta 0:00:03
   --- ------------------------------------ 1.0/11.0 MB 4.7 MB/s eta 0:00:03
   ---- ----------------------------------- 1.2/11.0 MB 4.7 MB/s eta 0:00:03
   ----- ---------------------------------- 1.4/11.0 MB 4.8 MB/s eta 0:00:


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Fetch the breast-cancer dataset; the returned object exposes the feature
# matrix (.data), the labels (.target) and the column names (.feature_names).
cancer_data = load_breast_cancer()

# Build a single DataFrame: one column per feature, with the label appended
# as a trailing 'target' column.
df = pd.DataFrame(cancer_data.data, columns=cancer_data.feature_names).assign(
    target=cancer_data.target
)

# Preview the first rows of the assembled frame.
print(df.head())

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

In [6]:
from sklearn.preprocessing import StandardScaler

# Step 1: Report how many missing values each column contains.
print("Missing values in the dataset:\n", df.isna().sum())

# Had any values been missing, mean imputation would be one way to handle them:
# df.fillna(df.mean(), inplace=True)

# Step 2: Standardize the feature columns (the label column is left out of scaling).
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.drop(columns='target'))

# Rebuild a labelled DataFrame from the scaled array and re-attach the target column.
df_scaled = pd.DataFrame(
    scaled_features, columns=cancer_data.feature_names
).assign(target=df['target'])

# Preview the first rows of the scaled frame.
print(df_scaled.head())

Missing values in the dataset:
 mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64
   mean radius  mean texture  mean perimeter  mean area 

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler  # imported here so this cell does not rely on an earlier cell

# Split the dataset into features and target variable
X = df.drop('target', axis=1)
y = df['target']

# Step 1: Split the data into training and testing sets BEFORE any preprocessing.
# Fitting the scaler on the full dataset would let the test rows' mean/variance
# leak into the training features and make the evaluation optimistic.
# test_size and random_state are unchanged, so the same rows land in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 2: Preprocess the data (Feature Scaling).
# The scaler learns its statistics from the training rows only; the test rows
# are transformed with those same training statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with any cell that still reads X_scaled:
# all rows, scaled with the training-set statistics.
X_scaled = scaler.transform(X)

# Step 3: Implement Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Step 4: Make predictions and evaluate the model
y_pred = log_reg.predict(X_test)

# Calculate accuracy and print the classification report
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Logistic Regression: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy of Logistic Regression: 97.37%

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build a single decision tree (seeded for repeatability) and fit it on the training split.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out split.
y_pred_dt = dt_classifier.predict(X_test)

# Score the predictions against the true test labels and print the summary.
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
print(f"Decision Tree Classifier Accuracy: {accuracy_dt * 100:.2f}%")
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_dt),
)

Decision Tree Classifier Accuracy: 94.74%

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [9]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest (seeded for repeatability) on the training split.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predict labels for the held-out split.
y_pred_rf = rf_classifier.predict(X_test)

# Score the predictions against the true test labels and print the summary.
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest Classifier Accuracy: {accuracy_rf * 100:.2f}%")
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_rf),
)

Random Forest Classifier Accuracy: 96.49%

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [10]:
from sklearn.svm import SVC

# Fit a support vector classifier with a linear kernel on the training split.
svm_classifier = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Predict labels for the held-out split.
y_pred_svm = svm_classifier.predict(X_test)

# Score the predictions against the true test labels and print the summary.
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print(f"SVM Classifier Accuracy: {accuracy_svm * 100:.2f}%")
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_svm),
)

SVM Classifier Accuracy: 95.61%

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.95      0.94        43
           1       0.97      0.96      0.96        71

    accuracy                           0.96       114
   macro avg       0.95      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114



In [11]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier (k = 5) on the training split.
knn_classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict labels for the held-out split.
y_pred_knn = knn_classifier.predict(X_test)

# Score the predictions against the true test labels and print the summary.
accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
print(f"k-NN Classifier Accuracy: {accuracy_knn * 100:.2f}%")
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_knn),
)

k-NN Classifier Accuracy: 94.74%

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [12]:
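# Best-performing algorithm: in the run recorded above, Logistic Regression achieved the highest test accuracy (97.37%),
# narrowly ahead of the Random Forest Classifier (96.49%) and the linear SVM (95.61%).
# Random Forest typically performs well in terms of accuracy and generalization on the breast cancer dataset:
# it reduces overfitting compared to Decision Trees and can handle non-linearity and complex feature interactions.
# On this particular split, however, the simpler linear model did slightly better. With only 114 test samples,
# the gaps between models amount to one to three predictions, so the ranking should not be over-interpreted.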

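# Worst-performing algorithm: in the run recorded above, the Decision Tree Classifier and k-NN tied for the lowest
# test accuracy (94.74%). A single Decision Tree often performs the worst due to its tendency to overfit,
# especially without parameter tuning or pruning.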