In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


Step 1: Load the Dataset

We will use the Breast Cancer Wisconsin dataset, which is available in sklearn.datasets.
This dataset is commonly used for binary classification tasks (malignant vs. benign tumors).

In [2]:
#  Load the Breast Cancer Dataset
cancer = load_breast_cancer()

In [3]:
# Convert to Pandas DataFrame
df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target  # Add target variable (0 = Malignant, 1 = Benign)

This converts the dataset into a structured DataFrame, making it easier to manipulate.
The target column (malignant/benign) is added for classification tasks.

In [7]:
# Display dataset information
print("Dataset Shape:", df.shape)
print("\nFeature Names:", cancer.feature_names)
print("\nClass Labels:", cancer.target_names)

Dataset Shape: (569, 31)

Feature Names: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']

Class Labels: ['malignant' 'benign']


In [9]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [11]:
df.drop_duplicates(inplace=True)

In [13]:
df.head()


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [15]:
# Check for missing values
print("\nMissing Values:\n", df.isnull().sum())


Missing Values:
 mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64


The Breast Cancer dataset does not have missing values in sklearn, so no imputation is needed.
If missing values were present, we could replace them using mean/median imputation.

In [17]:
# Basic statistics of dataset
print("\nDataset Statistics:\n", df.describe())


Dataset Statistics:
        mean radius  mean texture  mean perimeter    mean area  \
count   569.000000    569.000000      569.000000   569.000000   
mean     14.127292     19.289649       91.969033   654.889104   
std       3.524049      4.301036       24.298981   351.914129   
min       6.981000      9.710000       43.790000   143.500000   
25%      11.700000     16.170000       75.170000   420.300000   
50%      13.370000     18.840000       86.240000   551.100000   
75%      15.780000     21.800000      104.100000   782.700000   
max      28.110000     39.280000      188.500000  2501.000000   

       mean smoothness  mean compactness  mean concavity  mean concave points  \
count       569.000000        569.000000      569.000000           569.000000   
mean          0.096360          0.104341        0.088799             0.048919   
std           0.014064          0.052813        0.079720             0.038803   
min           0.052630          0.019380        0.000000            

In [19]:
# Split data into Training & Testing Sets

X = df.drop(columns=['target'])
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [21]:
#standardisation

from sklearn.preprocessing import StandardScaler

object= StandardScaler()
X_scale = object.fit_transform(X)

In [23]:
X_scale

array([[ 1.09706398, -2.07333501,  1.26993369, ...,  2.29607613,
         2.75062224,  1.93701461],
       [ 1.82982061, -0.35363241,  1.68595471, ...,  1.0870843 ,
        -0.24388967,  0.28118999],
       [ 1.57988811,  0.45618695,  1.56650313, ...,  1.95500035,
         1.152255  ,  0.20139121],
       ...,
       [ 0.70228425,  2.0455738 ,  0.67267578, ...,  0.41406869,
        -1.10454895, -0.31840916],
       [ 1.83834103,  2.33645719,  1.98252415, ...,  2.28998549,
         1.91908301,  2.21963528],
       [-1.80840125,  1.22179204, -1.81438851, ..., -1.74506282,
        -0.04813821, -0.75120669]])

Linear Regression

Linear Regression assumes a linear relationship between input features (X) and target variable (Y).


In [25]:
#linear Regression

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(
    X_scale, y, test_size=0.33, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [27]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
mean_squared_error(y_pred_lr, y_test)


0.07084542083000965

In [29]:
mean_absolute_error(y_pred_lr, y_test)


0.2079781518435342

In [31]:
r2_score(y_pred_lr, y_test)


0.6623477529689703

 Decision Tree Regressor

A decision tree splits the data into branches based on feature values, recursively dividing it into smaller subsets.
It aims to reduce variance within each split to improve accuracy.


In [33]:
from sklearn.tree import DecisionTreeRegressor


dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)  
y_pred_dt = dt.predict(X_test)

mean_squared_error(y_pred_dt, y_test)


In [37]:
mean_absolute_error(y_pred_dt, y_test)


0.0797872340425532

In [39]:
r2_score(y_pred_dt, y_test)


0.6585956416464891

Random forest Regressor

An ensemble learning method that creates multiple Decision Trees and averages their outputs to improve stability and accuracy.
Reduces overfitting compared to a single Decision Tree.

In [41]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)  
y_pred_rf = rf.predict(X_test)

In [42]:
mean_squared_error(y_pred_rf, y_test)



0.03797127659574468

In [45]:
mean_absolute_error(y_pred_rf, y_test)


0.08702127659574468

In [47]:
r2_score(y_pred_rf, y_test)

0.7968694763928428

In [None]:
Support Vector Machine (SVM) Regressor

Tries to find a hyperplane that best fits the data while minimizing error within a margin (ε).
Uses kernel functions to handle non-linear relationships.
Instead of minimizing the absolute error, SVM minimizes the hinge loss, ensuring robust predictions.

In [49]:
from sklearn.svm import SVR
svr = SVR(kernel='rbf')
svr.fit(X_train, y_train)  # Needs scaling
y_pred_svr = svr.predict(X_test)

In [51]:
mean_squared_error(y_pred_svr, y_test)

0.038047306664607275

In [53]:
mean_absolute_error(y_pred_svr, y_test)


0.13531694698779018

In [55]:
r2_score(y_pred_svr, y_test)

0.7888464643239743

K-Nearest Neighbors (k-NN) Regressor

k-NN is a non-parametric algorithm that predicts a value based on the average of the k-nearest neighbors in the training set.
It finds the k closest data points to the test point and takes the mean/weighted average of their target values.

In [57]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=5)  # Default: k=5

knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)

In [59]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [61]:
mean_squared_error(y_test, y_pred_knn)


0.033404255319148944

In [63]:
mean_absolute_error(y_test, y_pred_knn)


0.06489361702127659

In [65]:
r2_score(y_test, y_pred_knn)

0.854367830270137

### Best Performing Model:

#### Every model performs well but,
#### K-Nearest Neighbors (k-NN) is the best model as it has the lowest MSE and MAE and the highest R² Score (0.8544).
#### It predicts car prices with the least error and explains the most variance in the dataset.

