# AIDI 2004 -  AI in Enterprise Systems
## Lab #2 (Dataset used for this lab is the Breast Cancer Wisconsin (Diagnostic) dataset)
### Submitted By: Batool Talha
### Submission Date: June 09, 2024

### Prepare Problem

In [1]:
# Load Libraries
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
from sklearn.model_selection import GridSearchCV 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


In [2]:
# Get the dataset -> Breast Cancer Wisconsin (Diagnostic)
# https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic

%pip install ucimlrepo
from ucimlrepo import fetch_ucirepo
  
# fetch dataset 
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 



[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### Summarize Data/ Exploratory Data Analysis

In [3]:
# metadata 
# print(breast_cancer_wisconsin_diagnostic.metadata) 
breast_cancer_wisconsin_diagnostic.metadata

{'uci_id': 17,
 'name': 'Breast Cancer Wisconsin (Diagnostic)',
 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic',
 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv',
 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.',
 'area': 'Health and Medicine',
 'tasks': ['Classification'],
 'characteristics': ['Multivariate'],
 'num_instances': 569,
 'num_features': 30,
 'feature_types': ['Real'],
 'demographics': [],
 'target_col': ['Diagnosis'],
 'index_col': ['ID'],
 'has_missing_values': 'no',
 'missing_values_symbol': None,
 'year_of_dataset_creation': 1993,
 'last_updated': 'Fri Nov 03 2023',
 'dataset_doi': '10.24432/C5DW2B',
 'creators': ['William Wolberg',
  'Olvi Mangasarian',
  'Nick Street',
  'W. Street'],
 'intro_paper': {'title': 'Nuclear feature extraction for breast tumor diagnosis',
  'authors': 'W. Street, W. Wolberg, O. Mangasarian',
  'published_in': 'Electronic imaging',
  'year': 1993,
  'url': 'https:

In [4]:
# variable information 
# print(breast_cancer_wisconsin_diagnostic.variables) 
breast_cancer_wisconsin_diagnostic.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,ID,ID,Categorical,,,,no
1,Diagnosis,Target,Categorical,,,,no
2,radius1,Feature,Continuous,,,,no
3,texture1,Feature,Continuous,,,,no
4,perimeter1,Feature,Continuous,,,,no
5,area1,Feature,Continuous,,,,no
6,smoothness1,Feature,Continuous,,,,no
7,compactness1,Feature,Continuous,,,,no
8,concavity1,Feature,Continuous,,,,no
9,concave_points1,Feature,Continuous,,,,no


### Prepare Data

In [5]:
# data (as pandas dataframes) 
X = breast_cancer_wisconsin_diagnostic.data.features 
y = breast_cancer_wisconsin_diagnostic.data.targets 

In [22]:
X

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


### Model Building and Evaluate Algorithm 
#### ML Model #1 - Multiple Linear Regressor

In [6]:
# Split the dataset into train and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 7)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# Determine the number of input features
n_features = X_train.shape[1]
print(n_features)

(398, 30) (171, 30) (398, 1) (171, 1)
30


In [23]:
X_train

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
543,13.210,28.06,84.88,538.4,0.08671,0.06877,0.029870,0.032750,0.1628,0.05781,...,14.370,37.17,92.48,629.6,0.1072,0.13810,0.106200,0.07958,0.2473,0.06443
58,13.050,19.31,82.61,527.2,0.08060,0.03789,0.000692,0.004167,0.1819,0.05501,...,14.230,22.25,90.24,624.1,0.1021,0.06191,0.001845,0.01111,0.2439,0.06289
436,12.870,19.54,82.67,509.2,0.09136,0.07883,0.017970,0.020900,0.1861,0.06347,...,14.450,24.38,95.14,626.9,0.1214,0.16520,0.071270,0.06384,0.3313,0.07735
453,14.530,13.98,93.86,644.2,0.10990,0.09242,0.068950,0.064950,0.1650,0.06121,...,15.800,16.93,103.10,749.9,0.1347,0.14780,0.137300,0.10690,0.2606,0.07810
4,20.290,14.34,135.10,1297.0,0.10030,0.13280,0.198000,0.104300,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.1374,0.20500,0.400000,0.16250,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,11.840,18.94,75.51,428.0,0.08871,0.06900,0.026690,0.013930,0.1533,0.06057,...,13.300,24.99,85.22,546.3,0.1280,0.18800,0.147100,0.06913,0.2535,0.07993
502,12.540,16.32,81.25,476.3,0.11580,0.10850,0.059280,0.032790,0.1943,0.06612,...,13.570,21.40,86.67,552.0,0.1580,0.17510,0.188900,0.08411,0.3155,0.07538
537,11.690,24.44,76.37,406.4,0.12360,0.15520,0.045150,0.045310,0.2131,0.07405,...,12.980,32.19,86.12,487.7,0.1768,0.32510,0.139500,0.13080,0.2803,0.09970
196,13.770,22.29,90.63,588.9,0.12000,0.12670,0.138500,0.065260,0.1834,0.06877,...,16.390,34.01,111.60,806.9,0.1737,0.31220,0.380900,0.16730,0.3080,0.09333


In [7]:
# Initialize the instance of Multiple Linear Regressor
mult_linear_reg = LinearRegression()
# Use all the defaults of the model as set in sci-kit learn
# LinearRegression(fit_intercept=True, copy_X=True, 
#                  n_jobs=None, positive=False)
# Fit the training data to the multi linear regression model
mult_linear_reg.fit(X_train, y_train)

ValueError: could not convert string to float: 'B'

In [78]:
# Predict the output using the training data using the created/ fitted model
y_pred_train = mult_linear_reg.predict(X_train)
# Calculate the performance metrics like mean absolute error, mean squared error, root mean squared error
# mean_absolute_error(y_train, y_pred_train)
#Print MSE and RMSE
print('Mean Absolute Error: {:.2f}'.format(mean_absolute_error(y_train, y_pred_train)))
print('Mean Squared Error: {:.2f}'.format(mean_squared_error(y_train, y_pred_train)))
print('Root Mean Squared Error: {:.2f}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train)))) 

Mean Absolute Error: 126189.99
Mean Squared Error: 40925020393.45
Root Mean Squared Error: 202299.33


In [79]:
# Predict the output using the test data using the created/ fitted model
y_pred_test = mult_linear_reg.predict(X_test)
# Calculate the performance metrics like mean absolute error, mean squared error, root mean squared error
# mean_absolute_error(y_test, y_pred_test)
#Print MSE and RMSE
print('Mean Absolute Error: {:.2f}'.format(mean_absolute_error(y_test, y_pred_test)))
print('Mean Squared Error: {:.2f}'.format(mean_squared_error(y_test, y_pred_test)))
print('Root Mean Squared Error: {:.2f}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test)))) 

Mean Absolute Error: 128664.32
Mean Squared Error: 43080626089.75
Root Mean Squared Error: 207558.73


In [8]:
# Import svm model
# from sklearn import svm

# Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

# Train the model using the training sets
clf.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)

  y = column_or_1d(y, warn=True)


In [10]:
# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9532163742690059


In [11]:
# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:",metrics.precision_score(y_test, y_pred))

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:",metrics.recall_score(y_test, y_pred))

ValueError: pos_label=1 is not a valid label. It should be one of ['B', 'M']