In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Importing wine data into a pandas dataframe

In [81]:
# Load the red-wine quality data; fields in this file are separated by ';'.
winedf = pd.read_csv("winequality-red.csv", delimiter=';')

Make sure everything imported correctly

In [82]:
# Print the (rows, columns) tuple, then preview the first five rows.
wine_dims = winedf.shape
print(wine_dims)
winedf.head(n=5)

(1599, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


Check for any missing values

In [83]:
# Per-column count of cells pandas considers missing.
winedf.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

`isna()` only detects values that pandas already treats as missing, so we can't be sure yet that the data is completely clean. Let's convert the DataFrame to a NumPy array and check it for NaN values:

In [84]:
# Convert the DataFrame to a NumPy array (defaults spelled out explicitly)
# and display it.
wineData = winedf.to_numpy(dtype=None, copy=False)
wineData

array([[ 7.4  ,  0.7  ,  0.   , ...,  0.56 ,  9.4  ,  5.   ],
       [ 7.8  ,  0.88 ,  0.   , ...,  0.68 ,  9.8  ,  5.   ],
       [ 7.8  ,  0.76 ,  0.04 , ...,  0.65 ,  9.8  ,  5.   ],
       ...,
       [ 6.3  ,  0.51 ,  0.13 , ...,  0.75 , 11.   ,  6.   ],
       [ 5.9  ,  0.645,  0.12 , ...,  0.71 , 10.2  ,  5.   ],
       [ 6.   ,  0.31 ,  0.47 , ...,  0.66 , 11.   ,  6.   ]])

Checking for any NaN values in the matrix

In [85]:
# Boolean mask of NaN cells; summing the mask counts them.
wine_nan_mask = np.isnan(wineData)
wine_nan_mask.sum()

0

The data is clean.

Change last column to 1/0 labels (1 if 6 or higher, 0 if lower than 6)

In [86]:
# Binarise the quality score into a class label:
# 1 if quality >= QUALITY_THRESHOLD, otherwise 0.
QUALITY_THRESHOLD = 6

print(wineData)
print("Changing last column...")
# Derive the labels from the untouched DataFrame column rather than from
# wineData itself. Thresholding wineData[:, -1] in place is not idempotent:
# re-running the cell would compare the already-binarised 0/1 labels with
# the threshold and silently turn every label into 0.
wineData[:, -1] = (winedf["quality"].to_numpy() >= QUALITY_THRESHOLD).astype(int)
print(wineData)

[[ 7.4    0.7    0.    ...  0.56   9.4    5.   ]
 [ 7.8    0.88   0.    ...  0.68   9.8    5.   ]
 [ 7.8    0.76   0.04  ...  0.65   9.8    5.   ]
 ...
 [ 6.3    0.51   0.13  ...  0.75  11.     6.   ]
 [ 5.9    0.645  0.12  ...  0.71  10.2    5.   ]
 [ 6.     0.31   0.47  ...  0.66  11.     6.   ]]
Changing last column...
[[ 7.4    0.7    0.    ...  0.56   9.4    0.   ]
 [ 7.8    0.88   0.    ...  0.68   9.8    0.   ]
 [ 7.8    0.76   0.04  ...  0.65   9.8    0.   ]
 ...
 [ 6.3    0.51   0.13  ...  0.75  11.     1.   ]
 [ 5.9    0.645  0.12  ...  0.71  10.2    0.   ]
 [ 6.     0.31   0.47  ...  0.66  11.     1.   ]]


Import breast cancer data into a pandas dataframe

In [124]:
# The file is read without a header row (column names are assigned afterwards);
# a comma is read_csv's default separator, so it is not passed explicitly.
bcdf = pd.read_csv("breast-cancer-wisconsin.data", header=None)

Add column names

In [125]:
# Column labels, assigned positionally to the header-less frame.
bc_column_names = [
    'Sample code',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]
bcdf.columns = bc_column_names

Make sure everything imported correctly

In [126]:
# Print the (rows, columns) tuple, then preview the first five rows.
bc_dims = bcdf.shape
print(bc_dims)
bcdf.head(n=5)

(699, 11)


Unnamed: 0,Sample code,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


Check for missing values

In [127]:
# Per-column count of cells pandas considers missing.
bcdf.isna().sum()

Sample code                    0
Clump Thickness                0
Uniformity of Cell Size        0
Uniformity of Cell Shape       0
Marginal Adhesion              0
Single Epithelial Cell Size    0
Bare Nuclei                    0
Bland Chromatin                0
Normal Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64

We're not sure that the data is completely clean yet. Let's try importing the data into numpy and see if there are any missing values:

In [128]:
# Re-read the raw file directly with NumPy as a comma-delimited numeric table
# and display the resulting array.
bcData = np.genfromtxt(fname="breast-cancer-wisconsin.data", delimiter=",")
bcData

array([[1.000025e+06, 5.000000e+00, 1.000000e+00, ..., 1.000000e+00,
        1.000000e+00, 2.000000e+00],
       [1.002945e+06, 5.000000e+00, 4.000000e+00, ..., 2.000000e+00,
        1.000000e+00, 2.000000e+00],
       [1.015425e+06, 3.000000e+00, 1.000000e+00, ..., 1.000000e+00,
        1.000000e+00, 2.000000e+00],
       ...,
       [8.888200e+05, 5.000000e+00, 1.000000e+01, ..., 1.000000e+01,
        2.000000e+00, 4.000000e+00],
       [8.974710e+05, 4.000000e+00, 8.000000e+00, ..., 6.000000e+00,
        1.000000e+00, 4.000000e+00],
       [8.974710e+05, 4.000000e+00, 8.000000e+00, ..., 4.000000e+00,
        1.000000e+00, 4.000000e+00]])

Checking for any NaN values in the matrix

In [129]:
# Boolean mask of NaN cells; summing the mask counts them.
bc_nan_mask = np.isnan(bcData)
bc_nan_mask.sum()

16

No missing entries were found in the pandas DataFrame, but 16 were found in the NumPy array. This probably means that the missing values are encoded as some non-numerical character. The text file accompanying the dataset shows this to be the '?' character. We need to replace it with NaN.

In [130]:
# Turn the dataset's '?' placeholders into real missing values so pandas can
# detect them. np.nan is used because the np.NaN alias was removed in
# NumPy 2.0 and raises AttributeError there.
# NOTE(review): a column that contained '?' was presumably parsed as strings
# and stays object dtype after this replacement -- confirm it is converted to
# a numeric dtype before it is used for modelling.
bcdf.replace('?', np.nan, inplace=True)

16 missing entries: How many entries are missing per column?

In [131]:
# Per-column count of missing cells.
bcdf.isnull().sum()

Sample code                     0
Clump Thickness                 0
Uniformity of Cell Size         0
Uniformity of Cell Shape        0
Marginal Adhesion               0
Single Epithelial Cell Size     0
Bare Nuclei                    16
Bland Chromatin                 0
Normal Nucleoli                 0
Mitoses                         0
Class                           0
dtype: int64

We need to drop those rows with missing values.

Number of rows before drop:

In [132]:
# Keep the current shape, then display its row count.
original_shape = bcdf.shape
rows_before_drop, _ = original_shape
rows_before_drop

699

Drop rows:

In [133]:
# Discard every row that has at least one missing value (dropna defaults, spelled out).
bcdf = bcdf.dropna(axis=0, how='any')

Number of rows after drop:

In [134]:
# Current number of rows in the frame.
len(bcdf.index)

683

Remove first column of ID numbers because they don't affect the output:

In [135]:
# Remove the 'Sample code' identifier column, which is not used as a feature.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second execution is a no-op rather than a KeyError
# for the already-removed column.
bcdf = bcdf.drop(columns=['Sample code'], errors='ignore')
# Show a short preview instead of rendering the whole frame.
bcdf.head(10)

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
5,8,10,10,8,7,10,9,7,1,4
6,1,1,1,1,2,10,3,1,1,2
7,2,1,2,1,2,1,3,1,1,2
8,2,1,1,1,2,1,1,1,5,2
9,4,2,1,1,2,1,2,1,1,2


## Task 2

### Implementing Logistic Regression from scratch