Author: Python Engineer: Machine Learning algorithm implementations from scratch. https://www.youtube.com/playlist?list=PLqnslRFeH2Upcrywf-u2etjdxxkL8nl7E

In [1]:
import csv
import numpy as np
import pandas as pd

# Download data from https://archive.ics.uci.edu/ml/datasets/spambase
FILE_NAME = "spambase.data"

# 1) Load with the csv module. This is not the recommended way: csv.reader
#    yields rows of strings, so the numeric conversion has to be done
#    separately with np.array() below.
# newline="" follows the csv module documentation, so the reader handles line
# endings itself instead of relying on the file object's newline translation.
with open(FILE_NAME, "r", newline="") as f:
    # Drop empty rows (produced by blank lines) so that np.array() receives a
    # rectangular list of equally long rows.
    data = [row for row in csv.reader(f, delimiter=",") if row]

# Convert the parsed string fields to a float32 matrix.
data = np.array(data, dtype=np.float32)
print(data.shape)

# Split into X and y: y is taken from the last column, X from all columns before it.
n_samples, n_features = data.shape
n_features -= 1  # one column is used for y, the rest are features
print('n_features=',n_features)

X = data[:, 0:n_features]
y = data[:, n_features]

print(X.shape, y.shape)
print(X[0, 0:5])
print(y)
# or if y is the first column
# X = data[:, 1:n_features+1]
# y = data[:, 0]

(4601, 58)
n_features= 57
(4601, 57) (4601,)
[0.   0.64 0.64 0.   0.32]
[1. 1. 1. ... 0. 0. 0.]


In [2]:
# 2) Load with np.loadtxt()
# Optional argument worth knowing: skiprows=1
data = np.loadtxt(FILE_NAME, dtype=np.float32, delimiter=",")
print(data.shape, data.dtype)

# Split into X and y: every column except the last goes to X, the last one to y.
n_samples, n_features = data.shape
n_features = n_features - 1  # do not count the column used for y

X, y = data[:, :-1], data[:, -1]

print(X.shape, y.shape)
print(X[0, :5])
# Alternative when y is stored in the first column instead:
# X = data[:, 1:n_features+1]
# y = data[:, 0]

(4601, 58) float32
(4601, 57) (4601,)
[0.   0.64 0.64 0.   0.32]


In [3]:
# 3) Load with np.genfromtxt() -- the recommended method, since it can deal with missing data.
# Related optional arguments: skip_header=0, missing_values="---", filling_values=0.0
# If the first line of the file is a header, use instead:
# data = np.genfromtxt(FILE_NAME, delimiter=",", dtype=np.float32, skip_header=1)
data = np.genfromtxt(FILE_NAME, dtype=np.float32, delimiter=",")
print(data.shape, data.dtype)

# Split into X and y: the final column becomes y, all preceding columns become X.
n_samples, n_features = data.shape
n_features = n_features - 1  # the column used for y is not a feature

X = data[:, :n_features]
y = data[:, -1]

print(X.shape, y.shape)
print(X[0, :5])
# Alternative when y is stored in the first column instead:
# X = data[:, 1:n_features+1]
# y = data[:, 0]

(4601, 58) float32
(4601, 57) (4601,)
[0.   0.64 0.64 0.   0.32]


In [4]:
# 4) Load with pandas: read_csv()
# Related optional argument: na_values = ['---']
# If the first line of the file is a header, pass skiprows=1 instead of skiprows=0.
# Missing entries are replaced with 0.0 right after reading.
df = pd.read_csv(
    FILE_NAME,
    header=None,
    skiprows=0,
    dtype=np.float32,
).fillna(0.0)

# DataFrame -> NumPy array
data = df.to_numpy()
print(data[4, :5], data.dtype)

# Split into X and y: the final column becomes y, all preceding columns become X.
n_samples, n_features = data.shape
n_features = n_features - 1  # the column used for y is not a feature

X, y = data[:, :-1], data[:, -1]

print(X.shape, y.shape)
print(X[0, :5])

# To convert the dtype on the NumPy side instead:
# data = np.asarray(data, dtype = np.float32)
# print(data.dtype)


[0.   0.   0.   0.   0.63] float32
(4601, 57) (4601,)
[0.   0.64 0.64 0.   0.32]
