In [4]:
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy
import math
from xgboost import XGBClassifier

# Preparing the data

In [None]:
# Fetch dataset id 53 through ucimlrepo and bind the returned object to `iris`.
# NOTE(review): the next cell rebinds `iris` to a DataFrame read from a local
# CSV, so the object fetched here is discarded — confirm this cell is still needed.
iris = fetch_ucirepo(id=53)

In [20]:
# Load the local copy of the data. header=None tells pandas the file has no
# header row, so the columns get integer labels instead of names.
iris = pd.read_csv(filepath_or_buffer='iris/iris.data', header=None)

In [28]:
# Display the loaded frame; the rendered table elides the middle rows.
iris

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [29]:
# (rows, columns) of the loaded frame.
iris.shape

(150, 5)

In [31]:
# Confirm that `iris` is a pandas DataFrame, which the .iloc split below relies on.
type(iris)

pandas.core.frame.DataFrame

In [None]:
# data (as pandas dataframes)
# The `.data.features` / `.data.targets` attributes belong to the object
# returned by fetch_ucirepo. When `iris` is the DataFrame read from the local
# CSV, that attribute access raises AttributeError and stops a top-to-bottom
# run, so this path is skipped and the positional split in the following cell
# defines X and y instead.
if not isinstance(iris, pd.DataFrame):
    X = iris.data.features
    y = iris.data.targets

In [32]:
# Positional split: every column but the last is a feature, the last column
# is the target.
X, y = iris.iloc[:, :-1], iris.iloc[:, -1]

In [36]:
# Sanity check: X should be 2-D (rows, features) and y 1-D (rows,).
print(f'X shape: {X.shape}, y shape: {y.shape}')

X shape: (150, 4), y shape: (150,)


In [38]:
# Peek at the first five raw targets; they are still class-name strings here.
y[:5]

0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: 4, dtype: object

In [40]:
# Encode the string class names as integer codes. `le` is kept so the codes
# can be mapped back to the original names later.
le = LabelEncoder()
label = le.fit_transform(y)

In [46]:
# Bind the encoded labels to the `y_np` name used by the rest of the notebook.
# This is an alias of `label`, not a copy.
y_np = label

In [47]:
# Confirm the encoded target is a NumPy array rather than a pandas object.
type(y_np)

numpy.ndarray

In [48]:
# Show the encoded labels.
# NOTE(review): this prints every element; a slice such as y_np[:5] would keep
# the output short.
y_np

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [None]:
# Remove the string 'class' column from the target frame.
# A column drop only makes sense when `y` is a DataFrame. When `y` is the
# Series produced by `iris.iloc[:, -1]` there is no column axis, the call
# fails, and a top-to-bottom run stops here, so the drop is guarded.
if isinstance(y, pd.DataFrame):
    y.drop('class', axis=1, inplace=True)

In [None]:
# Store the integer-encoded labels as the 'class' column of the target frame.
# Guarded because item assignment creates a column only on a DataFrame; on a
# Series the key addresses a row label, so the unguarded statement would not
# do what is intended when `y` comes from `iris.iloc[:, -1]`.
if isinstance(y, pd.DataFrame):
    y['class'] = label

In [49]:
# Convert the feature frame to a plain NumPy array for the model.
X_np = X.to_numpy()
# Left disabled: y_np is already bound to the encoded label array, whereas `y`
# still holds the class-name strings.
# y_np = y.to_numpy()

In [53]:
# Preview the first five rows of each array together with the full shapes.
print(f'X: {X_np[:5]}, Shape X: {X_np.shape}')
print(f'y: {y_np[:5]}, Shape y: {y_np.shape}')

X: [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]], Shape X: (150, 4)
y: [0 0 0 0 0], Shape y: (150,)


In [72]:
# Turn the flat label vector into a column vector of shape (n_samples, 1).
# -1 lets NumPy infer the row count, so the cell no longer breaks if the
# number of samples changes, and re-running it remains a no-op.
y_np = y_np.reshape(-1, 1)

In [73]:
# Verify the reshape: the target is now 2-D with a single column.
y_np.shape

(150, 1)

In [78]:
# Last five encoded labels, shown in their column-vector layout.
y_np[-5:]

array([[2],
       [2],
       [2],
       [2],
       [2]])

# XGBoost

In [79]:
# Classifier constructed with no arguments, i.e. the library's default settings.
model = XGBClassifier()

In [80]:
# Baseline: fit on the complete dataset (no hold-out set at this point).
model.fit(X_np, y_np)

In [81]:
# Predict on the very rows the model was just fitted on.
y_pred = model.predict(X_np)

In [82]:
# Show the in-sample predictions (a flat array, one class code per row).
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [83]:
# Accuracy measured on the data the model was fitted on, so it is an optimistic
# figure and not an estimate of performance on unseen samples.
print(f'Train accuracy: {model.score(X_np, y_np)}')

Train accuracy: 1.0


## Splitting

In [86]:
from sklearn.model_selection import train_test_split

In [92]:
# Hold out 40% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X_np,
    y_np,
    test_size=0.40,
    random_state=42,
)

In [88]:
# x_cv, x_test, y_cv, y_test = train_test_split(x_, y_, test_size=0.50, random_state=42)

In [89]:
# Release the temporaries `x_` / `y_` if they are present. No visible cell
# assigns them (the split that would have produced them is commented out), so
# a bare `del` raises NameError on a fresh kernel. Popping with a default
# removes the names when they exist and does nothing otherwise.
for _stale in ('x_', 'y_'):
    globals().pop(_stale, None)

In [93]:
# Report the shapes of the two partitions produced by the split.
print(f"the shape of the training set (input) is: {x_train.shape}")
print(f"the shape of the training set (target) is: {y_train.shape}\n")
# Cross-validation reporting is disabled: x_cv / y_cv are only created by a
# split that is itself commented out.
# print(f"the shape of the cross validation set (input) is: {x_cv.shape}")
# print(f"the shape of the cross validation set (target) is: {y_cv.shape}\n")
print(f"the shape of the test set (input) is: {x_test.shape}")
print(f"the shape of the test set (target) is: {y_test.shape}")

the shape of the training set (input) is: (90, 4)
the shape of the training set (target) is: (90, 1)

the shape of the test set (input) is: (60, 4)
the shape of the test set (target) is: (60, 1)


In [94]:
# Fit the existing `model` object again, this time on the training split only,
# so the test rows stay unseen.
model.fit(x_train, y_train)

In [95]:
# Predictions for the training rows (used for the training metrics below).
yhat_tr = model.predict(x_train)

In [97]:
# Predictions for the held-out test rows.
yhat_ts = model.predict(x_test)

In [108]:
# Show the training predictions: a flat array, unlike the column-vector y_train.
yhat_tr

array([2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 0, 2, 2, 1, 1, 2, 1, 0, 1, 2, 0,
       0, 1, 1, 0, 2, 0, 0, 1, 1, 2, 1, 2, 2, 1, 0, 0, 2, 2, 0, 0, 0, 1,
       2, 0, 2, 2, 0, 1, 1, 2, 1, 2, 0, 2, 1, 2, 1, 1, 1, 0, 1, 1, 0, 1,
       2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 2, 1, 1, 2, 2, 0, 1, 2, 0,
       1, 2])

In [109]:
# Show the training targets for visual comparison with yhat_tr.
# NOTE(review): this prints one row per sample; a slice such as y_train[:5]
# would keep the output short.
y_train

array([[2],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [2],
       [0],
       [1],
       [2],
       [0],
       [2],
       [2],
       [1],
       [1],
       [2],
       [1],
       [0],
       [1],
       [2],
       [0],
       [0],
       [1],
       [1],
       [0],
       [2],
       [0],
       [0],
       [1],
       [1],
       [2],
       [1],
       [2],
       [2],
       [1],
       [0],
       [0],
       [2],
       [2],
       [0],
       [0],
       [0],
       [1],
       [2],
       [0],
       [2],
       [2],
       [0],
       [1],
       [1],
       [2],
       [1],
       [2],
       [0],
       [2],
       [1],
       [2],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [2],
       [2],
       [0],
       [1],
       [2],
       [2],
       [0],
       [2],
       [0],
       [1],
       [2],
       [2],
       [1],
       [2],
       [1],
       [1],
       [2],
    

In [106]:
# Accuracy on the training split, i.e. data the model has already seen.
print(f'Train accuracy: {model.score(x_train, y_train)}')

Train accuracy: 1.0


In [107]:
# Accuracy on the held-out split, the figure that reflects generalization.
print(f'Test accuracy: {model.score(x_test, y_test)}')

Test accuracy: 0.9833333333333333


## Error evaluation

In [113]:
from sklearn.metrics import mean_squared_error

In [118]:
# Training error as the misclassification rate (share of wrong predictions).
# Mean squared error treats the class codes 0/1/2 as magnitudes, so mistaking
# class 0 for class 2 would weigh four times as much as mistaking it for
# class 1; for a classifier the fraction of wrong labels is the meaningful
# error. y_train is a column vector (n, 1) while yhat_tr is flat (n,), so both
# are flattened first, otherwise `!=` would broadcast to an (n, n) matrix.
print(f'Train error: {np.mean(np.ravel(y_train) != np.ravel(yhat_tr))}')

Train error: 0.0


In [121]:
# Test error as the misclassification rate (share of wrong predictions).
# Mean squared error treats the class codes 0/1/2 as magnitudes, so mistaking
# class 0 for class 2 would weigh four times as much as mistaking it for
# class 1; for a classifier the fraction of wrong labels is the meaningful
# error. y_test is a column vector (n, 1) while yhat_ts is flat (n,), so both
# are flattened first, otherwise `!=` would broadcast to an (n, n) matrix.
print(f'Test error: {np.mean(np.ravel(y_test) != np.ravel(yhat_ts))}')

Test error: 0.016666666666666666
