In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [2]:
data = pd.read_csv("./datasets/final_data.csv")

In [3]:
data.drop(labels=["Unnamed: 0", "Id"], axis=1, inplace=True)

In [5]:
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.7,3.2,1.6,0.2,Iris-setosa
2,4.9,3.1,1.5,0.1,Iris-setosa
3,4.4,2.9,1.4,0.2,Iris-setosa
4,5.0,3.4,1.5,0.2,Iris-setosa


### Label Encoding

In [6]:
le = LabelEncoder()

In [9]:
data["Species"] = le.fit_transform(data["Species"])

In [10]:
data.head(10)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.7,3.2,1.6,0.2,0
2,4.9,3.1,1.5,0.1,0
3,4.4,2.9,1.4,0.2,0
4,5.0,3.4,1.5,0.2,0
5,4.6,3.4,1.4,0.3,0
6,5.4,3.9,1.7,0.4,0
7,5.0,3.6,1.4,0.2,0
8,4.6,3.1,1.5,0.2,0
9,4.7,3.2,1.3,0.2,0


In [11]:
data.isna().sum()

SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [14]:
data.dtypes

SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species            int32
dtype: object

In [15]:
from sklearn.model_selection import train_test_split

In [30]:
x_train, x_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=0.2)

In [31]:
y_train.value_counts()

1    44
2    43
0    36
Name: Species, dtype: int64

In [32]:
y_test.value_counts()

2    11
0    11
1     9
Name: Species, dtype: int64

### Creating Model

In [33]:
# !pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.3-py3-none-win_amd64.whl (89.1 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.3


In [35]:
import xgboost as xgb

In [38]:
xgb_cls = xgb.XGBClassifier(objective="multi:softmax", num_class=3)

In [39]:
xgb_cls.fit(x_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_class=3,
              num_parallel_tree=None, objective='multi:softmax', ...)

In [40]:
preds = xgb_cls.predict(x_test)

In [41]:
preds

array([0, 2, 1, 2, 0, 0, 1, 2, 0, 0, 1, 0, 2, 0, 0, 0, 2, 1, 2, 2, 2, 2,
       0, 2, 0, 1, 1, 1, 1, 2, 1])

In [42]:
import numpy as np

In [43]:
np.array(y_test)

array([0, 2, 1, 2, 0, 0, 1, 2, 0, 0, 1, 0, 2, 0, 0, 0, 2, 1, 2, 1, 2, 2,
       0, 2, 0, 1, 2, 1, 1, 2, 1])

In [44]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [45]:
accuracy_score(y_test, preds)

0.9354838709677419

In [46]:
confusion_matrix(y_test, preds)

array([[11,  0,  0],
       [ 0,  8,  1],
       [ 0,  1, 10]], dtype=int64)