# Label Encoding 

In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [2]:
data = pd.read_csv("final_data.csv")

In [4]:
data.drop(["Unnamed: 0","Id"], axis=1, inplace=True)

In [5]:
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.7,3.2,1.6,0.2,Iris-setosa
2,4.9,3.1,1.5,0.1,Iris-setosa
3,4.4,2.9,1.4,0.2,Iris-setosa
4,5.0,3.4,1.5,0.2,Iris-setosa


In [6]:
le = LabelEncoder() #creating object

In [8]:
data["Species"] = le.fit_transform(data["Species"])

In [9]:
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.7,3.2,1.6,0.2,0
2,4.9,3.1,1.5,0.1,0
3,4.4,2.9,1.4,0.2,0
4,5.0,3.4,1.5,0.2,0


# Controlling Dataset

In [10]:
data.isna().sum() #to make sure that there is no nan value

SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [13]:
data.dtypes # to make sure that there is no String Type

SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species            int64
dtype: object

#  Splitting dataset as train & test

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1], data.iloc[:, -1], test_size=0.2)
#X_test = 80% of first four column
#X_test = 20% of first four column
#y_train = 80% of Species
#y_test = 20% of Species

In [18]:
y_train.value_counts() #checking class distribution is balanced or not

2    45
1    41
0    37
Name: Species, dtype: int64

In [20]:
y_test.value_counts()

1    12
0    10
2     9
Name: Species, dtype: int64

# MODEL

In [22]:
import xgboost as xgb 

In [23]:
xgb_classifier = xgb.XGBClassifier(objective="multiclass:softmax", num_class=3)

In [24]:
xgb_classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=100, n_jobs=0, num_class=3, num_parallel_tree=1,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=None, subsample=1,
       tree_method='exact', validate_parameters=1, verbosity=None)

In [25]:
preds = xgb_classifier.predict(X_test)

In [26]:
X_test

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
17,4.8,3.4,1.9,0.2
37,4.4,3.2,1.3,0.2
38,5.1,3.8,1.9,0.4
144,5.8,2.8,5.1,1.180357
7,5.0,3.6,1.4,0.2
34,4.9,3.1,1.5,0.1
112,7.4,2.8,6.1,1.9
52,4.9,2.4,3.3,1.0
83,5.0,2.3,3.3,1.0
113,7.2,3.0,5.8,1.6


In [27]:
preds

array([0, 0, 0, 2, 0, 0, 2, 1, 1, 1, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 2, 2,
       1, 2, 1, 1, 2, 0, 0, 1, 1], dtype=int64)

In [28]:
import numpy as np
np.array(y_test)

array([0, 0, 0, 2, 0, 0, 2, 1, 1, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 2, 2,
       1, 2, 1, 1, 2, 0, 0, 1, 1], dtype=int64)

# METRICS

In [29]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [30]:
accuracy_score(y_test, preds)

0.967741935483871

In [31]:
confusion_matrix(y_test, preds)

array([[10,  0,  0],
       [ 0, 12,  0],
       [ 0,  1,  8]], dtype=int64)