In [None]:
pip install ucimlrepo

In [None]:
pip install xgboost

In [1]:
import pandas as pd
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
zoo = fetch_ucirepo(id=111) 

# data (as pandas dataframes) 
X = zoo.data.features 
y = zoo.data.targets 

In [2]:
df = pd.concat([X,y],axis=1).sample(frac=1).reset_index(drop=True)

In [3]:
X = df.drop(columns='type')
y = df['type']

In [4]:
X

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize
0,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,1
1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1
2,0,0,1,0,0,1,1,0,0,0,0,0,6,0,0,0
3,0,0,0,1,0,1,1,1,1,1,0,1,0,1,0,1
4,0,1,1,0,1,0,0,0,1,1,0,0,2,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0,0,1,0,1,0,1,0,0,1,0,0,6,0,0,0
97,0,1,1,0,0,0,1,0,1,1,0,0,2,1,0,1
98,0,1,1,0,0,1,1,0,1,1,0,0,2,1,0,1
99,0,0,1,0,1,0,0,0,0,1,0,0,6,0,0,0


In [5]:
y = y-1

In [6]:
y

0      3
1      0
2      6
3      0
4      1
      ..
96     5
97     1
98     1
99     5
100    0
Name: type, Length: 101, dtype: int64

In [7]:
X.dtypes

hair        int64
feathers    int64
eggs        int64
milk        int64
airborne    int64
aquatic     int64
predator    int64
toothed     int64
backbone    int64
breathes    int64
venomous    int64
fins        int64
legs        int64
tail        int64
domestic    int64
catsize     int64
dtype: object

In [8]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score,accuracy_score,f1_score,classification_report

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, 
                                                    random_state=10,stratify=y)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [11]:
clf_rf = RandomForestClassifier(random_state=42)
clf_rf.fit(X_train,y_train)
pred_rf = clf_rf.predict(X_test)

pres = precision_score(y_test,pred_rf,average='macro')
acc = accuracy_score(y_test,pred_rf)
print(pres,acc)

0.7738095238095238 0.9047619047619048


In [12]:
clf_xgb = XGBClassifier()
clf_xgb.fit(X_train,y_train)
pred_xgb = clf_xgb.predict(X_test)

pres = precision_score(y_test,pred_xgb,average='macro')
acc = accuracy_score(y_test,pred_xgb)
print(pres,acc)

0.7738095238095238 0.9047619047619048


In [13]:
from sklearn.model_selection import GridSearchCV

In [14]:
rf_params = {
    'min_samples_split': [1,2,3],
    'n_estimators': [100, 200],        # Number of trees
    'max_depth': [10, 20,None],            # Max depth of trees
    'min_samples_leaf': [1, 2, 4],          # Min samples at a leaf node
}

In [15]:
# Grid search for Random Forest
rf_grid_search = GridSearchCV(estimator=clf_rf, param_grid=rf_params, 
                              cv=3, n_jobs=-1, verbose=2)
rf_grid_search.fit(X_train, y_train)

print("Best RandomForest Parameters:", rf_grid_search.best_params_)
print("Best RandomForest Score:", rf_grid_search.best_score_)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best RandomForest Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best RandomForest Score: 0.9876543209876543


In [16]:
best_estomator = rf_grid_search.best_estimator_

In [17]:
pred_rf = best_estomator.predict(X_test)

pres = precision_score(y_test,pred_rf,average='macro')
acc = accuracy_score(y_test,pred_rf)
print(pres,acc)

0.7738095238095238 0.9047619047619048


In [18]:
xgb_params = {
    'n_estimators': [100, 200, 300],      # Number of boosting rounds
    'max_depth': [3, 6, 10],              # Maximum depth of each tree
    'learning_rate': [0.01, 0.1, 0.2],    # Step size shrinkage
    'subsample': [0.7, 0.8, 1.0],         # Fraction of samples used for boosting
    'colsample_bytree': [0.7, 0.8, 1.0],  # Fraction of features used per tree
}

In [19]:
# Grid search for Random Forest
xgb_grid_search = GridSearchCV(estimator=clf_xgb, param_grid=rf_params, 
                              cv=3, n_jobs=-1, verbose=2)
xgb_grid_search.fit(X_train, y_train)

print("Best RandomForest Parameters:", xgb_grid_search.best_params_)
print("Best RandomForest Score:", xgb_grid_search.best_score_)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best RandomForest Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 1, 'n_estimators': 100}
Best RandomForest Score: 0.9753086419753086


In [20]:
best_estomator = xgb_grid_search.best_estimator_
pred_rf = best_estomator.predict(X_test)

pres = precision_score(y_test,pred_rf,average='macro')
acc = accuracy_score(y_test,pred_rf)
print(pres,acc)

0.7738095238095238 0.9047619047619048
