In [1]:
import pandas as pd

# Read the Iris CSV from the Kaggle input directory into a DataFrame.
file_path = '/kaggle/input/iris-flower-dataset/IRIS.csv'
iris_data = pd.read_csv(file_path)

# Quick-look reports, printed in order: first rows, per-column null counts,
# numeric summary statistics, and the class balance of the target column.
# Each summary is wrapped in a callable so it is computed right before it is
# printed, one after the other.
overview_sections = (
    ("Data head:\n", lambda frame: frame.head()),
    ("\nMissing values:\n", lambda frame: frame.isnull().sum()),
    ("\nStatistical summary:\n", lambda frame: frame.describe()),
    ("\nSpecies distribution:\n", lambda frame: frame['species'].value_counts()),
)
for heading, summarize in overview_sections:
    print(heading, summarize(iris_data))


Data head:
    sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa

Missing values:
 sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

Statistical summary:
        sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%       

In [2]:
# Integer class code for each species label found in the raw 'species' column.
SPECIES_TO_CODE = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2
}

# Series.map turns every value that is not a key of the dict into NaN, so
# running the encoding a second time on the already-encoded column would wipe
# out the target. Only encode while the column still holds the raw labels,
# which makes this cell safe to re-run.
if not pd.api.types.is_numeric_dtype(iris_data['species']):
    encoded_species = iris_data['species'].map(SPECIES_TO_CODE)

    # Fail loudly instead of silently carrying NaN targets forward when the
    # file contains a label the mapping does not cover.
    unknown_labels = iris_data.loc[encoded_species.isna(), 'species'].unique()
    if len(unknown_labels) > 0:
        raise ValueError(f"Unmapped species labels: {list(unknown_labels)}")

    iris_data['species'] = encoded_species

# Separate features and target variable
X = iris_data.drop(columns=['species'])
y = iris_data['species']

# Feature scaling: standardise each feature column to zero mean / unit variance.
# NOTE(review): fit_transform learns the mean and standard deviation from every
# row of X, i.e. before any train/test split has been made.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("\nFirst 5 rows of scaled features:\n", pd.DataFrame(X_scaled, columns=X.columns).head())



First 5 rows of scaled features:
    sepal_length  sepal_width  petal_length  petal_width
0     -0.900681     1.032057     -1.341272    -1.312977
1     -1.143017    -0.124958     -1.341272    -1.312977
2     -1.385353     0.337848     -1.398138    -1.312977
3     -1.506521     0.106445     -1.284407    -1.312977
4     -1.021849     1.263460     -1.341272    -1.312977


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* features into training (80%) and testing (20%) sets.
# Splitting before scaling keeps the held-out rows out of the scaler's
# statistics; fitting the scaler on all rows first leaks test-set information
# into preprocessing. The seed and test fraction are unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the standardisation parameters from the training rows only, then apply
# that same transform to both splits. `scaler` is rebound here so any later
# cell that standardises new samples uses the training-only statistics.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print("\nTraining set shape:", X_train.shape)
print("Test set shape:", X_test.shape)



Training set shape: (120, 4)
Test set shape: (30, 4)


In [4]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build the gradient-boosting classifier with a fixed seed and fit it on the
# training split in one step (fit returns the estimator itself).
gb_clf = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Predict on both splits so training and testing accuracy can be compared.
y_pred_train_gb = gb_clf.predict(X_train)
y_pred_test_gb = gb_clf.predict(X_test)

# Fraction of correctly classified samples on each split.
accuracy_train_gb = accuracy_score(y_train, y_pred_train_gb)
accuracy_test_gb = accuracy_score(y_test, y_pred_test_gb)

# Per-class precision / recall / F1 on the test split. The display names are
# listed in the order of the integer codes 0, 1, 2 used for the target.
species_names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
report_test_gb = classification_report(
    y_test,
    y_pred_test_gb,
    target_names=species_names,
)

print(f"Gradient Boosting - Training Accuracy: {accuracy_train_gb}")
print(f"Gradient Boosting - Testing Accuracy: {accuracy_test_gb}")
print("Gradient Boosting - Classification Report on Test Data:\n", report_test_gb)


Gradient Boosting - Training Accuracy: 1.0
Gradient Boosting - Testing Accuracy: 1.0
Gradient Boosting - Classification Report on Test Data:
                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       1.00      1.00      1.00         9
 Iris-virginica       1.00      1.00      1.00        11

       accuracy                           1.00        30
      macro avg       1.00      1.00      1.00        30
   weighted avg       1.00      1.00      1.00        30



In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

def _fit_and_score(clf):
    """Fit ``clf`` on the training split and score it on both splits.

    Returns a ``(test_accuracy, train_accuracy)`` tuple computed with
    ``accuracy_score`` against ``y_test`` and ``y_train`` respectively.
    """
    clf.fit(X_train, y_train)
    test_acc = accuracy_score(y_test, clf.predict(X_test))
    train_acc = accuracy_score(y_train, clf.predict(X_train))
    return test_acc, train_acc


# Random Forest: 100 trees, fixed seed.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
accuracy_test_rf, accuracy_train_rf = _fit_and_score(rf_clf)

# K-Nearest Neighbors: vote among the 5 closest training samples.
knn_clf = KNeighborsClassifier(n_neighbors=5)
accuracy_test_knn, accuracy_train_knn = _fit_and_score(knn_clf)

# One row per model, including the Gradient Boosting scores computed earlier.
comparison_rows = [
    ('Gradient Boosting', accuracy_train_gb, accuracy_test_gb),
    ('Random Forest', accuracy_train_rf, accuracy_test_rf),
    ('K-Nearest Neighbors', accuracy_train_knn, accuracy_test_knn),
]
accuracy_comparison = pd.DataFrame(
    comparison_rows,
    columns=['Model', 'Training Accuracy', 'Testing Accuracy'],
)

print("\nAccuracy Comparison:\n", accuracy_comparison)



Accuracy Comparison:
                  Model  Training Accuracy  Testing Accuracy
0    Gradient Boosting           1.000000               1.0
1        Random Forest           1.000000               1.0
2  K-Nearest Neighbors           0.958333               1.0


In [6]:
# Two hand-written flower measurements, labelled with the same feature columns
# as the training frame.
sample_measurements = [
    [5.1, 3.5, 1.4, 0.2],
    [6.7, 3.0, 5.2, 2.3],
]
new_data = pd.DataFrame(sample_measurements, columns=X.columns)

# Apply the already-fitted scaler (transform only, no refit) so the samples
# are standardised with the stored statistics.
new_data_scaled = scaler.transform(new_data)

# Classify the samples with the Gradient Boosting model.
new_predictions = gb_clf.predict(new_data_scaled)

# Translate class codes back to species names; any code other than 0 or 1
# falls through to 'Iris-virginica'.
code_to_species = {0: 'Iris-setosa', 1: 'Iris-versicolor'}
predicted_species = [
    code_to_species.get(pred, 'Iris-virginica')
    for pred in new_predictions
]

print("Predicted species for new data:", predicted_species)


Predicted species for new data: ['Iris-setosa', 'Iris-virginica']
