In scikit-learn, a random split into training and test sets can be quickly computed with the `train_test_split` helper function.

In [4]:
# Optional: uncomment to upgrade scikit-learn in the kernel's own environment.
# %pip install --upgrade scikit-learn


In [6]:
# Importing necessary libraries
import numpy as np  # NumPy for array manipulation
from sklearn.model_selection import train_test_split  # To split data into training and testing sets
from sklearn import datasets  # To load pre-built datasets from scikit-learn
from sklearn import svm  # Support Vector Machine (SVM) algorithm for classification

# Fetch the Iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

# Unpack the feature matrix (one row per flower) and the matching species labels.
x, y = iris.data, iris.target

# Sanity check: expect 150 samples with 4 features each, and 150 labels.
print(x.shape, y.shape)


(150, 4) (150,)


--> x (features): This is the input data. For the Iris dataset, it consists of four numerical measurements per flower: sepal length, sepal width, petal length, and petal width. Each row of x is one sample and each column is one feature, which is why x has shape (150, 4).

--> y (target/labels): This is the output, i.e. the label that corresponds to each row of the input data. For the Iris dataset, each label is an integer (0, 1, or 2) that encodes the species of the flower: 0 = setosa, 1 = versicolor, 2 = virginica.

In [8]:
# Hold out 40% of the samples for evaluation; the remaining 60% are used for training.
# A fixed random_state makes the shuffle, and therefore the split, identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=0,
)


In [9]:
# Display the training-set shapes (features, labels): 60% of 150 samples -> 90 rows.
(x_train.shape, y_train.shape)

((90, 4), (90,))

In [10]:
# Display the test-set shapes (features, labels): 40% of 150 samples -> 60 rows.
(x_test.shape, y_test.shape)

((60, 4), (60,))

In [11]:
# Build a linear-kernel SVM classifier and fit it on the training split only.
# C=1 is the regularization parameter: it trades off a smoother decision boundary
# against classifying every training point correctly.
clf = (
    svm.SVC(kernel='linear', C=1)
    .fit(x_train, y_train)
)

In [12]:
# Evaluate on the held-out test split; the score is the accuracy,
# i.e. the fraction (between 0 and 1) of test samples that are classified correctly.
clf.score(X=x_test, y=y_test)

0.9666666666666667

In [13]:
# Editor convenience only: sets IPython's "greedy" tab-completion option for this session.
# It does not affect any of the results above.
# NOTE(review): newer IPython releases appear to deprecate this option — confirm against the installed version.
%config IPCompleter.greedy=True
