In [None]:
import pandas as pd
import numpy as np


In [None]:
data_file = "penguins.csv"

In [None]:
penguins = pd.read_csv(data_file)
penguins.head(10)

In [None]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit


In [None]:
# Relative frequency of every category in the three categorical columns.
species_vc, island_vc, sex_vc = (
    penguins[column].value_counts(normalize=True)
    for column in ("species", "island", "sex")
)

for frequencies in (species_vc, island_vc, sex_vc):
    print(frequencies)


Add a column in which the species (the target attribute) is ordinal-encoded, and use it for the stratified split.

In [None]:
# Not used by the cells below: list of the distinct species names, in order of
# first appearance in the data, kept for inspection only.
spec_indices = list(penguins.species.unique())
spec_indices

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Note: the result must be a DataFrame here, not a Series. The double brackets
# select a one-column DataFrame, which is then passed to the encoder's
# fit_transform as 2-D input.
penguins_spec = penguins[["species"]]

In [None]:
penguins_spec

In [None]:
# Fit the encoder on the species column first, then map every species name to
# its numeric code in a separate step; finally preview the first ten codes.
ordinal_encoder = OrdinalEncoder().fit(penguins_spec)
penguins_spec_encoded = ordinal_encoder.transform(penguins_spec)
penguins_spec_encoded[:10]

In [None]:
# The encoder output has one column; flatten it to 1-D before storing it as
# the new "spec_ord" column.
penguins["spec_ord"] = np.ravel(penguins_spec_encoded)

In [None]:
penguins

In [None]:
penguins.columns

In [None]:
penguins.info()

In [None]:
# Stratified 70/30 split on the encoded species column.
# StratifiedShuffleSplit yields *positional* row indices, so the rows have to
# be selected with .iloc; .loc would interpret the numbers as index labels and
# only gives the same rows while `penguins` keeps its default RangeIndex.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# n_splits=1, so the generator produces exactly one (train, test) index pair.
train_index, test_index = next(split.split(penguins, penguins["spec_ord"]))

# Explicit copies make both sets independent of `penguins`, so later column
# drops on them cannot raise SettingWithCopyWarning.
strat_train_set = penguins.iloc[train_index].copy()
strat_test_set = penguins.iloc[test_index].copy()

Check the proportions of the test set

In [None]:
strat_test_set.spec_ord.value_counts() / len(strat_test_set)

Compare to the proportions of the whole dataset

In [None]:
penguins["spec_ord"].value_counts()  / len(penguins)

Create a train and a test set with scikit-learn's `train_test_split` (plain random split, not stratified)

In [None]:
# Plain random split with the same test fraction (0.3) and seed (42) as the
# stratified split; it is the "Standard" baseline in the comparison below.
train_set, test_set = train_test_split(penguins, test_size=0.3, random_state=42)

Create random train and test sets with a hand-written split function

In [None]:
def random_split(data, test_ratio, random_state=None):
    """Randomly split ``data`` row-wise into a train and a test part.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object supporting ``len()`` and positional ``.iloc`` indexing.
    test_ratio : float
        Fraction of the rows (between 0 and 1) that goes into the test part.
        The test size is ``int(len(data) * test_ratio)``, i.e. rounded down.
    random_state : int, optional
        Seed for a dedicated random generator, making the split reproducible.
        With the default ``None`` the permutation is drawn from NumPy's global
        random state, exactly as before this parameter existed.

    Returns
    -------
    tuple
        ``(train, test)`` selections of ``data``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is not within ``[0, 1]``.
    """
    # A ratio outside [0, 1] would silently produce nonsensical slices
    # (e.g. a negative size slices from the end), so reject it up front.
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    n_rows = len(data)
    if random_state is None:
        # Keep honouring the global NumPy seed for existing callers.
        shuffled_indexes = np.random.permutation(n_rows)
    else:
        shuffled_indexes = np.random.default_rng(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indexes[:test_set_size]
    train_indices = shuffled_indexes[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

# random_split draws its permutation from NumPy's global random generator.
# Seed it first so the random split (and the comparison table built from it)
# is the same on every run; 42 matches the seed of the other splits.
np.random.seed(42)
random_train_set, random_test_set = random_split(penguins, 0.3)

Compare proportions

In [None]:
def data_proportions(data, column="spec_ord"):
    """Return the share of rows of ``data`` that fall into each value of ``column``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.
    column : str, default "spec_ord"
        Name of the categorical column whose value proportions are computed.

    Returns
    -------
    pandas.Series
        Indexed by the distinct values of ``column``; each entry is that
        value's count divided by ``len(data)``.
    """
    return data[column].value_counts() / len(data)

# One column of species proportions per data set, rows ordered by species code.
proportion_sources = {
    "Overall": penguins,
    "Standard": test_set,
    "Stratified": strat_test_set,
    "Random": random_test_set,
}
compare_props = pd.DataFrame(
    {label: data_proportions(subset) for label, subset in proportion_sources.items()}
).sort_index()

# Relative deviation, in percent, of each test set's proportions from the
# proportions of the whole data set.
for error_label, source_label in (
    ("Std. %error", "Standard"),
    ("Rand. %error", "Random"),
    ("Strat. %error", "Stratified"),
):
    compare_props[error_label] = (
        100 * compare_props[source_label] / compare_props["Overall"] - 100
    )

compare_props


Do an additional split for the validation set

In [None]:
# in the final solution we can call this without Strat prefix
# Split the stratified test set in half: one half stays the test set, the other
# becomes the validation set. Stratifying on species keeps the class
# proportions in both halves, in line with the stratified train/test split.
# NOTE: this rebinds strat_test_set to half of its previous rows, so running
# this cell a second time halves the test set again.
strat_test_set, strat_validation_set = train_test_split(
    strat_test_set,
    test_size=0.5,
    random_state=42,
    stratify=strat_test_set["species"],
)

Drop the helper `spec_ord` column from the train, test and validation sets

In [None]:
# Remove the helper column that was only needed for stratification.
# Each name is rebound to a new frame instead of dropping in place: an in-place
# drop on frames that were sliced out of `penguins` may raise
# SettingWithCopyWarning, and it fails with a KeyError when the cell is run a
# second time. errors="ignore" makes the cell safe to re-run.
strat_train_set, strat_test_set, strat_validation_set = (
    set_.drop(columns="spec_ord", errors="ignore")
    for set_ in (strat_train_set, strat_test_set, strat_validation_set)
)

Visualize species

In [None]:
from matplotlib import pyplot as plt

In [None]:
fig, ax = plt.subplots(figsize=(6, 6))

unique_species = penguins["species"].unique()

# Count the penguins for every (island, species) pair and draw them as grouped
# bars. Overlaid histograms of the categorical island column would paint the
# species on top of each other at the same positions, hiding earlier bars.
# Reindexing keeps the legend in order of first appearance of each species.
species_per_island = pd.crosstab(penguins["island"], penguins["species"]).reindex(
    columns=unique_species, fill_value=0
)
species_per_island.plot.bar(ax=ax, rot=0)

ax.set_xlabel("Island")
ax.set_ylabel("Count")
ax.set_title("Distribution of Species per Island")
ax.legend()

plt.show()

Prepare data - drop target column

In [None]:
# Features: the training rows without the target column.
penguins_train = strat_train_set.drop("species", axis=1)

# Labels: the species of the *same* training rows, so that features and labels
# have matching length and index (taking them from the full `penguins` frame
# would also include the test and validation rows).
# These are the raw string labels only; for the model target use the encoded
# y_train together with the encoder's unique category values.
penguins_labels = strat_train_set["species"].copy()
penguins_labels

Another way to implement the ordinal encoder, using the `ColumnTransformer`

In [None]:
from sklearn.compose import ColumnTransformer

# Column transformer with a single step named "species" that applies an
# OrdinalEncoder to the column at position 0 of its input.
cat_transformer = ColumnTransformer(
    transformers=[("species", OrdinalEncoder(), [0])]
)

In [None]:
# Encode the species of the training rows only, so that y_train has one entry
# per row of the training set (encoding the full `penguins` frame would also
# include the test and validation rows).
# The species values are reshaped into a single-column 2-D array because the
# transformer selects its input by column position; the result keeps that
# (n_rows, 1) shape.
y_train = cat_transformer.fit_transform(
    strat_train_set["species"].to_numpy().reshape(-1, 1)
)
y_train