In [27]:
import os
import pandas as pd
from waste_management.config import PROCESSED_DATA_DIR
from loguru import logger
from sklearn.model_selection import train_test_split

In [25]:
# Announce the start of the load step
logger.info('Loading the dataset')

# Parse dataset.csv, located inside PROCESSED_DATA_DIR, into a DataFrame
df = pd.read_csv(
    os.path.join(PROCESSED_DATA_DIR, 'dataset.csv'),
)

# Preview the first rows; loguru fills the placeholder from the extra positional argument
logger.info('dataframe head - {}', df.head())

# Reaching this line means read_csv did not raise
logger.success('Dataset loaded successfully')

[32m2024-07-11 21:51:01.938[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mLoading the dataset[0m
[32m2024-07-11 21:51:01.945[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mdataframe head -                                             filepath      label
0  /mnt/e/Programowanie/waste_management/data/raw...  cardboard
1  /mnt/e/Programowanie/waste_management/data/raw...  cardboard
2  /mnt/e/Programowanie/waste_management/data/raw...  cardboard
3  /mnt/e/Programowanie/waste_management/data/raw...  cardboard
4  /mnt/e/Programowanie/waste_management/data/raw...  cardboard[0m
[32m2024-07-11 21:51:01.945[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [32m[1mDataset loaded successfully[0m


In [29]:
# Plain random hold-out split (no stratification): 20% of the rows go to
# validation, and the fixed random_state makes the shuffle repeatable
logger.info('Splitting the dataset into train and validation')
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Log how many rows landed on each side of the split
logger.info('Number of images in the training set: {}', len(train_df))
logger.info('Number of images in the validation set: {}', len(val_df))

[32m2024-07-11 21:54:09.806[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mSplitting the dataset into train and validation[0m
[32m2024-07-11 21:54:09.810[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mNumber of images in the training set: 2021[0m
[32m2024-07-11 21:54:09.810[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mNumber of images in the validation set: 506[0m


In [36]:
# Percentage share of every label, computed the same way for the full
# dataframe, the training split and the validation split (in that order)
df_class_distribution, train_class_distribution, val_class_distribution = (
    frame['label'].value_counts(normalize=True).mul(100)
    for frame in (df, train_df, val_df)
)

# Full dataset: the reference the two splits are compared against
logger.info("Class Distribution for df:")
logger.info(df_class_distribution.round(2))

# Training split
logger.info("Class Distribution for train_df:")
logger.info(train_class_distribution.round(2))

# Validation split
logger.info("Class Distribution for val_df:")
logger.info(val_class_distribution.round(2))

[32m2024-07-11 21:56:56.924[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mClass Distribution for df:[0m
[32m2024-07-11 21:56:56.924[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mlabel
paper        23.51
glass        19.83
plastic      19.07
metal        16.22
cardboard    15.95
trash         5.42
Name: proportion, dtype: float64[0m
[32m2024-07-11 21:56:56.925[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mClass Distribution for train_df:[0m
[32m2024-07-11 21:56:56.925[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mlabel
paper        23.55
glass        19.50
plastic      18.95
cardboard    16.43
metal        15.98
trash         5.59
Name: proportion, dtype: float64[0m
[32m2024-07-11 21:56:56.925[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mClass Distribution for val_df:[0m
[32m2024-07-11 21:56:56.925[0m | [1mINFO    

In [37]:
# Stratified hold-out split: 20% of the rows go to validation, the 'label'
# column is passed as the stratification key, and random_state fixes the shuffle
logger.info('Splitting the dataset into train and validation')
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Log how many rows landed on each side of the split
logger.info('Number of images in the training set: {}', len(train_df))
logger.info('Number of images in the validation set: {}', len(val_df))

[32m2024-07-11 22:01:15.273[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mSplitting the dataset into train and validation[0m
[32m2024-07-11 22:01:15.279[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mNumber of images in the training set: 2021[0m
[32m2024-07-11 22:01:15.279[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mNumber of images in the validation set: 506[0m


In [38]:
# Label frequencies expressed as percentages, in the order:
# whole dataframe, training split, validation split
df_class_distribution, train_class_distribution, val_class_distribution = [
    100 * subset['label'].value_counts(normalize=True)
    for subset in (df, train_df, val_df)
]

# Whole dataframe (baseline proportions)
logger.info("Class Distribution for df:")
logger.info(df_class_distribution.round(2))

# Training split
logger.info("Class Distribution for train_df:")
logger.info(train_class_distribution.round(2))

# Validation split
logger.info("Class Distribution for val_df:")
logger.info(val_class_distribution.round(2))

[32m2024-07-11 22:01:17.218[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mClass Distribution for df:[0m
[32m2024-07-11 22:01:17.219[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mlabel
paper        23.51
glass        19.83
plastic      19.07
metal        16.22
cardboard    15.95
trash         5.42
Name: proportion, dtype: float64[0m
[32m2024-07-11 22:01:17.219[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mClass Distribution for train_df:[0m
[32m2024-07-11 22:01:17.220[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mlabel
paper        23.50
glass        19.84
plastic      19.05
metal        16.23
cardboard    15.93
trash         5.44
Name: proportion, dtype: float64[0m
[32m2024-07-11 22:01:17.221[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m19[0m - [1mClass Distribution for val_df:[0m
[32m2024-07-11 22:01:17.221[0m | [1mINFO