In [10]:
import os
import sys
sys.path.append('../mlai_research/')
import log
import utils
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [11]:
# Module-level logger obtained from the project-local `log` helper
# (importable because '../mlai_research/' was appended to sys.path above).
logger = log.get_logger(__name__)

In [2]:
# Load the extracted feature table; the path is relative to the notebook's
# working directory.
df = pd.read_csv('../data/04_features/features.csv')

In [3]:
# Sanity-check the loaded table's dimensions as (rows, columns).
df.shape

(44, 1582)

In [4]:
def encode_labels(df, column):
    """
    Return a copy of the dataframe with the given column integer-encoded.

    The input dataframe is left untouched: the encoded values are written to
    a copy, so the raw frame stays valid for any other cell or caller that
    still holds a reference to it.

    Parameters:
    df (pandas.DataFrame): The dataframe containing the labels.
    column (str): The column name of the labels.

    Returns:
    df (pandas.DataFrame): A new dataframe whose `column` holds the codes
        produced by scikit-learn's LabelEncoder; every other column is
        copied unchanged.

    Raises:
    KeyError: If `column` is not present in the dataframe.
    """
    label_encoder = preprocessing.LabelEncoder()
    # Work on a copy so the caller's frame is not mutated as a side effect.
    encoded = df.copy()
    encoded[column] = label_encoder.fit_transform(df[column])
    # NOTE: the fitted encoder is local and discarded here, so the
    # code -> class-name mapping is not available to callers afterwards.
    return encoded


def split_data(df, test_size=0.1, val_size=0.1, random_state=42, stratify_column=None):
    """
    Split the dataframe into training, validation, and testing sets.

    The split is done in two stages: the test set is carved out of the full
    dataframe first, then the validation set is carved out of what remains.
    `val_size` is therefore relative to the remaining rows, not to the
    original dataframe.

    Parameters:
    df (pandas.DataFrame): The dataframe to split.
    test_size (float): The proportion of the dataset to include in the test split.
    val_size (float): The proportion of the training set to include in the validation split.
    random_state (int): Seed passed to both splits. Defaults to 42, which
        reproduces the previously hard-coded behaviour.
    stratify_column (str or None): When given, both splits are stratified on
        this column so class proportions are preserved in every subset.
        Defaults to None (plain shuffled split, the original behaviour).

    Returns:
    train_set (pandas.DataFrame): The training set.
    val_set (pandas.DataFrame): The validation set.
    test_set (pandas.DataFrame): The testing set.

    Raises:
    KeyError: If `stratify_column` is given but not present in the dataframe.
    ValueError: Propagated from scikit-learn, e.g. when stratification is
        requested and a class has too few members to be split.
    """
    def _strata(frame):
        # The stratification labels must come from the frame being split,
        # since the second stage only sees the remaining training rows.
        return None if stratify_column is None else frame[stratify_column]

    train_set, test_set = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=_strata(df),
    )
    train_set, val_set = train_test_split(
        train_set,
        test_size=val_size,
        random_state=random_state,
        stratify=_strata(train_set),
    )
    return train_set, val_set, test_set


def upsample_minorities(df, label_column):
    """
    Upsample the minority classes in the dataframe to match the majority class count.

    Every original row is kept. For each class that has fewer rows than the
    most frequent class, only the shortfall is drawn (with replacement, fixed
    seed) from that class's own rows and appended. Classes that already have
    as many rows as the majority class are passed through unchanged.

    Parameters:
    df (pandas.DataFrame): The dataframe to upsample.
    label_column (str): The column name of the labels.

    Returns:
    df_upsampled (pandas.DataFrame): The majority-class rows followed by each
        other class (original rows, then any resampled extras). The original
        index labels are retained, so duplicated rows share an index label.
        An empty copy is returned when there are no labelled rows.

    Raises:
    KeyError: If `label_column` is not present in the dataframe.
    """
    labels = df[label_column]

    # Compute the class frequencies once; everything below derives from them.
    class_counts = labels.value_counts()
    if class_counts.empty:
        # Nothing to balance (no rows, or no non-null labels).
        return df.copy()

    majority_label = class_counts.idxmax()
    target_count = int(class_counts.max())

    # The majority class is always kept as-is and comes first in the output.
    pieces = [df[labels == majority_label]]

    # Preserve first-appearance order of the remaining classes.
    for class_label in labels[labels != majority_label].unique():
        class_rows = df[labels == class_label]
        pieces.append(class_rows)

        # Only draw what is missing: bootstrapping the whole class would drop
        # some original rows, and a class tied with the majority needs nothing.
        shortfall = target_count - len(class_rows)
        if shortfall > 0:
            extra_rows = resample(class_rows,
                                  replace=True,
                                  n_samples=shortfall,
                                  random_state=42)
            pieces.append(extra_rows)

    df_upsampled = pd.concat(pieces)

    return df_upsampled

In [5]:
# Peek at the first rows to check the label column and the feature columns.
df.head()

Unnamed: 0,label,f_0_rgb,f_1_rgb,f_2_rgb,f_3_rgb,f_4_rgb,f_5_rgb,f_6_rgb,f_7_rgb,f_8_rgb,...,f_168_hyps,f_169_hyps,f_170_hyps,f_171_hyps,f_172_hyps,f_173_hyps,f_0_chm,f_1_chm,f_2_chm,f_3_chm
0,Xanthium,0.532223,0.032648,0.02206,0.017795,0.017353,0.03353,0.079855,0.104268,0.111768,...,82.63839,81.231659,79.121857,74.907944,67.853271,48.743698,1.699564,252.0,0.0,252.0
1,Xanthium,0.532223,0.032648,0.02206,0.017795,0.017353,0.03353,0.079855,0.104268,0.111768,...,75.123596,73.695869,71.759537,67.005653,61.071106,48.896381,0.0,0.0,0.0,0.0
2,Xanthium,0.577344,0.0,0.0,0.000852,0.000775,0.00062,0.001007,0.001549,0.00062,...,59.974102,55.185238,49.004799,31.488201,0.0,0.0,0.866561,6.0,0.0,6.0
3,Xanthium,0.577344,0.0,0.0,0.000852,0.000775,0.00062,0.001007,0.001549,0.00062,...,0.0,0.0,0.0,0.0,0.0,0.0,0.039635,6.0,0.0,6.0
4,Xanthium,0.45396,0.0,0.000145,0.000436,0.001164,0.002473,0.004364,0.009891,0.080145,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Encode the string labels as integers, then split into train/val/test.
df = encode_labels(df, 'label')
# `_train` is the training split before balancing.
_train, val, test = split_data(df)
# Only the training split is upsampled; `val` and `test` keep the class
# distribution they received from the split.
train = upsample_minorities(_train, 'label')

In [7]:
# Verify that every class has the same number of rows after upsampling.
train['label'].value_counts()

label
2    16
0    16
1    16
Name: count, dtype: int64

In [9]:
# Row count should equal (majority class count) x (number of classes);
# the column count is unchanged by upsampling.
train.shape

(48, 1582)

In [None]:
def main():
    """
    Run the full preparation pipeline and write the splits to disk.

    Loads the "base" configuration, reads the feature CSV it points to,
    encodes the 'label' column, splits the rows into train/val/test,
    upsamples the training split, and saves each split as an .npz archive
    with arrays `X` (all columns except 'label') and `y` (the 'label'
    column). Input and output locations are built by concatenating the
    configured path and file name, so the configured paths are expected to
    end with a path separator.
    """
    conf = utils.load_config("base")

    features = pd.read_csv(f"{conf.data.path_feat}{conf.data.fn_feat}")
    features = encode_labels(features, 'label')

    train, val, test = split_data(features)
    # Balance the training split only.
    train = upsample_minorities(train, 'label')

    def to_arrays(split):
        # Separate the feature matrix from the label vector.
        return split.drop('label', axis=1).values, split['label'].values

    # Build every array before writing anything, then save in
    # train -> val -> test order.
    arrays = [to_arrays(split) for split in (train, val, test)]
    name_attrs = ("fn_train", "fn_val", "fn_test")

    for name_attr, (X, y) in zip(name_attrs, arrays):
        np.savez(f"{conf.data.path_mi}{getattr(conf.data, name_attr)}", X=X, y=y)