In [3]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

In [4]:
# Load the BS1 table from the CSV in the sibling data directory into a DataFrame.
data_bs1 = pd.read_csv(filepath_or_buffer='../data/preprocessed_data_BS1.csv')

In [5]:
# Separate the 'Group' column (used as the target) from all remaining
# columns (used as the feature matrix).
y = data_bs1['Group']
X = data_bs1.drop(columns=['Group'])

In [6]:
# Preview the first five target values (five is head()'s default row count).
y.head(n=5)

0    B
1    B
2    B
3    B
4    B
Name: Group, dtype: object

In [7]:
# Learn the set of class names in `y`, then replace every class name with
# its integer code. The encoder is kept so the codes can be mapped back later.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [8]:
# Show which integer code the fitted encoder assigned to each class name.
unique_classes = label_encoder.classes_
unique_labels = label_encoder.transform(unique_classes)
class_to_label = dict(zip(unique_classes, unique_labels))

print("Unique Classes and Their Encoded Labels:")
for cls, label in class_to_label.items():
    print(f"{cls} -> {label}")

Unique Classes and Their Encoded Labels:
B -> 0
D -> 1
M -> 2
NK -> 3
T -> 4


In [9]:
# Report how many rows carry each encoded class label before resampling.
unique_values, value_counts = np.unique(y_encoded, return_counts=True)

for position, value in enumerate(unique_values):
    count = value_counts[position]
    print(f"{value}: {count}")

0: 1660
1: 142
2: 1661
3: 1394
4: 8326


In [10]:
# Balance the classes by synthesising new samples with SMOTE.
# random_state is fixed so the synthetic rows (and everything derived from
# them, including the exported CSV) are identical on every
# Restart & Run All; the value 1 matches the seed used by the
# train_test_split call later in this notebook.
#
# NOTE(review): resampling happens on the full dataset, before the
# train/test split performed further down. Synthetic points interpolated
# from rows that later land in the test split can therefore end up in the
# training split (and vice versa), which leaks information and tends to
# inflate evaluation scores. If this file is used for model evaluation,
# consider splitting first and applying SMOTE to the training part only.
oversample = SMOTE(random_state=1)
X_oversampled, y_oversampled = oversample.fit_resample(X, y_encoded)

In [11]:
# Translate the resampled integer codes back into the original class names
# using the same encoder that produced the codes.
y_resampled_original = label_encoder.inverse_transform(y=y_oversampled)

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the resampled rows as a test split. Stratifying on the
# class names keeps the class proportions equal in both parts, and the
# fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_oversampled,
    y_resampled_original,
    test_size=0.3,
    stratify=y_resampled_original,
    random_state=1,
)


In [13]:
# Report how many training rows carry each class name after the split.
unique_values, value_counts = np.unique(y_train, return_counts=True)

for position in range(unique_values.size):
    value, count = unique_values[position], value_counts[position]
    print(f"{value}: {count}")

B: 5828
D: 5828
M: 5829
NK: 5828
T: 5828


In [17]:
# Wrap each of the four split pieces in a DataFrame. The tuple of inputs is
# built before any name is rebound, so every piece is converted from its
# current value.
X_train, X_test, y_train, y_test = [
    pd.DataFrame(part) for part in (X_train, X_test, y_train, y_test)
]

In [19]:
# Build one feature+label table and write it to disk.
#
# The previous version concatenated the four pieces with axis=1, which
# aligns rows on the index. That placed the train and test features side
# by side in duplicated columns (padding with NaN) and attached labels to
# the wrong rows whenever the feature index and the label index differed.
#
# Here each split is assembled separately: the feature index is reset and
# the labels are flattened to a 1-D array so rows and labels are paired
# strictly by position, which is the pairing train_test_split returns.
# The two splits are then stacked row-wise, training rows first. The label
# column is named 'Group', matching the label column of the input table.
train_part = (
    pd.DataFrame(X_train)
    .reset_index(drop=True)
    .assign(Group=np.asarray(y_train).ravel())
)
test_part = (
    pd.DataFrame(X_test)
    .reset_index(drop=True)
    .assign(Group=np.asarray(y_test).ravel())
)

data = pd.concat([train_part, test_part], axis=0, ignore_index=True)

# Sanity checks: no row was lost or duplicated, and every row has a label.
assert len(data) == len(train_part) + len(test_part)
assert data['Group'].notna().all()

data.to_csv('../data/oversampled_BS1.csv', index=False)