In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Read the spreadsheet, discard the 'name' column when it exists, and keep
# only rows without missing values.
data = (
    pd.read_excel('/content/2.xlsx')
    .drop(columns=['name'], errors='ignore')
    .dropna()
)

# Replace every object-typed column by integer codes; values are cast to str
# first so each column is encoded from a uniform type.
object_columns = data.select_dtypes(include=['object']).columns
for column in object_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column].astype(str))

# Split the frame into the feature part and the target vector.
target_name = 'steato score'
feature_frame = data.drop(target_name, axis=1)
feature_columns = feature_frame.columns
y = data[target_name].values

# Standardize the features.
# NOTE(review): the scaler is fitted on all rows *before* the train/test split
# below, so test-set statistics influence the scaling of the training rows.
scaler = StandardScaler()
X = scaler.fit_transform(feature_frame.values)

# 70/30 shuffled split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=True
)

# Export the held-out rows (scaled features + target).
test_data = pd.DataFrame(X_test, columns=feature_columns)
test_data[target_name] = y_test
test_data.to_excel('/content/test_data_2.xlsx', index=False)

# Export the complete scaled data set (features + target).
D_data = pd.DataFrame(X, columns=feature_columns)
D_data[target_name] = y
D_data.to_excel('/content/data_REAL_2.xlsx', index=False)

# Group the training rows by class label for the augmentation step.
classes = np.unique(y_train)
class_data = {label: X_train[y_train == label] for label in classes}

# Function to add Gaussian noise for each class
def augment_with_noise(class_data, target_size):
    """Pad under-represented classes with Gaussian-noised copies of their rows.

    For every class holding fewer than ``target_size`` rows, existing rows are
    drawn with replacement and perturbed with zero-mean Gaussian noise
    (standard deviation 0.2) until the class reaches ``target_size`` rows.
    Classes that already have ``target_size`` rows or more are passed through
    unchanged (they are not down-sampled).

    Randomness comes from NumPy's global generator (``np.random``); seed it
    beforehand for reproducible output.

    Args:
        class_data: Mapping of class label -> 2-D array of that class's rows
            (one row per sample, one column per feature).
        target_size: Minimum number of rows each class should have afterwards.

    Returns:
        A tuple ``(X, y)``: ``X`` stacks the per-class blocks in the iteration
        order of ``class_data`` (original rows first, synthetic rows after);
        ``y`` holds one label per row of ``X``.

    Raises:
        ValueError: If a class that needs padding contains no rows to sample
            from.
    """
    feature_blocks = []
    labels = []
    counts = []
    for cls, samples in class_data.items():
        current_size = len(samples)
        if current_size < target_size:
            if current_size == 0:
                raise ValueError(
                    f"class {cls!r} has no samples to draw synthetic rows from"
                )
            # Calculate how many samples to generate
            n_to_generate = target_size - current_size

            # Draw the noise before the row indices so that a seeded run
            # consumes the global RNG in the same order as before.
            noise = np.random.normal(0, 0.2, size=(n_to_generate, samples.shape[1]))
            picks = np.random.choice(current_size, n_to_generate, replace=True)

            # Original rows first, noisy copies after
            block = np.vstack((samples, samples[picks] + noise))
        else:
            block = samples
        feature_blocks.append(block)
        labels.append(cls)
        counts.append(len(block))

    # Labels come from class_data itself (not a module-level global) and are
    # repeated by the number of rows actually emitted per class, so X and y
    # stay aligned even when a class exceeds target_size.
    return np.vstack(feature_blocks), np.repeat(np.asarray(labels), counts)

# Pad every class that has fewer than `target_size` training rows with noisy
# copies of its own rows.
target_size = 300  # desired number of samples per class
X_train_augmented, y_train_augmented = augment_with_noise(class_data, target_size)

# Write the augmented features together with their labels to a spreadsheet.
feature_names = data.drop('steato score', axis=1).columns
augmented_train_data = pd.DataFrame(
    X_train_augmented, columns=feature_names
).assign(**{'steato score': y_train_augmented})
augmented_train_data.to_excel(
    '/content/augmented_train_data_noise.xlsx',
    index=False,
)

print("Augmented training data generation using Gaussian Noise complete. File saved.")


Augmented training data generation using Gaussian Noise complete. File saved.
