In [1]:
import pandas as pd

In [3]:
df = pd.read_csv('D:\Programming\VidyutAiHackathon\data\Full-Data.csv')

In [4]:
df.head() 

Unnamed: 0,SoC,Temperature,Voltage,Label
0,100.0,298.15,4.0143,0
1,99.173138,298.849283,3.91682,0
2,98.346276,299.665201,3.887562,0
3,97.519413,300.497825,3.877287,0
4,96.692551,301.327592,3.870545,0


In [5]:
df.shape

(1152, 4)

In [7]:
import pandas as pd

# --- Configuration ---
# The name of your original, larger dataset file.
INPUT_CSV_PATH = 'D:\Programming\VidyutAiHackathon\data\Full-Data.csv'
# The name for the new, balanced dataset file that will be created.
OUTPUT_CSV_PATH = 'D:\Programming\VidyutAiHackathon\data\data2.csv'
# The column containing the labels (0, 1, 2).
LABEL_COLUMN = 'Label'
# The number of samples to select for each label.
SAMPLES_PER_LABEL = 150
# Random state for reproducibility of the random sampling.
RANDOM_STATE = 42

# --- Main Script ---
try:
    # 1. Load the original dataset from the CSV file.
    print(f"Loading original dataset from '{INPUT_CSV_PATH}'...")
    df = pd.read_csv(INPUT_CSV_PATH)
    print(f"Original dataset loaded successfully. Shape: {df.shape}")
    print("\nOriginal label distribution:")
    print(df[LABEL_COLUMN].value_counts())

    # 2. Group the data by the label and sample from each group.
    # - It will randomly select SAMPLES_PER_LABEL from each group.
    # - If a group has fewer samples than needed, it will sample with replacement.
    print(f"\nSampling {SAMPLES_PER_LABEL} rows for each label...")
    balanced_df = df.groupby(LABEL_COLUMN, group_keys=False).apply(
        lambda x: x.sample(n=SAMPLES_PER_LABEL, random_state=RANDOM_STATE, replace=True)
    )

    # 3. Shuffle the resulting DataFrame to mix the labels.
    # This ensures that the data is not sorted by label, which is good for training.
    print("Shuffling the new dataset...")
    balanced_df = balanced_df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

    # 4. Save the new, balanced DataFrame to a new CSV file.
    print(f"Saving the balanced dataset to '{OUTPUT_CSV_PATH}'...")
    balanced_df.to_csv(OUTPUT_CSV_PATH, index=False)

    print("\nNew balanced dataset created successfully!")
    print(f"Final shape: {balanced_df.shape}")
    print("\nNew label distribution:")
    print(balanced_df[LABEL_COLUMN].value_counts())

except FileNotFoundError:
    print(f"ERROR: The file '{INPUT_CSV_PATH}' was not found.")
    print("Please make sure the CSV file is in the same directory as this script.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


Loading original dataset from 'D:\Programming\VidyutAiHackathon\data\Full-Data.csv'...
Original dataset loaded successfully. Shape: (1152, 4)

Original label distribution:
Label
0    424
1    364
2    364
Name: count, dtype: int64

Sampling 150 rows for each label...
Shuffling the new dataset...
Saving the balanced dataset to 'D:\Programming\VidyutAiHackathon\data\data2.csv'...

New balanced dataset created successfully!
Final shape: (450, 4)

New label distribution:
Label
2    150
0    150
1    150
Name: count, dtype: int64


  balanced_df = df.groupby(LABEL_COLUMN, group_keys=False).apply(
