# Handling Imbalanced Pulsar Dataset and Preparing Training Data

## 1. Load the Dataset

In [None]:
import pandas as pd

# Read the raw training CSV from the working directory into `df`.
raw_csv_path = "pulsar_data_train.csv"
df = pd.read_csv(raw_csv_path)

# Trailing expression: (rows, columns) of the freshly loaded frame.
df.shape

(12528, 9)

## 2. Separate Class 0 and Class 1

In [2]:
# Pull the label column once, then select the rows for each label value
# with a boolean mask. Rows whose label is neither 0 nor 1 (if any) end
# up in neither subset.
labels = df['target_class']
class_0 = df.loc[labels == 0]
class_1 = df.loc[labels == 1]

# Comparing the two shapes shows how unevenly the labels are distributed.
class_0.shape, class_1.shape

((11375, 9), (1153, 9))

## 3. Random Sampling from Each Class

In [3]:
# Undersample each class to a fixed row count so the combined set is close
# to balanced. Sampling is without replacement (pandas default), so each
# count must not exceed the number of rows available in that class.
rows_from_class_0 = 1100
rows_from_class_1 = 1000
sampling_seed = 42  # fixed seed -> the same rows are drawn on every run

sample_0 = class_0.sample(n=rows_from_class_0, random_state=sampling_seed)
sample_1 = class_1.sample(n=rows_from_class_1, random_state=sampling_seed)

sample_0.shape, sample_1.shape

((1100, 9), (1000, 9))

## 4. Combine the Sampled Data

In [4]:
# Stack the two per-class samples on top of each other (row-wise).
# The original row labels are carried over unchanged at this point, and
# the rows are still grouped by class: all of sample_0, then all of sample_1.
sampled_parts = [sample_0, sample_1]
combined_df = pd.concat(sampled_parts, axis="index")
combined_df.shape

(2100, 9)

## 5. Shuffle the Combined Dataset

In [5]:
# Sampling 100% of the rows without replacement yields a random permutation;
# the fixed seed makes that permutation reproducible.
permuted_rows = combined_df.sample(frac=1, random_state=42)

# Throw away the inherited row labels so the index is a clean 0..n-1 range.
shuffled_df = permuted_rows.reset_index(drop=True)
shuffled_df.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,110.921875,46.868768,0.182154,0.092144,2.306856,18.209125,9.189732,93.543984,0.0
1,63.609375,33.49469,2.495789,11.79733,16.276756,52.562015,3.2548,9.28498,1.0
2,121.570312,54.283786,0.195833,-0.176091,1.454013,14.158635,12.302674,172.591159,0.0
3,31.757812,33.531409,5.142497,28.034495,29.35786,,2.163057,4.541288,1.0
4,100.789062,41.813509,0.43084,1.117147,3.063545,18.982107,8.487104,83.969008,0.0


## 6. Check Final Class Distribution

In [6]:
# Count rows per label to confirm the sampled class sizes survived the
# concat + shuffle steps.
final_labels = shuffled_df['target_class']
final_labels.value_counts()

target_class
0.0    1100
1.0    1000
Name: count, dtype: int64

## 7. Split Features and Target (For Training)

In [7]:
# Separate the label column (y) from the remaining feature columns (X).
target_column = 'target_class'
y = shuffled_df[target_column]
X = shuffled_df.drop(columns=[target_column])

# NOTE(review): the head() preview above shows a missing value in
# "Standard deviation of the DM-SNR curve", so X still contains NaNs here.
# Impute or drop them before fitting an estimator that rejects NaN.

# X should have one column fewer than shuffled_df; y is one-dimensional.
X.shape, y.shape

((2100, 8), (2100,))

## 8. Save the Final Training Dataset

In [None]:
# Write the shuffled, undersampled frame (features + target) to disk.
# index=False keeps the 0..n-1 row index out of the file.
# An existing file with the same name is overwritten.
balanced_csv_path = "pulsar_balanced_training_data.csv"
shuffled_df.to_csv(balanced_csv_path, index=False)

## 9. Train-Test Split

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
#   stratify=y   -> both partitions keep the label proportions of y
#   random_state -> the same partition is produced on every run
# NOTE(review): the held-out rows come from the undersampled data, so test
# metrics describe the resampled class mix, not the original one in df.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

X_train.shape, X_test.shape

((1680, 8), (420, 8))