In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("Datasets/pulsar_data_test.csv")

In [3]:
df.shape

(5370, 9)

In [4]:
# Display the percentage of missing values in each column
missing_percentage = df.isnull().mean() * 100
missing_percentage

 Mean of the integrated profile                    0.000000
 Standard deviation of the integrated profile      0.000000
 Excess kurtosis of the integrated profile        14.283054
 Skewness of the integrated profile                0.000000
 Mean of the DM-SNR curve                          0.000000
 Standard deviation of the DM-SNR curve            9.757914
 Excess kurtosis of the DM-SNR curve               0.000000
 Skewness of the DM-SNR curve                      4.543762
target_class                                     100.000000
dtype: float64

In [5]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Separating features and target variable
X = df.drop("target_class", axis=1)
y = df["target_class"]

# Initialize the imputer
df_imputed = IterativeImputer()
# Fit and transform the data
df_imputed = pd.DataFrame(df_imputed.fit_transform(X), columns=X.columns)

In [6]:
from scipy.stats.mstats import winsorize

# Winsorizing each feature separately at 1% and 98% percentiles
for column in df_imputed.columns[:-1]:  # Exclude the target_class
    df_imputed[column] = winsorize(df_imputed[column], limits=[0.01, 0.02])

# Check the first few rows to ensure the transformation was applied
df_imputed.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve
0,116.90625,48.920605,0.186046,-0.129815,3.037625,17.737102,8.122621,78.813405
1,75.585938,34.386254,2.025498,8.652913,3.76505,21.897049,7.048189,55.878791
2,103.273438,46.996628,0.504295,0.821088,2.244983,15.622566,9.330498,105.134941
3,101.078125,48.587487,1.011427,1.15187,81.88796,81.464136,0.485105,-1.117904
4,113.226562,48.608804,0.291538,0.29212,6.291806,26.585056,4.540138,21.708268


In [7]:
from sklearn.preprocessing import StandardScaler

# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_imputed)

# Creating a new DataFrame with the standardized features
df_imputed_scaled = pd.DataFrame(X_scaled, columns=X.columns)


In [8]:
# Features to transform due to high skewness
features_to_transform = [
    ' Excess kurtosis of the integrated profile',
    ' Skewness of the integrated profile',
    ' Mean of the DM-SNR curve',
    ' Standard deviation of the DM-SNR curve',
    ' Skewness of the DM-SNR curve'
]

# Applying square root transformation to selected features
for feature in features_to_transform:
    df_imputed_scaled[feature] = df_imputed_scaled[feature].apply(lambda x: np.sqrt(np.abs(x)))

# Recalculating skewness after transformation
new_skewness = df_imputed_scaled[features_to_transform].skew()

# Displaying the new skewness values
new_skewness

Excess kurtosis of the integrated profile    2.087595
Skewness of the integrated profile           3.218994
Mean of the DM-SNR curve                     2.873191
Standard deviation of the DM-SNR curve       0.982696
Skewness of the DM-SNR curve                 1.181343
dtype: float64

In [9]:
# Adding the target variable back to the DataFrame
df_imputed_scaled['target_class'] = y.values

# saving the processed file
df_imputed_scaled.to_csv("../Datasets/Processed_data/pulsar_data_test_processed.csv", index=False)