<a href="https://colab.research.google.com/github/amitverma7426/DataScience/blob/main/DataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns


In [20]:
# Set random seed for reproducibility
np.random.seed(42)

# Create a DataFrame
df = pd.DataFrame({
    'Feature1': np.random.normal(50, 5, 100),  # Normally distributed data
    'Feature2': np.random.uniform(20, 80, 100),  # Uniformly distributed data
    'Category': np.random.choice(['A', 'B', 'C'], 100),  # Categorical data
    'Target': np.random.choice([0, 1], 100)  # Binary target variable
})

# Introduce missing values
missing_indices = np.random.choice(df.index, 10, replace=False)
df.loc[missing_indices, 'Feature1'] = np.nan

# Introduce outliers in 'Feature1'
outlier_indices = np.random.choice(df.index, 5, replace=False)
df.loc[outlier_indices, 'Feature1'] = df['Feature1'] * 3

# Display the first few rows of the dataset
print(df.head())


     Feature1   Feature2 Category  Target
0   52.483571  45.044660        B       0
1  147.926035  33.326469        A       1
2   53.238443  27.191922        B       1
3   57.615149  40.256910        A       0
4         NaN  76.574582        C       1


In [21]:
# Impute missing values in 'Feature1' with the mean
df['Feature1'].fillna(df['Feature1'].mean(), inplace=True)

# Check for any remaining missing values
print(df.isnull().sum())


Feature1    0
Feature2    0
Category    0
Target      0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Feature1'].fillna(df['Feature1'].mean(), inplace=True)


In [22]:
# Calculate Z-scores
z_scores = stats.zscore(df[['Feature1', 'Feature2']])

# Convert to DataFrame for easier handling
z_scores_df = pd.DataFrame(z_scores, columns=['Feature1_z', 'Feature2_z'])

# Identify outliers
outliers = (np.abs(z_scores_df) > 3).any(axis=1)

# Remove outliers
df_cleaned = df[~outliers]

# Display the number of removed outliers
print(f"Removed {outliers.sum()} outliers")


Removed 0 outliers


In [23]:
# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the numerical features
df_cleaned[['Feature1', 'Feature2']] = scaler.fit_transform(df_cleaned[['Feature1', 'Feature2']])

# Display the first few rows of the scaled data
print(df_cleaned.head())


       Feature1  Feature2 Category  Target
0 -7.948336e-02 -0.239956        B       0
1  4.508117e+00 -0.921168        A       1
2 -4.319919e-02 -1.277786        B       1
3  1.671745e-01 -0.518281        A       0
4 -3.415342e-16  1.592969        C       1


In [24]:
# Perform one-hot encoding
df_encoded = pd.get_dummies(df_cleaned, columns=['Category'])

# Display the first few rows of the encoded data
print(df_encoded.head())


       Feature1  Feature2  Target  Category_A  Category_B  Category_C
0 -7.948336e-02 -0.239956       0       False        True       False
1  4.508117e+00 -0.921168       1        True       False       False
2 -4.319919e-02 -1.277786       1       False        True       False
3  1.671745e-01 -0.518281       0        True       False       False
4 -3.415342e-16  1.592969       1       False       False        True


In [25]:
# Save the preprocessed DataFrame to a CSV file
df_encoded.to_csv('preprocessed_dummy_data.csv', index=False)

print('Preprocessing complete. Preprocessed data saved as preprocessed_dummy_data.csv')


Preprocessing complete. Preprocessed data saved as preprocessed_dummy_data.csv


In [26]:
# Check for missing values
print(df_encoded.isnull().sum())

# Verify the scaling
print(df_encoded.describe())

# Check the encoding
print(df_encoded.head())


Feature1      0
Feature2      0
Target        0
Category_A    0
Category_B    0
Category_C    0
dtype: int64
           Feature1      Feature2      Target
count  1.000000e+02  1.000000e+02  100.000000
mean  -2.420286e-16 -1.898481e-16    0.540000
std    1.005038e+00  1.005038e+00    0.500908
min   -8.284723e-01 -1.678219e+00    0.000000
25%   -3.241083e-01 -8.542966e-01    0.000000
50%   -1.738656e-01  7.387473e-02    1.000000
75%   -3.415342e-16  7.271373e-01    1.000000
max    5.943298e+00  1.742047e+00    1.000000
       Feature1  Feature2  Target  Category_A  Category_B  Category_C
0 -7.948336e-02 -0.239956       0       False        True       False
1  4.508117e+00 -0.921168       1        True       False       False
2 -4.319919e-02 -1.277786       1       False        True       False
3  1.671745e-01 -0.518281       0        True       False       False
4 -3.415342e-16  1.592969       1       False       False        True
