<a href="https://colab.research.google.com/github/atharvchothe/Machine-Learning/blob/main/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Toy dataset: two numeric columns (each with one missing value) and three
# two-valued categorical columns; 'Purchased' is the prediction target.
data = {
    'Age': [30, 15, 65, None, 23],
    'Salary': [35500, 32000, None, 45000, 78000],
    'Department': ['HR', 'IT', 'IT', 'HR', 'HR'],
    'Purchased': ['Yes', 'No', 'No', 'Yes', 'Yes'],
    'Gender': ['Female', 'Female', 'Male', 'Female', 'Male']
}

df = pd.DataFrame(data)
print("Before preprocessing dataset:\n", df)

# Column groups used by the steps below (named once instead of repeating
# the literal lists at every call site).
numeric_cols = ['Age', 'Salary']
categorical_cols = ['Department', 'Purchased', 'Gender']

# Handle missing values: replace each NaN with its column mean.
# NOTE(review): the imputer and the scaler further down are fitted on ALL
# rows before the train/test split, so statistics of the held-out row leak
# into the training features. For a real model, split first and fit these
# transformers on the training rows only.
imputer = SimpleImputer(strategy='mean')
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Label-encode every categorical column with its OWN encoder. Refitting a
# single shared encoder would discard the mapping of every column except
# the last one, making it impossible to inverse_transform 'Department' or
# 'Purchased' later. The fitted encoders are kept in `label_encoders`,
# keyed by column name; `label_encoder` ends up bound to the last one
# ('Gender'), exactly as a shared, refitted encoder would.
label_encoders = {}
for col in categorical_cols:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

# Standardize the numeric columns (zero mean, unit variance per column).
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

print("\nAfter preprocessing (only numerical data):\n", df)

# Separate the feature matrix from the target column.
X = df.drop('Purchased', axis=1)
y = df['Purchased']

# With 5 rows, test_size=0.2 holds out a single row; random_state fixes
# which one so the printed output is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nX_train:\n", X_train)
print("\ny_train:\n", y_train)


Before preprocessing dataset:
     Age   Salary Department Purchased  Gender
0  30.0  35500.0         HR       Yes  Female
1  15.0  32000.0         IT        No  Female
2  65.0      NaN         IT        No    Male
3   NaN  45000.0         HR       Yes  Female
4  23.0  78000.0         HR       Yes    Male

After preprocessing (only numerical data):
         Age    Salary  Department  Purchased  Gender
0 -0.190404 -0.746048           0          1       0
1 -1.069191 -0.961402           1          0       0
2  1.860100  0.000000           1          0       1
3  0.000000 -0.161516           0          1       0
4 -0.600505  1.868965           0          1       1

X_train:
         Age    Salary  Department  Gender
4 -0.600505  1.868965           0       1
2  1.860100  0.000000           1       1
0 -0.190404 -0.746048           0       0
3  0.000000 -0.161516           0       0

y_train:
 4    1
2    0
0    1
3    1
Name: Purchased, dtype: int64
