# 2. Perform Data Exploration and Preprocessing in Python


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# --- STEP 1: LOAD DATA (Creating a dummy dataset for the lab) ---
# Small hand-made dataset. 'Age' and 'Salary' each contain one missing value
# (np.nan) so that the cleaning step below has something to repair.
data = {
    'Country': ['India', 'USA', 'India', 'USA', 'UK', 'India'],
    'Age': [22, 25, np.nan, 30, 28, 35],
    'Salary': [40000, 60000, 50000, np.nan, 72000, 58000],
    'Purchased': ['No', 'Yes', 'Yes', 'No', 'Yes', 'Yes'],
}
df = pd.DataFrame(data)
print("--- ORIGINAL RAW DATA ---")
print(df)
print("\n")

# --- STEP 2: INSPECT DATA ---
# Count the missing (NaN) entries in every column.
print("--- MISSING VALUES COUNT ---")
print(df.isnull().sum())
print("\n")

# --- STEP 3: CLEAN DATA (Handling Missing Values) ---
# Logic: replace each missing Age/Salary with the mean of that column.
# Series.mean() skips NaN, so the mean is taken over the known values only.
for column in ('Age', 'Salary'):
    df[column] = df[column].fillna(df[column].mean())
print("--- DATA AFTER CLEANING ---")
print(df)
print("\n")

# --- STEP 4: CONVERT TEXT TO NUMBERS (Encoding) ---
# Logic: models need numeric input, so the text column 'Country' is one-hot
# encoded into one 0/1 indicator column per country (Country_India, ...).
# dtype=int is passed explicitly: pandas >= 2.0 produces boolean True/False
# columns by default, which would contradict the "All Numbers" output below.
df_encoded = pd.get_dummies(df, columns=['Country'], dtype=int)
# The target column 'Purchased' only has two labels, so map them by hand.
# NOTE: any label other than 'Yes'/'No' would become NaN under .map().
df_encoded['Purchased'] = df_encoded['Purchased'].map({'Yes': 1, 'No': 0})
print("--- DATA AFTER ENCODING (All Numbers Now) ---")
print(df_encoded)
print("\n")
# --- STEP 5: SCALE FEATURES ---
# Logic: in this dataset Age runs from 22 to 35 while Salary runs from 40000
# to 72000, so the raw Salary values are far larger than the Age values.
# Scaling puts every feature column on a comparable scale.
# Target (y) is the 'Purchased' column; Features (X) are all remaining columns.
y = df_encoded['Purchased']
X = df_encoded.drop(columns='Purchased')
# Learn the scaling parameters from X, then apply them to X.
# Note that every feature column is scaled, including the one-hot
# Country_* indicator columns.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
print("--- FINAL PROCESSED DATA (Ready for AI) ---")
# X_scaled is a plain array; wrap it in a DataFrame to restore column names.
scaled_preview = pd.DataFrame(X_scaled, columns=X.columns)
print(scaled_preview.head())

--- ORIGINAL RAW DATA ---
  Country   Age   Salary Purchased
0   India  22.0  40000.0        No
1     USA  25.0  60000.0       Yes
2   India   NaN  50000.0       Yes
3     USA  30.0      NaN        No
4      UK  28.0  72000.0       Yes
5   India  35.0  58000.0       Yes


--- MISSING VALUES COUNT ---
Country      0
Age          1
Salary       1
Purchased    0
dtype: int64


--- DATA AFTER CLEANING ---
  Country   Age   Salary Purchased
0   India  22.0  40000.0        No
1     USA  25.0  60000.0       Yes
2   India  28.0  50000.0       Yes
3     USA  30.0  56000.0        No
4      UK  28.0  72000.0       Yes
5   India  35.0  58000.0       Yes


--- DATA AFTER ENCODING (All Numbers Now) ---
    Age   Salary  Purchased  Country_India  Country_UK  Country_USA
0  22.0  40000.0          0           True       False        False
1  25.0  60000.0          1          False       False         True
2  28.0  50000.0          1           True       False        False
3  30.0  56000.0          0   