### Practice Exercise (Data Preprocessing)

In [None]:
# Set up the import path so the config module (and the dataset paths it defines) can be loaded from the parent directory
from pathlib import Path
import sys

# Resolve the directory the notebook is running from.
CURRENT_DIR = Path.cwd()

# The project root is taken to be the parent of the working directory;
# the `config` module imported next is expected to live there.
project_root = CURRENT_DIR.parent

# Put the project root at the front of the import search path. The guard keeps
# repeated runs of this cell from stacking duplicate entries onto sys.path.
if sys.path[:1] != [str(project_root)]:
    sys.path.insert(0, str(project_root))

# Import dataset paths from config file
from config import DATASET_DIR, MOBILE_CUSTOMERS_DATA

In [None]:
# Import data processing and ML preprocessing tools
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

In [None]:
# Read the mobile customers CSV into a DataFrame, then report its
# (rows, columns) shape and the count of missing entries in each column
df = pd.read_csv(MOBILE_CUSTOMERS_DATA)
print(df.shape)
df.isna().sum()

(20, 9)


CustomerID           0
Age                  3
Gender               2
Income               2
CreditScore          2
PreviousPurchases    2
InternetUsage        1
Location             1
PremiumPurchase      0
dtype: int64

In [None]:
# Build the feature matrix X and the target vector y (PremiumPurchase).
# Features are selected by name, in a fixed order, because the imputation,
# encoding and scaling steps that follow address the columns of X by position:
#   0 Age, 1 Gender, 2 Income, 3 CreditScore,
#   4 PreviousPurchases, 5 InternetUsage, 6 Location
# Selecting explicitly (instead of dropping with errors='ignore') raises a
# KeyError when a column is missing rather than silently shifting those indices.
FEATURE_COLUMNS = ['Age', 'Gender', 'Income', 'CreditScore',
                   'PreviousPurchases', 'InternetUsage', 'Location']
X = df[FEATURE_COLUMNS].values
y = df['PremiumPurchase'].values

In [None]:
# Group the feature names by kind so each group can be processed appropriately:
# text-valued (categorical) columns versus number-valued columns
categorial_data = [
    'Gender',
    'InternetUsage',
    'Location',
]
integerial_data = [
    'Age',
    'Income',
    'CreditScore',
    'PreviousPurchases',
]
# Preview the first row of the dataset
df.head(1)

Unnamed: 0,CustomerID,Age,Gender,Income,CreditScore,PreviousPurchases,InternetUsage,Location,PremiumPurchase
0,C001,25.0,Male,35000.0,680.0,2.0,High,Urban,No


In [None]:
# Fill missing values in X, addressing columns by position. Assumed layout
# (dataset column order with CustomerID and PremiumPurchase removed):
#   0 Age, 1 Gender, 2 Income, 3 CreditScore,
#   4 PreviousPurchases, 5 InternetUsage, 6 Location
# Numerical columns get the column mean; categorical columns get the most
# frequent value. PreviousPurchases (index 4) is numerical, so it belongs with
# the mean-imputed group.
numeric_idx = [0, 2, 3, 4]
categorical_idx = [1, 5, 6]

impute_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, numeric_idx] = impute_mean.fit_transform(X[:, numeric_idx])

impute_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X[:, categorical_idx] = impute_frequent.fit_transform(X[:, categorical_idx])

# NOTE(review): both imputers are fitted on every row, before the train/test
# split, so held-out rows contribute to the fill values. Consider fitting on
# the training rows only if this is used for real model evaluation.

In [None]:
# One-hot encode the categorical feature columns (positions 1, 5 and 6) and
# pass the remaining columns through unchanged. In the transformed matrix the
# one-hot columns come first and the passthrough columns are appended after them.
# sparse_threshold=0 makes the transformer always return a dense array; without
# it a sparse matrix may be returned when the output is mostly zeros, and
# np.array() would not turn that into a regular 2-D array.
ct = ColumnTransformer(
    transformers=[('encode', OneHotEncoder(), [1, 5, 6])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

# Map the target labels to integer class codes
lb = LabelEncoder()
y = lb.fit_transform(y)

In [None]:
# Hold out 20% of the rows for evaluation and keep the other 80% for training;
# the fixed random_state makes the shuffle reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=47,
)

In [None]:
# Standardize the four trailing columns (the numerical features) so that
# large-valued features do not dominate the model.
# The scaler learns its statistics from the training rows only; the same
# fitted scaler is then applied to the test rows.
sc = StandardScaler()
sc.fit(X_train[:, -4:])

X_train[:, -4:] = sc.transform(X_train[:, -4:])
X_test[:, -4:] = sc.transform(X_test[:, -4:])