In [None]:
# TASK 1 - DATA PIPELINE (ETL)
# CODTECH Internship - Patel Jiii (CT08DR2597)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import os
import sys

sys.path.append("../src")
from utils import ensure_dir

# Directory that receives the CSV written by this cell. `ensure_dir` is
# imported from ../src/utils; presumably it creates the directory when it is
# missing -- TODO confirm against utils.
DATA_DIR = "../data"
ensure_dir(DATA_DIR)

# --------------------------
# CREATE SAMPLE DATASET
# --------------------------
# Eight-row demo frame. "age", "salary" and "department" each contain at
# least one missing value (np.nan); "target" is a complete 0/1 label column.
df = pd.DataFrame({
    "age": [25, 30, np.nan, 45, 29, 34, 41, np.nan],
    "salary": [50000, 60000, 58000, 75000, 52000, np.nan, 90000, 62000],
    "department": ["sales", "hr", "hr", "eng", "sales", "eng", np.nan, "sales"],
    "target": [0, 1, 0, 1, 0, 1, 1, 0],
})

# Build the output path from DATA_DIR so the file always lands in the
# directory that was just ensured, instead of a second hard-coded literal
# that could drift out of sync if DATA_DIR is edited.
SAMPLE_CSV = os.path.join(DATA_DIR, "sample_tabular.csv")
df.to_csv(SAMPLE_CSV, index=False)
df


In [None]:
# Split settings, named so they can be tuned in one place.
TEST_SIZE = 0.2
SEED = 42

# Reload the sample dataset from disk; the path is derived from DATA_DIR
# rather than repeated as a literal so it follows any change to DATA_DIR.
df = pd.read_csv(os.path.join(DATA_DIR, "sample_tabular.csv"))

num_cols = ["age", "salary"]
cat_cols = ["department"]

# Numeric columns: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform() from raising
# on a category that was not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Route each column group to its own sub-pipeline.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
])

X = df.drop("target", axis=1)
y = df["target"]

# Stratify on the label so both classes are represented in the train and test
# splits; on a very small dataset an unstratified hold-out can easily end up
# containing a single class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

# Fit the preprocessing on the training rows only, then reuse the fitted
# transformer on the test rows so no test statistics leak into the fit.
X_train_p = preprocessor.fit_transform(X_train)
X_test_p = preprocessor.transform(X_test)

X_train_p[:5]


In [None]:
# Output locations, derived from DATA_DIR so they stay in sync with the
# directory configured at the top of the notebook.
train_path = os.path.join(DATA_DIR, "train.csv")
test_path = os.path.join(DATA_DIR, "test.csv")


def _to_labeled_frame(features, labels):
    """Return `features` as a DataFrame with `labels` appended as "target".

    `features` is converted with ``.toarray()`` when it exposes that method
    (e.g. a SciPy sparse matrix); otherwise it is used as-is. `labels` must
    expose ``.values`` (a pandas Series here). Feature columns keep pandas'
    default positional integer names.
    """
    dense = features.toarray() if hasattr(features, "toarray") else features
    frame = pd.DataFrame(dense)
    # .values drops the Series index so labels are assigned by position,
    # matching the row order of the transformed feature matrix.
    frame["target"] = labels.values
    return frame


# NOTE(review): the CSV headers are positional integers, not feature names;
# consider naming them via the preprocessor if the installed scikit-learn
# supports it.
train_df = _to_labeled_frame(X_train_p, y_train)
train_df.to_csv(train_path, index=False)

test_df = _to_labeled_frame(X_test_p, y_test)
test_df.to_csv(test_path, index=False)

train_df.head()
