In [12]:
import numpy as np
import pandas as pd
from typing import Tuple, List

# Notebook-wide settings read by the data-preparation cells below.
# NOTE(review): the dataset path has no separator after "data" - confirm it
# matches the file name on disk.
CONFIG = dict(
    # CSV file handed to the loader.
    DATASET_PATH='dataProductivity Prediction of Garment Employeese.csv',
    # Column treated as the prediction target.
    TARGET_COLUMN='actual_productivity',
    # Columns cleaned and imputed as categorical values.
    CATEGORICAL_COLS=['day', 'department', 'quarter'],
)


In [13]:
def load_dataset(filepath: str) -> pd.DataFrame:
    """Read the CSV at ``filepath`` into a DataFrame.

    Args:
        filepath: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The parsed DataFrame, using pandas' default parsing options.
    """
    return pd.read_csv(filepath)

# Load the raw dataset from the path configured in CONFIG.
df = load_dataset(filepath=CONFIG['DATASET_PATH'])


In [14]:
def remove_outliers(
    df: pd.DataFrame,
    column: str = 'over_time',
    lower: float = -6840.0,
    upper: float = 15240.0,
) -> pd.DataFrame:
    """Keep only the rows whose ``column`` value lies inside ``[lower, upper]``.

    Args:
        df: Input frame; it is not modified.
        column: Name of the column to bound. Defaults to ``'over_time'``.
        lower: Smallest value that is kept (inclusive).
        upper: Largest value that is kept (inclusive).

    Returns:
        The rows of ``df`` satisfying ``lower <= df[column] <= upper``.
        Rows where ``column`` is NaN fail the comparison and are dropped too.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    # NOTE(review): the default bounds look like precomputed 1.5*IQR fences for
    # `over_time`; confirm and recompute them if the dataset changes.
    # `between` is inclusive on both ends by default and yields False for NaN,
    # which matches the explicit `>=` / `<=` comparison it replaces.
    in_range = df[column].between(lower, upper)
    return df[in_range]

# Filter out rows outside the allowed over_time bounds.
df = df.pipe(remove_outliers)


In [15]:
def clean_categorical(df: pd.DataFrame, categorical_cols: List[str]) -> pd.DataFrame:
    """Normalise categorical text columns on a copy of ``df``.

    Each column named in ``categorical_cols`` is cast to ``str`` and has
    leading/trailing whitespace removed. Afterwards every ``'Quarter5'`` entry
    in the ``quarter`` column is relabelled ``'Quarter4'``.

    Args:
        df: Input frame; it is left untouched.
        categorical_cols: Names of the columns to cast and strip.

    Returns:
        A cleaned copy of ``df``.
    """
    cleaned = df.copy()

    for name in categorical_cols:
        as_text = cleaned[name].astype(str)
        cleaned[name] = as_text.str.strip()

    # The relabelling always targets `quarter`, whether or not it was stripped above.
    cleaned['quarter'] = cleaned['quarter'].replace('Quarter5', 'Quarter4')
    return cleaned

# Strip and normalise the configured categorical columns.
df = df.pipe(clean_categorical, CONFIG['CATEGORICAL_COLS'])


In [16]:
def convert_datetime(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose ``date`` column is parsed as datetimes.

    Values that cannot be parsed become ``NaT`` (``errors='coerce'``) instead
    of raising.
    """
    parsed_dates = pd.to_datetime(df['date'], errors='coerce')
    return df.assign(date=parsed_dates)

# Parse the date column into datetimes.
df = df.pipe(convert_datetime)


In [17]:
def impute_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values in every numeric column and return a new frame.

    ``wip`` is filled with its median; every other numeric column is filled
    with its mean. Non-numeric columns are left as they are.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A copy of ``df`` with the numeric columns imputed.
    """
    df = df.copy()

    numeric_cols = df.select_dtypes(include=np.number).columns

    for col in numeric_cols:
        fill_value = df[col].median() if col == 'wip' else df[col].mean()
        # Assign the filled column back rather than calling
        # `df[col].fillna(..., inplace=True)`: that form works on a temporary
        # Series, is deprecated in pandas 2.x, and leaves `df` unchanged under
        # Copy-on-Write.
        df[col] = df[col].fillna(fill_value)

    return df

# Fill missing numeric values (median for wip, mean elsewhere).
df = df.pipe(impute_numeric)


In [18]:
def impute_categorical(df: pd.DataFrame, categorical_cols: List[str]) -> pd.DataFrame:
    """Fill missing categorical values with each column's most frequent value.

    The literal string ``'nan'`` is treated as missing, alongside real NaN
    values.

    Args:
        df: Input frame; it is not modified.
        categorical_cols: Names of the columns to impute.

    Returns:
        A copy of ``df`` with the listed columns imputed. A column with no
        non-missing value has no mode and is left entirely missing.

    Raises:
        KeyError: If a name in ``categorical_cols`` is not a column of ``df``.
    """
    df = df.copy()

    for col in categorical_cols:
        values = df[col].replace('nan', np.nan)
        modes = values.mode()
        # `mode()` is empty when every value is missing; indexing it with [0]
        # would raise, so such a column is left as-is.
        if not modes.empty:
            values = values.fillna(modes.iloc[0])
        # Assign back instead of chained `fillna(..., inplace=True)`, which
        # acts on a temporary Series and is a no-op under Copy-on-Write.
        df[col] = values

    return df

# Fill missing categorical values with the per-column mode.
df = impute_categorical(df, categorical_cols=CONFIG['CATEGORICAL_COLS'])


In [19]:
def remove_missing_target(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    """Return only the rows of ``df`` whose ``target_col`` value is present.

    Args:
        df: Input frame; it is not modified.
        target_col: Name of the column that must be non-missing.

    Returns:
        The rows of ``df`` where ``target_col`` is not NaN/None, with their
        original index labels.
    """
    has_target = df[target_col].notna()
    return df.loc[has_target]

# Drop rows whose target value is missing.
# NOTE(review): impute_numeric already ran on `df` in an earlier cell; if the
# target column is numeric, its NaNs were mean-filled there, so this step may
# drop nothing - confirm the intended order.
df = df.pipe(remove_missing_target, CONFIG['TARGET_COLUMN'])


In [20]:
def engineer_features(df: pd.DataFrame, target_col: str = None) -> pd.DataFrame:
    """Add ratio and interaction columns to a copy of ``df``.

    New columns, in order: ``wip_per_worker``, ``over_time_per_worker``,
    ``idle_time_per_worker``, ``wip_x_workers``, ``incentive_per_worker`` and
    ``target_prod_ratio``.

    Args:
        df: Input frame; it is not modified. Must contain ``wip``,
            ``no_of_workers``, ``over_time``, ``idle_time``, ``idle_men``,
            ``incentive``, ``targeted_productivity`` and the target column.
        target_col: Name of the target column used for ``target_prod_ratio``.
            When omitted, ``CONFIG['TARGET_COLUMN']`` is used, as before.

    Returns:
        A copy of ``df`` with the six columns appended.

    Raises:
        KeyError: If a required column is missing.
    """
    if target_col is None:
        # Keep the historical behaviour for callers that pass only `df`.
        target_col = CONFIG['TARGET_COLUMN']

    df = df.copy()

    # A zero divisor is swapped for 1 so the ratio falls back to the raw value.
    workers = df['no_of_workers'].replace(0, 1)
    idle_men = df['idle_men'].replace(0, 1)

    df['wip_per_worker'] = df['wip'] / workers
    df['over_time_per_worker'] = df['over_time'] / workers
    # Despite the name, the divisor here is `idle_men`, not `no_of_workers`.
    df['idle_time_per_worker'] = df['idle_time'] / idle_men
    df['wip_x_workers'] = df['wip'] * df['no_of_workers']
    # These two guard against a zero divisor with a small epsilon instead.
    df['incentive_per_worker'] = df['incentive'] / (df['no_of_workers'] + 1e-6)
    # NOTE: this column is computed from the target itself, so it must not be
    # used as a model input when predicting `target_col` (target leakage).
    df['target_prod_ratio'] = df['targeted_productivity'] / (df[target_col] + 1e-6)

    return df

# Append the engineered ratio and interaction columns.
df = df.pipe(engineer_features)


In [21]:
def get_feature_columns(df: pd.DataFrame, target_col: str, exclude_cols: List[str] = None) -> List[str]:
    """List the columns of ``df`` that are usable as features.

    Args:
        df: Frame whose columns are inspected.
        target_col: Name of the target column; always excluded.
        exclude_cols: Additional column names to leave out. Defaults to
            ``['date']`` when omitted. The caller's list is not modified.

    Returns:
        Column names of ``df``, in their original order, minus ``target_col``
        and everything in ``exclude_cols``.
    """
    # Build a fresh list: appending to `exclude_cols` directly would leak the
    # target name into the caller's list and grow it on every call.
    excluded = ['date'] if exclude_cols is None else list(exclude_cols)
    excluded.append(target_col)

    return [col for col in df.columns if col not in excluded]

# `target_prod_ratio` is computed by engineer_features from the target column,
# so it is excluded here along with `date` to avoid leaking the target into
# the model inputs.
feature_cols = get_feature_columns(
    df,
    CONFIG['TARGET_COLUMN'],
    exclude_cols=['date', 'target_prod_ratio'],
)


In [22]:
def prepare_data_full(filepath: str, target_col: str, categorical_cols: List[str]) -> Tuple[pd.DataFrame, List[str]]:
    """Run the whole data-preparation pipeline in one call.

    Steps, in order: load the CSV, remove outliers, clean the categorical
    columns, parse dates, drop rows with a missing target, impute numeric and
    categorical columns, then add the engineered features.

    Args:
        filepath: Path of the CSV file to load.
        target_col: Name of the target column.
        categorical_cols: Names of the categorical columns to clean and impute.

    Returns:
        A tuple ``(df, feature_cols)``: the prepared frame and the list of
        column names to use as model inputs. ``feature_cols`` excludes the
        target, ``date`` and the target-derived ``target_prod_ratio`` column;
        that column is still present in ``df``.
    """
    df = load_dataset(filepath)
    df = remove_outliers(df)
    df = clean_categorical(df, categorical_cols)
    df = convert_datetime(df)
    # Drop unlabeled rows BEFORE numeric imputation. impute_numeric fills every
    # numeric column, so running it first would mean-fill a numeric target and
    # leave nothing for this step to remove.
    df = remove_missing_target(df, target_col)
    df = impute_numeric(df)
    df = impute_categorical(df, categorical_cols)
    df = engineer_features(df)

    # engineer_features builds `target_prod_ratio` from the target column;
    # offering it as a feature would leak the target, so exclude it.
    feature_cols = get_feature_columns(
        df,
        target_col,
        exclude_cols=['date', 'target_prod_ratio'],
    )

    return df, feature_cols

# Usage: run the full pipeline with the notebook configuration.
df_clean, features = prepare_data_full(
    filepath=CONFIG['DATASET_PATH'],
    target_col=CONFIG['TARGET_COLUMN'],
    categorical_cols=CONFIG['CATEGORICAL_COLS'],
)
