In [None]:
import pandas as pd
import os
import numpy as np

from sklearn.preprocessing import StandardScaler,  FunctionTransformer

from sklearn.model_selection import train_test_split, KFold, cross_val_score, RandomizedSearchCV

import joblib
import json
import cloudpickle

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import BinaryEncoder
from sklearn.impute import SimpleImputer

In [23]:
save_path = 'classification_package'

file_path = os.path.join("qws1","data.csv")

df = pd.read_csv(file_path)

X = df.drop(columns='Class')
Y = df['Class']

X_rest, X_test, Y_rest, Y_test = train_test_split(X, Y, test_size=0.15, random_state=42)


if X_rest is not None and X_test is not None:
    joblib.dump(X_rest, os.path.join(save_path, 'X_rest.pkl'))
    joblib.dump(X_test, os.path.join(save_path, 'X_test.pkl'))
    
if Y_rest is not None and Y_test is not None:
    joblib.dump(Y_rest, os.path.join(save_path, 'Y_rest.pkl'))
    joblib.dump(Y_test, os.path.join(save_path, 'Y_test.pkl'))


In [None]:

# Original feature lists
numeric_features = ['Availability', 'Throughput', 'Successability',
                    'Reliability', 'Compliance', 'Documentation']
categorical_features = ['Service Name']

# --- Custom Winsorizer function ---
def winsorize(X_ori):

    threshold = 3
    X_used = X_ori.copy()

    for col in range(X_used.shape[1]):
        Q1 = np.percentile(X_used[:, col], 25)
        Q3 = np.percentile(X_used[:, col], 75)
        lower = Q1 - 1.5 * (Q3 - Q1)
        upper = Q3 + 1.5 * (Q3 - Q1)

        # Clip extremes
        X_used[:, col] = np.clip(X_used[:, col], lower, upper)

        # Z-score check
        mean_val = X_used[:, col].mean()
        std_val = X_used[:, col].std()
        z_scores = (X_used[:, col] - mean_val) / std_val
        mask = np.abs(z_scores) < threshold
        median_val = np.median(X_used[:, col])
        X_used[:, col] = np.where(mask, X_used[:, col], median_val)

    return X_used


winsorizer = FunctionTransformer(winsorize, validate=False)


cats_to_numeric = Pipeline(steps=[
    ("imputer_cat", SimpleImputer(strategy="most_frequent")),
    ("encoder", BinaryEncoder(handle_unknown="ignore"))
])


from sklearn.base import BaseEstimator, TransformerMixin
class ColumnDropperByName(BaseEstimator, TransformerMixin):
    def __init__(self, patterns=None):
        self.patterns = patterns

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            self.feature_names_ = X.columns.tolist()
        else:
            self.feature_names_ = None
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.feature_names_)
        if self.patterns:
            drop_cols = [col for col in X.columns if any(col.endswith(p) for p in self.patterns)]
            X = X.drop(columns=drop_cols, errors='ignore')

        return X.values


pre_as_numeric = ColumnTransformer(transformers=[
    ("num_passthrough", "passthrough", numeric_features),
    ("cat_to_numeric", cats_to_numeric, categorical_features),
], remainder="drop", sparse_threshold=0.0)


def to_dataframe(X):
    # Get feature names from ColumnTransformer
    feature_names = pre_as_numeric.get_feature_names_out()
    return pd.DataFrame(X, columns=feature_names)

as_dataframe = FunctionTransformer(to_dataframe, validate=False)



pipeline_all = Pipeline(steps=[
    ("to_numeric", pre_as_numeric),
    ("as_dataframe", as_dataframe),
    ("drop_cols", ColumnDropperByName(patterns=['_4','_5','_6','_7','_8'])),
    ("winsorizer", winsorizer),
    ("imputer_median", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])


pipeline_all.fit(X_rest)

#print(pre_as_numeric.get_feature_names_out())
print(pipeline_all.named_steps['as_dataframe'].transform(pre_as_numeric.transform(X_rest)).columns)


# Save
with open(os.path.join(save_path, 'preprocessor.pkl'), 'wb') as f:
    cloudpickle.dump(pipeline_all, f)

#joblib.dump(pipeline_all, os.path.join(save_path, 'preprocessor.pkl'))

Index(['num_passthrough__Availability', 'num_passthrough__Throughput',
       'num_passthrough__Successability', 'num_passthrough__Reliability',
       'num_passthrough__Compliance', 'num_passthrough__Documentation',
       'cat_to_numeric__0_0', 'cat_to_numeric__0_1', 'cat_to_numeric__0_2',
       'cat_to_numeric__0_3', 'cat_to_numeric__0_4', 'cat_to_numeric__0_5',
       'cat_to_numeric__0_6', 'cat_to_numeric__0_7', 'cat_to_numeric__0_8'],
      dtype='object')


  z_scores = (X_used[:, col] - mean_val) / std_val


In [None]:

# Original feature lists
numeric_features = ['Availability', 'Throughput', 'Successability',
                    'Reliability', 'Compliance', 'Documentation']
categorical_features = ['Service Name']

# --- Custom Winsorizer function ---
def winsorize(X_ori):

    threshold = 3
    X_used = X_ori.copy()

    for col in range(X_used.shape[1]):
        Q1 = np.percentile(X_used[:, col], 25)
        Q3 = np.percentile(X_used[:, col], 75)
        lower = Q1 - 1.5 * (Q3 - Q1)
        upper = Q3 + 1.5 * (Q3 - Q1)

        # Clip extremes
        X_used[:, col] = np.clip(X_used[:, col], lower, upper)

        # Z-score check
        mean_val = X_used[:, col].mean()
        std_val = X_used[:, col].std()
        z_scores = (X_used[:, col] - mean_val) / std_val
        mask = np.abs(z_scores) < threshold
        median_val = np.median(X_used[:, col])
        X_used[:, col] = np.where(mask, X_used[:, col], median_val)

    return X_used


winsorizer = FunctionTransformer(winsorize, validate=False)


cats_to_numeric = Pipeline(steps=[
    ("imputer_cat", SimpleImputer(strategy="most_frequent")),
    ("encoder", BinaryEncoder(handle_unknown="ignore"))
])


from sklearn.base import BaseEstimator, TransformerMixin

class ColumnDropperByName(BaseEstimator, TransformerMixin):
    def __init__(self, drop_names=None):
        self.drop_names = drop_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # If X is numpy, convert to DataFrame with generic names
        
        if self.drop_names:
            X = X.drop(columns=self.drop_names, errors='ignore')
        return X

# --- Step 2: Combine numeric + encoded categorical ---
pre_as_numeric = ColumnTransformer(transformers=[
    ("num_passthrough", "passthrough", numeric_features),
    ("cat_to_numeric", cats_to_numeric, categorical_features),
], remainder="drop", sparse_threshold=0.0)

def to_dataframe(X):
    # Get feature names from ColumnTransformer
    feature_names = pre_as_numeric.get_feature_names_out()
    return pd.DataFrame(X, columns=feature_names)

as_dataframe = FunctionTransformer(to_dataframe, validate=False)

pipeline_all = Pipeline(steps=[
    ("to_numeric", pre_as_numeric),
    ("as_dataframe", as_dataframe),
    ("drop_cols", ColumnDropperByName(drop_names=['cat_to_numeric__encoder__0_4',
                                                  'cat_to_numeric__encoder__0_5',
                                                  'cat_to_numeric__encoder__0_6',
                                                  'cat_to_numeric__encoder__0_7',
                                                  'cat_to_numeric__encoder__0_8'])),
    ("winsorizer", winsorizer),
    ("imputer_median", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])


# Fit pipeline
pipeline_all.fit(X_rest)

print(pre_as_numeric.get_feature_names_out())

# Save pipeline
joblib.dump(pipeline_all, os.path.join(save_path, 'preprocessor.pkl'))
