In [16]:
import pandas as pd
import os
import numpy as np

from sklearn.preprocessing import StandardScaler,  FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import train_test_split

import joblib
import json
import cloudpickle

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import BinaryEncoder
from sklearn.impute import SimpleImputer

# Regression Part

In [79]:
# Load data
file_path = os.path.join("data","processed_data.csv")

df = pd.read_csv(file_path)  

print(len(df))
df.head()

X=df.drop(columns=['Response Time', 'Latency', 'Service Name_0', 'Service Name_1', 'Service Name_2', 'Service Name_3', 'Compliance', 'Class', 'WsRF'])
Y=df['Response Time']


X_rest, X_test, y_rest, y_test = train_test_split(
    X, Y, test_size=0.15, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.53, random_state=42
)

364


In [80]:
save_path = 'regression_package'

numeric_features = ['Availability', 'Throughput', 'Successability',
                    'Reliability', 'Best Practices', 'Documentation']


def winsorize(X):
    X = np.asarray(X, dtype=float)
    threshold = 3
    Xw = X.copy()
    for j in range(Xw.shape[1]):
        col = Xw[:, j]
        # IQR clip
        Q1 = np.percentile(col, 25)
        Q3 = np.percentile(col, 75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        col = np.clip(col, lower, upper)

        # z-score threshold
        m = np.mean(col)
        s = np.std(col)
        if s == 0:
            Xw[:, j] = col
            continue
        z = (col - m) / s
        med = np.median(col)
        col = np.where(np.abs(z) < threshold, col, med)
        Xw[:, j] = col
    return Xw

winsorizer = FunctionTransformer(winsorize, validate=False)


numeric_transformer = Pipeline(steps=[
    ("winsorizer", winsorizer),
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features)
], remainder='drop')  

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor)
])

pipeline.fit(X_rest,Y_rest)

with open(os.path.join(save_path, 'pipeline.pkl'), 'wb') as f:
    cloudpickle.dump(pipeline, f)

# Classification part

In [25]:
save_path = 'classification_package'

file_path = os.path.join("qws1","data.csv")

df = pd.read_csv(file_path)

X = df.drop(columns='Class')
Y = df['Class']

X_rest, X_test, Y_rest, Y_test = train_test_split(X, Y, test_size=0.15, random_state=42, stratify = Y)


if X_rest is not None and X_test is not None:
    joblib.dump(X_rest, os.path.join(save_path, 'X_rest.pkl'))
    joblib.dump(X_test, os.path.join(save_path, 'X_test.pkl'))
    
if Y_rest is not None and Y_test is not None:
    joblib.dump(Y_rest, os.path.join(save_path, 'Y_rest.pkl'))
    joblib.dump(Y_test, os.path.join(save_path, 'Y_test.pkl'))


In [75]:
# --- Features ---
numeric_features = ['Availability', 'Throughput', 'Successability',
                    'Reliability', 'Compliance', 'Documentation']
categorical_features = ['Service Name']

# --- Winsorizer (safer order: after imputation) ---
def winsorize(X):
    X = np.asarray(X, dtype=float)
    threshold = 3
    Xw = X.copy()
    for j in range(Xw.shape[1]):
        col = Xw[:, j]
        # IQR clip
        Q1 = np.percentile(col, 25)
        Q3 = np.percentile(col, 75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        col = np.clip(col, lower, upper)

        # z-score threshold
        m = np.mean(col)
        s = np.std(col)
        if s == 0:
            Xw[:, j] = col
            continue
        z = (col - m) / s
        med = np.median(col)
        col = np.where(np.abs(z) < threshold, col, med)
        Xw[:, j] = col
    return Xw

winsorizer = FunctionTransformer(winsorize, validate=False)

# --- Categorical pipeline (no scaling) ---
cats_to_numeric = Pipeline(steps=[
    ("imputer_cat", SimpleImputer(strategy="most_frequent")),
    ("encoder", BinaryEncoder(handle_unknown="ignore"))
])


# --- ColumnTransformer: scale numeric only ---
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer_num", SimpleImputer(strategy="median")),
            ("winsor", winsorizer),
            ("scaler", StandardScaler())
        ]), numeric_features),
        ("cat", cats_to_numeric, categorical_features)
    ],
    remainder="drop",
    sparse_threshold=0.0
)

# --- Drop columns by index (after transformation) ---
class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self, drop_indices=None):
        self.drop_indices = drop_indices

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.drop_indices:
            return np.delete(X, self.drop_indices, axis=1)
        return X

# Fit preprocessor to compute feature names and select indices to drop
preprocessor.fit(X_rest)

cats_to_numeric.fit(X_rest[categorical_features])
feat_names = cats_to_numeric.named_steps['encoder'].get_feature_names()
print(feat_names)
suffixes_to_drop = ('_0', '_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8')


num_count = len(numeric_features)
drop_indices = [num_count + i for i, n in enumerate(feat_names) if n.endswith(suffixes_to_drop)]

# Build final pipeline: preprocess â†’ drop selected columns
pipeline_all = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("drop_cols", ColumnDropper(drop_indices=drop_indices)),
    ("final_imputer", SimpleImputer(strategy="median"))  # or "most_frequent"
])

# Fit the pipeline on your reference/train data
pipeline_all.fit(X_rest)

# Save
import cloudpickle, os
save_path = 'classification_package'
os.makedirs(save_path, exist_ok=True)
with open(os.path.join(save_path, 'preprocessor.pkl'), 'wb') as f:
    cloudpickle.dump(pipeline_all, f)


['0_0' '0_1' '0_2' '0_3' '0_4' '0_5' '0_6' '0_7' '0_8']


  feat_names = cats_to_numeric.named_steps['encoder'].get_feature_names()
