# Load required libs

In [1]:
%cd ..
%pwd
import sys
sys.path.append('./src')
from src.data_processor import RawDataProcessor
from src.problem_config import create_prob_config
from src.drift_detector import ks_drift_detect
prob_config = create_prob_config("phase-3", "prob-2")

c:\VENV\api_prediction


In [2]:
from mlflow.models.signature import infer_signature
import mlflow

def log_model_to_tracker(model, metrics, desc):
    MLFLOW_TRACKING_URI = 'http://192.168.88.113:5000'
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment("phase-3_prob-2_lgbm")
    MLFLOW_MODEL_PREFIX = "model"
    mlflow.start_run(description=desc)
    mlflow.log_metrics(metrics)
    mlflow.log_params(model.get_params())
    signature = infer_signature(test_x.astype(np.float64), predictions)
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=MLFLOW_MODEL_PREFIX,
        signature=signature,
        pip_requirements ='src/requirements.txt'
        #registered_model_name="phase-1_prob-1_model-1"
    )

    experimentid = mlflow.active_run().info.run_id
    mlflow.end_run()
    return experimentid

# Prepare datasets

In [149]:
# Import xtran and ytrain
import pandas as pd
import numpy as np
import pickle

training_data0 = pd.read_parquet(prob_config.raw_data_path)
training_data0 = training_data0.drop_duplicates()

training_data0, category_index = RawDataProcessor.build_category_features(
            training_data0, prob_config.categorical_cols
        )

with open(prob_config.category_index_path, "wb") as f:
    pickle.dump(category_index, f)

In [151]:
conflict_labels = training_data0[training_data0.duplicated(prob_config.feature_cols, keep=False)].sort_values(by=prob_config.feature_cols)
conflict_labels["org_idx"] = conflict_labels.index
data_conflict = conflict_labels.groupby(prob_config.feature_cols).agg({"org_idx": lambda x: tuple(x), "label": lambda x: tuple(x)}).reset_index()
#apply(lambda x: tuple(x.index)).to_list()

In [152]:
from sklearn.model_selection import train_test_split

training_data = training_data0.drop_duplicates(subset=prob_config.feature_cols, keep=False)
target_col = prob_config.target_col
train_x0 = training_data.drop([target_col], axis=1)
train_y0 = training_data[[target_col]]

train, dev = train_test_split(training_data, test_size=0.1, random_state=123)

train_x = train.drop(["label"], axis=1)
train_y = train[[target_col]]
test_x = dev.drop(["label"], axis=1)
test_y = dev[[target_col]]

In [153]:
labels_dict = {}
labels_unq = train_y0['label'].unique()
labels_unq.sort()
for i in range(len(labels_unq)):
    labels_dict[labels_unq[i]] = i

inv_labels_dict = {v: k for k, v in labels_dict.items()}
model_classes_path = prob_config.data_path / 'classes.npy'
np.save(model_classes_path, labels_unq)

In [300]:
from xgboost import XGBClassifier

model0 = XGBClassifier(objective="multi:softprob", random_state=123)
model0.fit(train_x0, train_y0.replace(labels_dict), verbose=False)#, eval_set=eval_set)

In [154]:
from lightgbm import LGBMClassifier


eval_set = [(train_x, train_y), (test_x, test_y)]
model0 = LGBMClassifier(objective="multiclass", random_state=123)
model0.fit(train_x0, train_y0, verbose=False)#, eval_set=eval_set)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [None]:
#import lightgbm as lgb

#lgb.plot_metric(model0)

In [157]:
data22 = pd.read_parquet('data/phase-2/prob-2/raw_train.parquet')

data22 = RawDataProcessor.apply_category_features(
    raw_df=data22,
    categorical_cols=prob_config.categorical_cols,
    category_index=category_index,
)

data22.drop_duplicates(inplace=True)

In [158]:
conflict_labels2 = data22[data22.duplicated(prob_config.feature_cols, keep=False)].sort_values(by=prob_config.feature_cols)
conflict_labels2["org_idx"] = conflict_labels2.index
conflict_labels2.groupby(prob_config.feature_cols).agg({"org_idx": lambda x: tuple(x), "label": lambda x: tuple(x)}).reset_index()[["org_idx","label"]]

Unnamed: 0,org_idx,label
0,"(18610, 61207)","(Exploits, Denial of Service)"
1,"(16762, 25916, 46019)","(Denial of Service, Exploits, Information Gath..."
2,"(10701, 59138, 59472)","(Exploits, Information Gathering, Denial of Se..."
3,"(32122, 43251, 57080)","(Information Gathering, Denial of Service, Exp..."
4,"(4625, 12609, 23488, 40864)","(Malware, Denial of Service, Exploits, Informa..."
...,...,...
1659,"(2052, 12187, 56610)","(Denial of Service, Exploits, Information Gath..."
1660,"(13656, 37184, 50897)","(Exploits, Information Gathering, Denial of Se..."
1661,"(6358, 11822, 55169)","(Exploits, Denial of Service, Information Gath..."
1662,"(2594, 12669)","(Exploits, Denial of Service)"


In [331]:
data22.drop_duplicates(subset=prob_config.feature_cols, keep=False, inplace=True)

train_x_new = pd.DataFrame(np.concatenate((train_x0, data22[prob_config.feature_cols])), columns=train_x.columns)
train_y_new = pd.DataFrame(np.concatenate((train_y0, data22[[prob_config.target_col]])), columns=train_y.columns)

model1 = XGBClassifier(objective="multi:softprob", random_state=123)
#LGBMClassifier(objective="binary", random_state=123, is_unbalance=True)
model1.fit(train_x_new, train_y_new.replace(labels_dict), verbose=False)

In [332]:
from sklearn.metrics import accuracy_score
predictions = model1.predict(test_x)
#predictions = s.predict_model(best[0], data = test_x)["prediction_label"]
accuracy = accuracy_score(predictions, test_y.replace(labels_dict))
metrics = {"accuracy_score": accuracy}
print(f"metrics: {metrics}")

metrics: {'accuracy_score': 0.9208368200836821}


In [333]:
data22.drop_duplicates(subset=prob_config.feature_cols, keep=False, inplace=True)
accuracy_score(model1.predict(training_data.drop([target_col], axis=1)), training_data[[target_col]].replace(labels_dict))

0.9203240601252051

In [153]:
import lleaves
model_path = ".venv/phase2_2_lgbm.txt"
model0.booster_.save_model(filename=model_path)
llvm_model = lleaves.Model(model_file=model_path)
llvm_model.compile()

In [192]:
%%timeit -n 10
z = llvm_model.predict(test_x)
labels = np.argmax(z, axis=1)
classes = model0.classes_
labels = [classes[i] for i in labels]

16 ms ± 360 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [357]:
run_description = """
### Header
XGB model, First Base Model Prob2
Model: XGB
    """
log_model_to_tracker(model2, metrics, run_description)

'0d3e6b94bc8949f7b2e5e5bc6e1ed979'

# Drift Detection 

In [10]:
# Save referent for drift detection.
X_baseline = train_x0.sample(1000)
X_baseline_df = pd.DataFrame(X_baseline, columns=prob_config.drift_cols)
X_baseline_df.to_parquet(prob_config.driff_ref_path, index=False)

# Model drift

In [195]:
import pandas as pd
import redis
import pickle
# Load data for problem 1
rc2 = redis.Redis(host='192.168.88.113', db=2, port=6379)

captured_x = pd.DataFrame()
for key in rc2.keys():
    captured_data = pickle.loads(rc2.get(key))
    captured_x = pd.concat([captured_x, captured_data])

captured_x.drop_duplicates(inplace=True, ignore_index=True)

captured_x = RawDataProcessor.apply_category_features(
    raw_df=captured_x[train_x0.columns],
    categorical_cols=prob_config.categorical_cols,
    category_index=category_index,
)

In [120]:
rc2.flushdb()

True

In [196]:
len(rc2.keys())

101

In [128]:
#np_captured_x = np_captured_x.astype(train_x.dtypes.to_dict())

In [334]:
from sklearn.cluster import MiniBatchKMeans
import joblib
import numpy as np
# N là số lượng cluster
N = 30000
# Train clustering model cho data đã có label
kmeans = MiniBatchKMeans(n_clusters=N, random_state=0, n_init='auto').fit(train_x0)
joblib.dump(kmeans, 'data/phase-3/prob-2/kmeans.cpk')

['data/phase-3/prob-2/kmeans.cpk']

In [336]:
import joblib
import numpy as np

N = 30000
kmeans = joblib.load('data/phase-3/prob-2/kmeans.cpk')
new_labels = []

# Duyệt từng cluster
for  i  in  range(N):
	# Lấy các label của các data point thuộc cluster i
	mask = (kmeans.labels_ == i)
	cluster_labels = train_y0[mask]

	if  len(cluster_labels) == 0:
		# Nếu cluster i rỗng thì xác định cluster i ánh xạ với 1 label mặc định (ở đây lựa chọn là 0)
		new_labels.append(np.nan)
	else:
		# Tìm label mới cho cả cụm cluster trong trường hợp cụm cluster khác rỗng
		#if  isinstance(train_y.flatten()[0], float):
			# Nếu là bài toán Regression thì lấy giá trị trung bình của các label thuộc cluster
		#	new_labels.append(np.mean(cluster_labels.flatten()))
		#else:
			# Nếu là bài toán Classification thì lấy label xuất hiện nhiều nhất trong cluster
			new_labels.append(cluster_labels.value_counts().idxmax()[0])

# Ánh xạ lại label cho data drift dựa trên kết quả predict cluster ở trên
y_drift_propagated = [new_labels[c] for  c  in  kmeans.labels_]

In [338]:
y_drift_test_propagated = [ new_labels[c] for c in kmeans.predict(test_x)]
from sklearn.metrics import accuracy_score
accuracy_score(y_drift_test_propagated, test_y)

0.6230322518717604

In [None]:
fix_label_kmean = []
for i in range(len(y_drift_test_propagated)):
    if y_drift_test_propagated[i] in data_conflict['label'][i]:
        fix_label_kmean.append(y_drift_test_propagated[i])
    else:
        fix_label_kmean.append(np.nan)

In [242]:
newdata = data_conflict[prob_config.feature_cols]
newdata['label'] = fix_label_kmean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata['label'] = fix_label_kmean


In [346]:
captured_x['label'] = [ new_labels[c] for c in kmeans.predict(captured_x[prob_config.feature_cols])]
captured_x.dropna(inplace=True)

train_x_new = pd.DataFrame(np.concatenate((train_x0, captured_x[prob_config.feature_cols])), columns=train_x.columns)
train_y_new = pd.DataFrame(np.concatenate((train_y0, captured_x[[prob_config.target_col]])), columns=train_y.columns)

model2 = XGBClassifier(objective="multi:softprob", random_state=123)
#LGBMClassifier(objective="binary", random_state=123, is_unbalance=True)
model2.fit(train_x_new, train_y_new.replace(labels_dict), verbose=False)

In [356]:
accuracy_score(model2.predict(test_x), test_y.replace(labels_dict))

0.92

In [353]:
accuracy_score(model2.predict(captured_x[prob_config.feature_cols]), captured_x[[prob_config.target_col]].replace(labels_dict))

0.7651180248561181

In [355]:
accuracy_score(model2.predict(captured_x[prob_config.feature_cols]), model1.predict(captured_x[prob_config.feature_cols].replace(labels_dict)))

0.8501960130119276

In [374]:
import daal4py as d4p
daal_model = d4p.get_gbt_model_from_xgboost(model2._Booster)

In [393]:
d4p_cls_algo = d4p.gbt_classification_prediction(
    nClasses=len(labels_unq),
    resultsToEvaluate="computeClassLabels",
    fptype='float'
)

In [398]:
%%timeit
d4p_cls_algo.compute(test_x, daal_model).prediction.T[0].astype(np.int64)

12.1 ms ± 247 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [396]:
model0.predict(test_x)

array([0, 0, 4, ..., 1, 4, 0], dtype=int64)

# Backup code

```
conflict_labels = data_conflict['label']
proba_fix = pd.DataFrame(model0.predict_proba(data_conflict[prob_config.feature_cols]), columns=labels_unq)
fix_label = []
for i in range(len(conflict_labels)):
     labels = conflict_labels[i]
     fix_label.append(labels[proba_fix[list(labels)].iloc[i].argmax()])

fix_label = pd.DataFrame(fix_label)

train_x_new = pd.DataFrame(np.concatenate((train_x0, data_conflict[prob_config.feature_cols])), columns=train_x.columns)
train_y_new = pd.DataFrame(np.concatenate((train_y0, fix_label)), columns=train_y.columns)

model1 = XGBClassifier(objective="multi:softprob", random_state=123)
#LGBMClassifier(objective="binary", random_state=123, is_unbalance=True)
model1.fit(train_x_new, train_y_new.replace(labels_dict), verbose=False) ```