## sklearn feature selection

In [None]:
# Selecting important features using SelectFromModel
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Fit a shallow, class-balanced random forest whose feature importances drive
# the selection. Forest training is randomised, so the seed is pinned to keep
# the selected feature set stable across "Restart Kernel -> Run All".
rf = RandomForestClassifier(
    n_jobs=-1,
    class_weight='balanced',
    max_depth=5,
    random_state=42,
)
rf.fit(X_train, y_train)

# Wrap the already-fitted forest; prefit=True means SelectFromModel uses `rf`
# as-is instead of refitting it. With no explicit threshold, scikit-learn keeps
# the features whose importance is at least the mean importance.
# NOTE(review): later cells call model.predict(...) and read model.coef_,
# neither of which SelectFromModel provides -- they presumably expect a
# different object bound to `model`; confirm the intended execution order.
model = SelectFromModel(rf, prefit=True)

# Reduce the training matrix to the retained columns
X_train_transformed = model.transform(X_train)

# Report the columns the forest was actually trained on, so this listing and
# the support mask below are guaranteed to refer to the same set of features.
original_features = X_train.columns
print(f"Original features: {original_features}")

# Boolean mask over X_train's columns: True where the feature was kept
features_bool = model.get_support()

selected_features = X_train.columns[features_bool]
print(f"\nSelected features: {selected_features}")

# Pair each retained feature with the importance the forest assigned to it
feature_importance = pd.DataFrame({
    "feature": selected_features,
    "importance": rf.feature_importances_[features_bool]
})

# Plot a sorted copy (largest bar on top) and label the figure so it can be
# read on its own; `feature_importance` itself keeps its original row order.
ranked_importance = feature_importance.sort_values("importance")
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(ranked_importance["feature"], ranked_importance["importance"])
ax.set(
    title="Random forest importance of selected features",
    xlabel="Importance",
    ylabel="Feature",
)
plt.show()

### SVC model in sklearn

In [None]:
# SVC model
# Import required modules
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
# NOTE(review): this cell reads `df_x`, while the cross-validation cell reads
# `df_X` -- confirm which spelling is actually defined upstream.
X_train, X_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=42,
)

# Build a linear-kernel support vector classifier and train it in one step
# (fit() returns the estimator itself, so the result can be bound directly).
svc_model = SVC(kernel='linear').fit(X_train, y_train)

# Predict labels for the held-out rows and show them
y_pred = svc_model.predict(X_test)
print(y_pred)

### K-fold cross-validation with sklearn

In [None]:
# kfold cross-validation
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix

# Five shuffled folds; the fixed seed keeps fold membership repeatable
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Peek at the first fold only: split() yields (train indices, test indices)
first_fold = next(kfold.split(df_X))
train_data_split, test_data_split = first_fold

# One balanced-accuracy score per fold
score = cross_val_score(
    estimator=model,
    X=df_X,
    y=df_y,
    scoring='balanced_accuracy',
    cv=kfold,
)
print(score)

# Predict on the full feature table.
# NOTE(review): this relies on `model` having been fitted before this cell
# (cross_val_score presumably fits clones, not `model` itself), and it scores
# rows the model may already have seen during training -- confirm intent.
y_pred = model.predict(df_X)

# Confusion matrix of true labels against those predictions
cm = confusion_matrix(y_true=df_y, y_pred=y_pred)
print(cm)

### MLflow

In [None]:
import mlflow
# Make "Logistic Regression Prediction" the active experiment
mlflow.set_experiment("Logistic Regression Prediction")

# Open a run and record the fitted model's weights as run parameters.
# The run handle comes straight from the context manager.
with mlflow.start_run() as run:
    # One parameter per entry in the first row of coef_
    # (assumes `model` is a fitted linear model -- TODO confirm)
    for idx, coef in enumerate(model.coef_[0]):
        mlflow.log_param(f"coef_{idx}", coef)

    # The first intercept term is logged alongside the coefficients
    mlflow.log_param("intercept", model.intercept_[0])

    # Print the run ID so the run can be looked up afterwards
    run_id = run.info.run_id
    print(run_id)

### Kolmogorov-Smirnov test with SciPy and sklearn

In [None]:
# Data drift detection using the Kolmogorov-Smirnov test
# Import the ks_2samp function
from scipy.stats import ks_2samp
from sklearn.metrics import balanced_accuracy_score

# January's balanced accuracy is a fixed baseline (in percent); February's is
# computed from the current labels and scaled to the same unit.
balanced_accuracy_jan = 60.0
balanced_accuracy_feb = 100 * balanced_accuracy_score(true_labels, predicted_labels)

# Has performance dropped below the January baseline?
accuracy_declined = balanced_accuracy_feb < balanced_accuracy_jan
print(f"Model Balanced Accuracy In February: {balanced_accuracy_feb:.2f}%")
print(f"Is there a decline in accuracy? {'Yes' if accuracy_declined else 'No'}")

# Two-sample Kolmogorov-Smirnov test between the January and February samples
ks_result = ks_2samp(jan_data_samples, feb_data_samples)
ks_statistic = ks_result.statistic
p_value = ks_result.pvalue

# Flag drift when the p-value falls below the 5% significance level
alpha = 0.05
significant_drift = p_value < alpha

print(f"Kolmogorov-Smirnov Statistic: {ks_statistic:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"Is there significant data drift? {'Yes' if significant_drift else 'No'}")