In [1]:
import wandb
import numpy as np
import os
import random
import glob
import pandas as pd
import time
import sklearn as sk
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier, VotingClassifier
from sklearn.datasets import fetch_covtype
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from scipy.io import arff
from sklearn.datasets import fetch_openml
from joblib import Parallel, delayed

In [2]:
# Authenticate this kernel session with Weights & Biases before any run is started.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mbrinashong[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
# Execute common.ipynb so the names it defines can be used in this notebook.
# NOTE(review): presumably the source of helpers called further down
# (remove_files_from_directory, get_anomaly_X_y_from_csv, evaluate, wandb_log) — confirm.
%run 'common.ipynb'

In [4]:
# Shared variables
# Directory that receives one CSV dataset per non-normal cover type.
OUTPUT_FOLDER = 'csv_files'
# Directory that receives the feature-importance PDF plots.
FEATURE_FOLDER = 'features'
# Number of leading test rows kept when the test set is truncated for evaluation.
TEST_COUNT = 500

In [5]:
# Execute covertype.ipynb so the dataset and its metadata can be used in this notebook.
# NOTE(review): presumably defines all_df, TARGET_COLUMN, TARGET_DICT, INV_TARGET_DICT,
# NORMAL_TARGET, NUMERICAL_COLUMNS and main_labels, none of which are defined here — confirm.
%run covertype.ipynb

Normal class:  0    2
dtype: int32
Feature names:  ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area_0', 'Wilderness_Area_1', 'Wilderness_Area_2', 'Wilderness_Area_3', 'Soil_Type_0', 'Soil_Type_1', 'Soil_Type_2', 'Soil_Type_3', 'Soil_Type_4', 'Soil_Type_5', 'Soil_Type_6', 'Soil_Type_7', 'Soil_Type_8', 'Soil_Type_9', 'Soil_Type_10', 'Soil_Type_11', 'Soil_Type_12', 'Soil_Type_13', 'Soil_Type_14', 'Soil_Type_15', 'Soil_Type_16', 'Soil_Type_17', 'Soil_Type_18', 'Soil_Type_19', 'Soil_Type_20', 'Soil_Type_21', 'Soil_Type_22', 'Soil_Type_23', 'Soil_Type_24', 'Soil_Type_25', 'Soil_Type_26', 'Soil_Type_27', 'Soil_Type_28', 'Soil_Type_29', 'Soil_Type_30', 'Soil_Type_31', 'Soil_Type_32', 'Soil_Type_33', 'Soil_Type_34', 'Soil_Type_35', 'Soil_Type_36', 'Soil_Type_37', 'Soil_Type_38', 'Soil_Type_39']


In [6]:
# Preview the first rows of the full dataset.
# The frame should already be one-hot encoded (Wilderness_Area_*, Soil_Type_* columns)
# and label encoded (integer Cover_Type column).
all_df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Cover_Type
0,2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
1,2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
2,2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,2595.0,45.0,2.0,153.0,-1.0,391.0,220.0,234.0,150.0,6172.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


In [7]:
# Start from clean output directories: empty both folders first, then make sure
# each one exists (creating it is a no-op when it is already there).
managed_folders = (OUTPUT_FOLDER, FEATURE_FOLDER)
for folder in managed_folders:
    remove_files_from_directory(folder)
for folder in managed_folders:
    os.makedirs(folder, exist_ok=True)

All files in csv_files have been removed.
All files in features have been removed.


In [8]:
# Persist the complete dataset (without the index column) so later cells can
# stream it back from disk line by line.
all_df.to_csv(
    'all_data.csv',
    index=False,
)

In [9]:
# Separate the target column from the feature columns.
y_df = all_df[TARGET_COLUMN]
X_df = all_df.drop(columns=[TARGET_COLUMN])

# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
    X_df,
    y_df,
    test_size=0.2,
    random_state=42,
)
print(*(part.shape for part in (X_train_df, X_test_df, y_train_df, y_test_df)))

(464809, 54) (116203, 54) (464809,) (116203,)


In [10]:
# Put the training features and their target back side by side (column-wise)
# and preview the first rows of the combined frame.
train_parts = [X_train_df, y_train_df]
X_y_train_df = pd.concat(train_parts, axis="columns")
X_y_train_df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Cover_Type
519924,3289.0,22.0,19.0,240.0,93.0,1708.0,205.0,196.0,122.0,2598.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
318451,2963.0,21.0,18.0,134.0,27.0,1243.0,206.0,200.0,127.0,1140.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
22325,3037.0,185.0,9.0,127.0,10.0,6462.0,222.0,246.0,158.0,3037.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
449376,3113.0,203.0,13.0,190.0,22.0,2125.0,213.0,251.0,171.0,730.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
482753,3128.0,346.0,9.0,120.0,36.0,552.0,203.0,226.0,161.0,924.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [11]:
# Learn the standardisation statistics from the training rows only, so no
# information from the test set leaks into the scaler.
scaler = StandardScaler()
scaler.fit(X_train_df[NUMERICAL_COLUMNS])

# Scaled copy of the training features (only the numeric columns are rescaled).
X_train_scaled_df = X_train_df.copy()
X_train_scaled_df[NUMERICAL_COLUMNS] = scaler.transform(X_train_df[NUMERICAL_COLUMNS])

# Scaled copy of the test features, using the statistics learned above.
X_test_scaled_df = X_test_df.copy()
X_test_scaled_df[NUMERICAL_COLUMNS] = scaler.transform(X_test_df[NUMERICAL_COLUMNS])

In [12]:
# Build one binary dataset per non-normal cover type: every row of that cover
# type plus a random sample of normal-class rows, shuffled together and written
# to OUTPUT_FOLDER/<name>.csv.

# Upper bound on the number of normal ("benign") rows per dataset.
# NOTE(review): this exceeds the total row count printed below, so it never
# limits anything here — looks like a leftover from another dataset; confirm.
benign = 2359289
abnormal_type_counts = all_df[TARGET_COLUMN].value_counts()
abnormal_type_dict = abnormal_type_counts.to_dict()
print('abnormal_type_dict', abnormal_type_dict)

benign_ratio = 2  # Adjust this ratio as needed
min_benign_samples = 50  # Ensure at least 50 benign samples are always included

# Read all_data.csv exactly once and bucket the raw lines by their label (last
# column). The file content does not depend on the cover type being processed,
# so re-reading it inside the loop was repeated, loop-invariant work.
rows_by_label = {}
with open("all_data.csv", "r") as file:
    next(file, None)  # Skip the header row
    for line in file:
        if not line.strip():
            continue  # Ignore blank lines (e.g. a trailing newline at end of file)
        row_label = int(line.strip().rsplit(",", 1)[-1])
        rows_by_label.setdefault(row_label, []).append(line)

all_benign_rows = rows_by_label.get(NORMAL_TARGET, [])

# Dedicated, seeded generator: the datasets are reproducible across runs and
# the global `random` state is left untouched.
sampler = random.Random(42)

for label, name in TARGET_DICT.items():
    if label == NORMAL_TARGET:
        continue  # Skip the normal target

    # Number of benign rows wanted: a fixed multiple of the abnormal count,
    # capped by `benign` and never below `min_benign_samples`.
    abnormal_count = abnormal_type_dict[label]
    benign_num = max(min(int(abnormal_count * benign_ratio), benign), min_benign_samples)

    abnormal_rows = rows_by_label.get(label, [])

    # Sample without replacement; when fewer benign rows exist than requested,
    # all of them are used (in random order).
    benign_rows = sampler.sample(all_benign_rows, min(benign_num, len(all_benign_rows)))

    # Mix both groups so the classes are not written as two contiguous blocks.
    combined_rows = benign_rows + abnormal_rows
    sampler.shuffle(combined_rows)

    output_path = os.path.join(OUTPUT_FOLDER, f"{name}.csv")
    with open(output_path, "w") as ths:
        ths.write(','.join(main_labels) + "\n")
        ths.writelines(combined_rows)  # Lines still carry their own newline

    # Print number of rows written
    b = len(benign_rows)
    a = len(abnormal_rows)
    print(f"{name}.csv created with {a + b} rows. ({b} benign and {a} abnormal rows)")

print("All datasets created successfully!")

abnormal_type_dict {2: 283301, 1: 211840, 3: 35754, 7: 20510, 6: 17367, 5: 9493, 4: 2747}
Spruce-Fir.csv created with 495141 rows. (283301 benign and 211840 abnormal rows)
PonderosaPine.csv created with 107262 rows. (71508 benign and 35754 abnormal rows)
CottonwoodWillow.csv created with 8241 rows. (5494 benign and 2747 abnormal rows)
Aspen.csv created with 28479 rows. (18986 benign and 9493 abnormal rows)
DouglasFir.csv created with 52101 rows. (34734 benign and 17367 abnormal rows)
Krummholz.csv created with 61530 rows. (41020 benign and 20510 abnormal rows)
All datasets created successfully!


In [13]:
# tdf = pd.read_csv('csv_files/smurf.csv')
# tdf['attack'].unique()

In [13]:
# Wall-clock start for the whole training stage.
seconds = time.time()

# Names of the per-cover-type dataset files found in OUTPUT_FOLDER.
csv_files = os.listdir(OUTPUT_FOLDER)
print('csv_files', csv_files)

# Output file for the feature-importance listing (closed at the end of this cell).
ths = open("importance_list.csv", "w")

# Results keyed by dataset label (file name without extension).
feature_importances, ensemble_models, SVMs = {}, {}, {}

def train_models(csv_file):
    """Select features and train the per-cover-type models for one dataset file.

    Steps:
      1. Load the dataset through ``get_anomaly_X_y_from_csv``.
         NOTE(review): assumed to return (features DataFrame, target, full frame)
         for the binary normal-vs-one-cover-type problem — confirm in the helper.
      2. Fit a random-forest regressor and rank ALL features by importance; the
         five most important ones are used for training.
      3. Scale the numeric columns with the module-level ``scaler`` and train an
         SVC plus a hard-voting ensemble on the selected features.
      4. Save a bar chart of the (up to) 20 most important features as a PDF in
         FEATURE_FOLDER.

    Parameters
    ----------
    csv_file : str
        File name (not a path) of a dataset inside OUTPUT_FOLDER.

    Returns
    -------
    tuple of (dict, dict, dict)
        ``({label: top-5 feature names}, {label: fitted VotingClassifier},
        {label: fitted SVC})`` for this file only, so a caller can merge the
        results with ``dict.update``. Three empty dicts are returned when the
        dataset has no rows. The module-level ``feature_importances``,
        ``ensemble_models`` and ``SVMs`` dicts are updated as well.
    """
    print('csv file', csv_file)

    X_df, y_df, _ = get_anomaly_X_y_from_csv(csv_file, main_labels, TARGET_COLUMN, NORMAL_TARGET, OUTPUT_FOLDER)
    # splitext keeps labels that themselves contain a dot intact.
    label = os.path.splitext(csv_file)[0]

    # Nothing can be fitted on an empty dataset; bail out before any estimator
    # is asked to do so.
    if len(y_df) == 0:
        print(f'no data for {label}')
        return {}, {}, {}

    # Computing the feature importances.
    forest = sk.ensemble.RandomForestRegressor(n_estimators=250, random_state=0)
    forest.fit(X_df, y_df)
    importances = forest.feature_importances_
    print('importances', importances, label)

    # Rank every feature before truncating. Names come from X_df because that is
    # the frame the forest was fitted on, so names and importances line up.
    ranked = (
        pd.Series(importances, index=list(X_df.columns), name='importance')
        .sort_values(ascending=False, kind='stable')
        .head(20)
    )
    impor_bars = ranked.rename_axis('Features').to_frame()
    important_features = impor_bars.index.to_list()[:5]
    print('important_features', important_features)
    feature_importances[label] = important_features

    # Scale the numeric columns with the scaler fitted on the training split,
    # then keep only the selected features (selected by name).
    X_scaled_df = X_df.copy()
    X_scaled_df[NUMERICAL_COLUMNS] = scaler.transform(X_scaled_df[NUMERICAL_COLUMNS])
    X_train_class_scaled = X_scaled_df[important_features]
    y_train_class = y_df

    # Stand-alone SVM, kept separately from the ensemble.
    svm = SVC()
    svm.fit(X_train_class_scaled, y_train_class)
    SVMs[label] = svm

    # Hard-voting ensemble. VotingClassifier fits clones of the estimators it is
    # given, so the members are not fitted individually beforehand — any such
    # fit would be discarded.
    voting_clf = VotingClassifier(estimators=[
        ('svm', svm),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
        ('dt', DecisionTreeClassifier()),
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('lr', LogisticRegression(max_iter=1000)),
        ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42))
    ], voting='hard')  # Use 'soft' for probability-based voting
    voting_clf.fit(X_train_class_scaled, y_train_class)
    ensemble_models[label] = voting_clf

    # Printing the feature importances.
    quoted_features = ",".join("\"" + str(feature) + "\"" for feature in important_features)
    fea_ture = label + "=[" + quoted_features + "]"
    print(label, "importance list:")
    print(label, "\n", impor_bars.head(20), "\n\n\n")
    print(fea_ture)

    # Plot and save the importance chart. The layout is finalised before the
    # figure is written, and the figure is closed afterwards so repeated calls
    # do not accumulate open figures.
    plt.rcParams['figure.figsize'] = (10, 5)
    ax = impor_bars.plot.bar()
    ax.set_title(label+" Cover type - Feature Importance")
    ax.set_ylabel('Importance')
    fig = ax.get_figure()
    fig.tight_layout()
    fig.savefig(os.path.join(FEATURE_FOLDER, label+".pdf"), bbox_inches='tight', format='pdf')
    plt.show()
    plt.close(fig)
    print("-----------------------------------------------------------------------------------------------\n\n\n\n")

    # Return only this file's results: when run in a worker process this avoids
    # shipping back whatever else has accumulated in the shared dicts.
    return {label: important_features}, {label: voting_clf}, {label: svm}

# Train on every dataset file in parallel, merge the per-file results into the
# module-level dicts, and write the selected features to importance_list.csv.
# The try/finally guarantees the file handle opened above is closed even when
# training fails.
try:
    # Parallelize the training across multiple CSV files
    results = Parallel(n_jobs=-1)(delayed(train_models)(csv_file) for csv_file in csv_files)

    # After parallel execution, collect results
    for feature_importances_res, ensemble_models_res, SVMs_res in results:
        feature_importances.update(feature_importances_res)
        ensemble_models.update(ensemble_models_res)
        SVMs.update(SVMs_res)

    # Write feature importances after parallel processing, one line per label:
    #   <label>=["feat1","feat2",...]
    # The quoted list is built outside the f-string because a backslash inside
    # an f-string expression is a syntax error before Python 3.12.
    for label, features in feature_importances.items():
        quoted_features = ",".join('"' + str(feat) + '"' for feat in features)
        ths.write(f"{label}=[{quoted_features}]\n")
finally:
    ths.close()

print('feature_importances', feature_importances)
print("Total operation time: = ", time.time() - seconds, "seconds")

csv_files ['Krummholz.csv', 'CottonwoodWillow.csv', 'Spruce-Fir.csv', 'Aspen.csv', 'DouglasFir.csv', 'PonderosaPine.csv']
csv file CottonwoodWillow.csv
importances [9.57720729e-01 2.84060065e-04 2.81002815e-04 8.82100601e-04
 8.05705336e-04 1.29646842e-03 2.15048628e-03 3.21291129e-04
 6.21223610e-04 2.88368646e-03 0.00000000e+00 0.00000000e+00
 1.53003889e-03 2.18753532e-02 2.81516371e-04 1.93247192e-04
 1.65347259e-06 4.03700886e-06 4.49395589e-03 1.63395501e-03
 0.00000000e+00 0.00000000e+00 0.00000000e+00 2.71585218e-03
 2.92113234e-06 0.00000000e+00 0.00000000e+00 5.81819568e-06
 0.00000000e+00 0.00000000e+00 1.48978410e-05 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00] CottonwoodW

In [14]:
# Select the number of neighbours for the multi-class KNN by 5-fold
# cross-validation on the training split, then refit the best configuration and
# evaluate it on the first TEST_COUNT test rows.
# NOTE(review): cross-validation and the final fit use the unscaled features
# (X_train_df) although a scaled copy exists — confirm this is intentional.

# Track the best model and best performance seen so far.
best_k = None
best_score = -1
best_model = None

seconds = time.time()

for k in range(3, 21, 2):
    # One W&B run per candidate k. The context manager finishes the run even
    # when cross-validation raises, so no run is left open in the kernel.
    with wandb.init(project="pipeline-knn-covertype-dataset", name=str(k)+" neighbours"):
        print("k: ", k)

        # Unfitted classifier for the current k; cross_val_score fits its own clones.
        knn = KNeighborsClassifier(n_neighbors=k)

        # 5-fold cross-validated accuracy on the training set.
        cv_scores = cross_val_score(knn, X_train_df, y_train_df, cv=5, scoring='accuracy')
        avg_cv_score = cv_scores.mean()
        print(f"Cross-validation score for k={k}: {avg_cv_score}")

        # Elapsed time since the start of the search (cumulative over all k so far).
        op_time = time.time() - seconds
        print("Total operation time: = ", op_time, "seconds")

        wandb.log({
            "Cross-validation Score": avg_cv_score,
            "Total operation time": op_time
        })

    # Keep the configuration with the highest cross-validation score.
    if avg_cv_score > best_score:
        best_score = avg_cv_score
        best_k = k
        best_model = knn  # Still unfitted; it is fitted after the loop

# After the loop: train the best KNN model on the entire training set.
print(f"The best k is {best_k} with a cross-validation score of {best_score}")
best_model.fit(X_train_df, y_train_df)

# Evaluate on the first TEST_COUNT test rows only. This rebinds X_test_df and
# y_test_df, and later cells rely on the truncated versions.
X_test_df = X_test_df[:TEST_COUNT]
y_test_df = y_test_df[:TEST_COUNT]

# Make predictions on the test set
knn_predictions = best_model.predict(X_test_df)
print("Final predictions using the best model:", knn_predictions)

# Evaluate the performance on the test set
conf_matrix, class_report, acc_score = evaluate(y_test_df, knn_predictions, heading="Best KNN Model Evaluation")
print(f"Accuracy on test set with best k={best_k}: {acc_score}")

# Log the final test results to W&B
with wandb.init(project="pipeline-knn-covertype-dataset", name="best_model_evaluation"):
    wandb_log(conf_matrix, class_report, acc_score)

k:  3
Cross-validation score for k=3: 0.9658289756892552
Total operation time: =  148.28774857521057 seconds


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Cross-validation Score,▁
Total operation time,▁

0,1
Cross-validation Score,0.96583
Total operation time,148.28775


The best k is 3 with a cross-validation score of 0.9658289756892552
Final predictions using the best model: [1 2 2 2 2 3 2 1 2 2 2 1 5 2 2 1 2 5 3 2 1 2 2 1 6 1 1 2 1 2 1 2 2 3 1 2 2
 2 2 1 2 1 1 2 2 1 2 2 7 2 1 1 6 2 1 1 7 2 2 2 2 1 1 2 2 2 3 2 2 1 5 1 2 3
 1 3 2 1 1 2 2 1 2 2 2 1 1 3 1 2 1 1 2 1 3 3 2 2 1 1 6 6 2 2 1 2 5 6 6 1 2
 1 2 1 1 1 3 1 2 3 1 1 1 1 2 2 1 1 1 2 1 2 2 2 5 1 1 1 2 3 1 2 2 3 1 1 1 2
 2 2 1 7 2 2 1 1 2 2 1 2 1 3 1 2 2 2 2 2 2 2 2 1 2 1 1 1 1 2 2 2 1 2 2 2 2
 2 1 2 2 1 1 1 2 2 1 3 3 2 1 1 2 7 1 1 1 1 2 2 2 3 2 5 3 3 2 1 2 1 2 2 3 1
 2 2 3 2 2 2 1 1 2 5 1 1 1 2 2 1 2 2 1 1 2 1 1 6 2 1 2 7 2 1 3 5 1 2 3 2 2
 1 2 2 1 2 2 2 2 1 1 2 1 2 1 1 2 6 1 3 2 7 1 1 2 2 2 2 2 2 7 1 1 1 1 1 2 2
 5 2 2 2 1 6 2 1 1 2 7 1 2 3 2 2 2 2 2 6 1 5 2 2 1 6 2 2 1 1 1 1 2 2 2 7 3
 3 2 1 2 2 2 3 2 2 1 2 1 1 2 1 7 2 1 3 3 5 2 2 1 3 3 2 2 1 2 2 2 2 2 2 1 2
 2 1 2 2 1 2 1 1 2 2 2 1 2 1 3 1 3 2 2 2 6 2 3 2 2 1 2 2 6 1 2 1 2 2 2 2 1
 2 2 7 2 2 3 1 2 1 1 2 2 2 1 7 1 2 2 1 6 2 2 1 1 2 2 2 6 2 2 5 1 6 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011120529222211593, max=1.0…

VBox(children=(Label(value='0.005 MB of 0.006 MB uploaded\r'), FloatProgress(value=0.963471102726805, max=1.0)…

0,1
Accuracy Score,▁
f1-score_avg,▁
precision_avg,▁
recall_avg,▁

0,1
Accuracy Score,0.984
f1-score_avg,0.98297
precision_avg,0.98232
recall_avg,0.984


csv file Spruce-Fir.csv
importances [3.27792519e-01 2.79286653e-02 1.63415314e-02 6.75524954e-02
 4.69016726e-02 1.58180294e-01 2.81521628e-02 3.90647018e-02
 2.19325803e-02 1.57215058e-01 8.00204582e-03 5.00552027e-03
 1.04513981e-02 1.67397951e-07 0.00000000e+00 1.42215969e-05
 1.47483326e-05 1.93429342e-04 0.00000000e+00 0.00000000e+00
 6.95393215e-07 9.52867953e-05 1.83302616e-04 8.15366337e-04
 3.42573691e-04 1.53201802e-03 3.25231064e-03 0.00000000e+00
 0.00000000e+00 1.18171505e-03 2.42002225e-04 4.89926975e-06
 1.24509056e-03 3.00706212e-03 7.68923201e-04 1.23347759e-02
 1.08121855e-02 7.57167704e-03 1.39963583e-05 1.38922156e-04
 1.55518227e-03 1.18679619e-04 1.02146627e-02 2.72535698e-03
 6.58068874e-03 1.11305597e-02 6.67661334e-03 7.52689844e-04
 5.81856816e-06 1.06369486e-04 0.00000000e+00 6.02698132e-04
 7.64023417e-04 4.80643156e-04] Spruce-Fir
important_features ['Elevation', 'Horizontal_Distance_To_Roadways', 'Horizontal_Distance_To_Fire_Points', 'Horizontal_Distance_T

In [20]:
# Sanity check: number of rows currently held in the scaled test set.
print(len(X_test_scaled_df))

116203


In [15]:
# Show the SVC models collected by the training cell, keyed by dataset label.
print(SVMs)

{'Krummholz': SVC(), 'CottonwoodWillow': SVC(), 'Spruce-Fir': SVC(), 'Aspen': SVC(), 'DouglasFir': SVC(), 'PonderosaPine': SVC()}


In [21]:
# Second stage: for every test row that KNN did not label as the normal class,
# query the SVM trained for the cover type KNN predicted.
# Each entry is (second-stage prediction, KNN label, true label).
svm_predictions = []
seconds = time.time()

# Work on the first TEST_COUNT test rows only, matching the KNN predictions.
X_test_df = X_test_df[:TEST_COUNT]
X_test_scaled_df = X_test_scaled_df[:TEST_COUNT]
y_test_df = y_test_df[:TEST_COUNT]

for i in range(len(X_test_scaled_df)):
    # Single test row as a one-row DataFrame, so columns can be selected by name.
    test_instance_df = X_test_scaled_df.iloc[i].to_frame().T
    if knn_predictions[i] != NORMAL_TARGET:
        predicted_label = TARGET_DICT[knn_predictions[i]]
        print('predicted_label', predicted_label)
        selected_features = feature_importances[predicted_label]
        svm_model = SVMs[predicted_label]

        # The SVM only sees the features selected for this cover type.
        test_instance_selected_df = test_instance_df[selected_features]
        svm_prediction = svm_model.predict(test_instance_selected_df)
        print('svm prediction: ', svm_prediction, 'knn prediction: ', INV_TARGET_DICT[predicted_label], 'actual value: ', y_test_df.iloc[i])
        svm_predictions.append((svm_prediction, INV_TARGET_DICT[predicted_label], y_test_df.iloc[i]))
    else:
        # KNN says "normal": no second-stage model is consulted.
        svm_predictions.append((0, NORMAL_TARGET, y_test_df.iloc[i]))

print("Total operation time: = ", time.time() - seconds, "seconds")

predicted_label Spruce-Fir
svm prediction:  [0] knn prediction:  1 actual value:  1
predicted_label PonderosaPine
svm prediction:  [0] knn prediction:  3 actual value:  3
predicted_label Spruce-Fir
svm prediction:  [0] knn prediction:  1 actual value:  1
predicted_label Spruce-Fir
svm prediction:  [0] knn prediction:  1 actual value:  1
predicted_label Aspen
svm prediction:  [0] knn prediction:  5 actual value:  5
predicted_label Spruce-Fir
svm prediction:  [1] knn prediction:  1 actual value:  1
predicted_label Aspen
svm prediction:  [1] knn prediction:  5 actual value:  5
predicted_label PonderosaPine
svm prediction:  [0] knn prediction:  3 actual value:  3
predicted_label Spruce-Fir
svm prediction:  [0] knn prediction:  1 actual value:  1
predicted_label Spruce-Fir
svm prediction:  [0] knn prediction:  1 actual value:  1
predicted_label DouglasFir
svm prediction:  [0] knn prediction:  6 actual value:  6
predicted_label Spruce-Fir
svm prediction:  [0] knn prediction:  1 actual value:

In [22]:
# Evaluate the pipeline output, first on the rows KNN flagged as anomalies and
# then on all test rows. Each W&B run is opened with a context manager so it is
# finished even when evaluation or logging raises.
#
# NOTE(review): the labels evaluated below are t[1] of each tuple, i.e. the KNN
# label stored by the prediction loop. The SVM output (t[0]) is never used, so
# these metrics are the KNN metrics. Decide how the SVM verdict should be merged
# with the KNN label before reading them as SVM results.
with wandb.init(project="pipeline-covertype-dataset", name="anomalies-svm-only"):
    # Boolean mask of test rows that KNN classified as something other than the normal class.
    anomaly_mask = knn_predictions != NORMAL_TARGET
    print('anomaly_mask', len(anomaly_mask))

    # Predicted labels for every test row, plus the subset flagged as anomalies.
    svm_predictions_actual = [t[1] for t in svm_predictions]
    print(svm_predictions_actual)
    svm_predictions_actual_df = pd.DataFrame(svm_predictions_actual, columns=[TARGET_COLUMN])
    y_test_anomalies_df = y_test_df[anomaly_mask]
    svm_predictions_anomalies_df = svm_predictions_actual_df[anomaly_mask]
    print('Lengths: ', len(y_test_anomalies_df), len(svm_predictions_anomalies_df))

    # Evaluate only on the anomalies
    conf_matrix, class_report, acc_score = evaluate(y_test_anomalies_df, svm_predictions_anomalies_df, heading='SVM Evaluation (for anomalies)')
    wandb_log(conf_matrix, class_report, acc_score)

# Evaluate on all test data
with wandb.init(project="pipeline-covertype-dataset", name="overall-svm-only"):
    conf_matrix, class_report, acc_score = evaluate(y_test_df, svm_predictions_actual_df, heading='SVM Evaluation (for all)')
    wandb_log(conf_matrix, class_report, acc_score)

anomaly_mask 500
[1, 2, 2, 2, 2, 3, 2, 1, 2, 2, 2, 1, 5, 2, 2, 1, 2, 5, 3, 2, 1, 2, 2, 1, 6, 1, 1, 2, 1, 2, 1, 2, 2, 3, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 1, 2, 2, 7, 2, 1, 1, 6, 2, 1, 1, 7, 2, 2, 2, 2, 1, 1, 2, 2, 2, 3, 2, 2, 1, 5, 1, 2, 3, 1, 3, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 2, 1, 3, 3, 2, 2, 1, 1, 6, 6, 2, 2, 1, 2, 5, 6, 6, 1, 2, 1, 2, 1, 1, 1, 3, 1, 2, 3, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 2, 2, 5, 1, 1, 1, 2, 3, 1, 2, 2, 3, 1, 1, 1, 2, 2, 2, 1, 7, 2, 2, 1, 1, 2, 2, 1, 2, 1, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 1, 3, 3, 2, 1, 1, 2, 7, 1, 1, 1, 1, 2, 2, 2, 3, 2, 5, 3, 3, 2, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 2, 2, 1, 1, 2, 5, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 6, 2, 1, 2, 7, 2, 1, 3, 5, 1, 2, 3, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 6, 1, 3, 2, 7, 1, 1, 2, 2, 2, 2, 2, 2, 7, 1, 1, 1, 1, 1, 2, 2, 5, 2, 2, 2, 1, 6, 2, 1, 1, 2, 7, 1, 2, 3, 2, 2, 2, 2, 2, 6, 1, 5, 2, 2, 1, 6, 2, 2, 1, 1, 1, 1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.9630408127517954, max=1.0…

0,1
Accuracy Score,▁
f1-score_avg,▁
precision_avg,▁
recall_avg,▁

0,1
Accuracy Score,0.97719
f1-score_avg,0.97164
precision_avg,0.96643
recall_avg,0.97719


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111370556666063, max=1.0)…

SVM Evaluation (for all)
Confusion Matrix:
[[175   1   0   0   0   0   0]
 [  2 235   0   0   0   0   0]
 [  0   0  36   0   0   1   0]
 [  0   0   0   0   0   1   0]
 [  0   1   1   0  12   0   0]
 [  0   0   1   0   0  19   0]
 [  0   0   0   0   0   0  15]]

Classification Report:
{'1': {'precision': 0.9887005649717514, 'recall': 0.9943181818181818, 'f1-score': 0.9915014164305949, 'support': 176.0}, '2': {'precision': 0.9915611814345991, 'recall': 0.9915611814345991, 'f1-score': 0.9915611814345991, 'support': 237.0}, '3': {'precision': 0.9473684210526315, 'recall': 0.972972972972973, 'f1-score': 0.9599999999999999, 'support': 37.0}, '4': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1.0}, '5': {'precision': 1.0, 'recall': 0.8571428571428571, 'f1-score': 0.923076923076923, 'support': 14.0}, '6': {'precision': 0.9047619047619048, 'recall': 0.95, 'f1-score': 0.9268292682926829, 'support': 20.0}, '7': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 15.0}, '

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Accuracy Score,▁
f1-score_avg,▁
precision_avg,▁
recall_avg,▁

0,1
Accuracy Score,0.984
f1-score_avg,0.98297
precision_avg,0.98232
recall_avg,0.984


In [23]:
# Second stage with the voting ensembles: for every test row that KNN did not
# label as the normal class, query the ensemble trained for the cover type KNN
# predicted. Each entry is (second-stage prediction, KNN label, true label).
ensemble_predictions = []
seconds = time.time()

for i in range(len(X_test_scaled_df)):
    # Single test row as a one-row DataFrame, so columns can be selected by name.
    test_instance_df = X_test_scaled_df.iloc[i].to_frame().T
    if knn_predictions[i] != NORMAL_TARGET:
        predicted_label = TARGET_DICT[knn_predictions[i]]
        selected_features = feature_importances[predicted_label]
        model = ensemble_models[predicted_label]

        # The ensemble only sees the features selected for this cover type.
        test_instance_selected_df = test_instance_df[selected_features]
        ensemble_prediction = model.predict(test_instance_selected_df)
        print('ensemble prediction: ', ensemble_prediction, 'knn prediction: ', INV_TARGET_DICT[predicted_label], 'actual value: ', y_test_df.iloc[i])
        ensemble_predictions.append((ensemble_prediction, INV_TARGET_DICT[predicted_label], y_test_df.iloc[i]))
    else:
        # KNN says "normal": no second-stage model is consulted.
        ensemble_predictions.append((0, NORMAL_TARGET, y_test_df.iloc[i]))

print("Total operation time: = ", time.time() - seconds, "seconds")

ensemble prediction:  [1] knn prediction:  1 actual value:  1
ensemble prediction:  [0] knn prediction:  3 actual value:  3
ensemble prediction:  [1] knn prediction:  1 actual value:  1
ensemble prediction:  [1] knn prediction:  1 actual value:  1
ensemble prediction:  [1] knn prediction:  5 actual value:  5
ensemble prediction:  [1] knn prediction:  1 actual value:  1
ensemble prediction:  [1] knn prediction:  5 actual value:  5
ensemble prediction:  [0] knn prediction:  3 actual value:  3
ensemble prediction:  [1] knn prediction:  1 actual value:  1
ensemble prediction:  [1] knn prediction:  1 actual value:  1
ensemble prediction:  [0] knn prediction:  6 actual value:  6
ensemble prediction:  [1] knn prediction:  1 actual value:  1
ensemble prediction:  [1] knn prediction:  1 actual value:  1
ensemble prediction:  [1] knn prediction:  1 actual value:  1
ensemble prediction:  [1] knn prediction:  1 actual value:  1
ensemble prediction:  [0] knn prediction:  3 actual value:  3
ensemble

In [24]:
# Evaluate the pipeline output, first on the rows KNN flagged as anomalies and
# then on all test rows. Each W&B run is opened with a context manager so it is
# finished even when evaluation or logging raises.
#
# NOTE(review): the labels evaluated below are t[1] of each tuple, i.e. the KNN
# label stored by the prediction loop. The ensemble output (t[0]) is never used,
# so these metrics are the KNN metrics. Decide how the ensemble verdict should
# be merged with the KNN label before reading them as ensemble results.
with wandb.init(project="pipeline-covertype-dataset", name="anomalies-ensemble"):
    # Boolean mask of test rows that KNN classified as something other than the normal class.
    anomaly_mask = knn_predictions != NORMAL_TARGET
    print('anomaly_mask', len(anomaly_mask))

    # Predicted labels for every test row, plus the subset flagged as anomalies.
    ensemble_predictions_actual = [t[1] for t in ensemble_predictions]
    print(ensemble_predictions_actual)
    ensemble_predictions_actual_df = pd.DataFrame(ensemble_predictions_actual, columns=[TARGET_COLUMN])
    y_test_anomalies_df = y_test_df[anomaly_mask]
    ensemble_predictions_anomalies_df = ensemble_predictions_actual_df[anomaly_mask]
    print('Lengths: ', len(y_test_anomalies_df), len(ensemble_predictions_anomalies_df))

    # Evaluate only on the anomalies
    conf_matrix, class_report, acc_score = evaluate(y_test_anomalies_df, ensemble_predictions_anomalies_df, heading='Ensemble Evaluation (for anomalies)')
    wandb_log(conf_matrix, class_report, acc_score)

# Evaluate on all test data
with wandb.init(project="pipeline-covertype-dataset", name="overall-ensemble"):
    conf_matrix, class_report, acc_score = evaluate(y_test_df, ensemble_predictions_actual_df, heading='Ensemble Evaluation (for all)')
    wandb_log(conf_matrix, class_report, acc_score)

anomaly_mask 500
[1, 2, 2, 2, 2, 3, 2, 1, 2, 2, 2, 1, 5, 2, 2, 1, 2, 5, 3, 2, 1, 2, 2, 1, 6, 1, 1, 2, 1, 2, 1, 2, 2, 3, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 1, 2, 2, 7, 2, 1, 1, 6, 2, 1, 1, 7, 2, 2, 2, 2, 1, 1, 2, 2, 2, 3, 2, 2, 1, 5, 1, 2, 3, 1, 3, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 2, 1, 3, 3, 2, 2, 1, 1, 6, 6, 2, 2, 1, 2, 5, 6, 6, 1, 2, 1, 2, 1, 1, 1, 3, 1, 2, 3, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 2, 2, 5, 1, 1, 1, 2, 3, 1, 2, 2, 3, 1, 1, 1, 2, 2, 2, 1, 7, 2, 2, 1, 1, 2, 2, 1, 2, 1, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 1, 3, 3, 2, 1, 1, 2, 7, 1, 1, 1, 1, 2, 2, 2, 3, 2, 5, 3, 3, 2, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 2, 2, 1, 1, 2, 5, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 6, 2, 1, 2, 7, 2, 1, 3, 5, 1, 2, 3, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 6, 1, 3, 2, 7, 1, 1, 2, 2, 2, 2, 2, 2, 7, 1, 1, 1, 1, 1, 2, 2, 5, 2, 2, 2, 1, 6, 2, 1, 1, 2, 7, 1, 2, 3, 2, 2, 2, 2, 2, 6, 1, 5, 2, 2, 1, 6, 2, 2, 1, 1, 1, 1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.9630602240896359, max=1.0…

0,1
Accuracy Score,▁
f1-score_avg,▁
precision_avg,▁
recall_avg,▁

0,1
Accuracy Score,0.97719
f1-score_avg,0.97164
precision_avg,0.96643
recall_avg,0.97719


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112557288894702, max=1.0…

Ensemble Evaluation (for all)
Confusion Matrix:
[[175   1   0   0   0   0   0]
 [  2 235   0   0   0   0   0]
 [  0   0  36   0   0   1   0]
 [  0   0   0   0   0   1   0]
 [  0   1   1   0  12   0   0]
 [  0   0   1   0   0  19   0]
 [  0   0   0   0   0   0  15]]

Classification Report:
{'1': {'precision': 0.9887005649717514, 'recall': 0.9943181818181818, 'f1-score': 0.9915014164305949, 'support': 176.0}, '2': {'precision': 0.9915611814345991, 'recall': 0.9915611814345991, 'f1-score': 0.9915611814345991, 'support': 237.0}, '3': {'precision': 0.9473684210526315, 'recall': 0.972972972972973, 'f1-score': 0.9599999999999999, 'support': 37.0}, '4': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1.0}, '5': {'precision': 1.0, 'recall': 0.8571428571428571, 'f1-score': 0.923076923076923, 'support': 14.0}, '6': {'precision': 0.9047619047619048, 'recall': 0.95, 'f1-score': 0.9268292682926829, 'support': 20.0}, '7': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 15.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.005 MB of 0.006 MB uploaded\r'), FloatProgress(value=0.9634271978021978, max=1.0…

0,1
Accuracy Score,▁
f1-score_avg,▁
precision_avg,▁
recall_avg,▁

0,1
Accuracy Score,0.984
f1-score_avg,0.98297
precision_avg,0.98232
recall_avg,0.984
