# Testing Environment
This notebook is used for experimenting code blocks and performing tuning operations for models. It currently contains successful trials in all cells.

## Libraries

In [12]:
# Libraries and tools                                                        Purpose
import os #__________________________________________________________________Operating system manipulation
import sys #_________________________________________________________________System-specific parameters
import numpy as np #_________________________________________________________Numerical computations
import pandas as pd #________________________________________________________Data structures
from datetime import datetime #____________________________________________________________Data involving time
import textwrap #____________________________________________________________Line formatting
import matplotlib.pyplot as plt #____________________________________________Statistic visualizations
import seaborn as sns #______________________________________________________"
import joblib #______________________________________________________________Exporting ML models
from sklearn.cluster import KMeans #_________________________________________Clustering modeling
from sklearn.linear_model import LinearRegression, LogisticRegression #______Base ML models
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier #_____Tree models
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier #_"
from xgboost import XGBRegressor, XGBClassifier #____________________________Boost models
from lightgbm import LGBMRegressor, LGBMClassifier #_________________________"
from catboost import CatBoostRegressor, CatBoostClassifier #_________________"
from sklearn.pipeline import Pipeline #______________________________________Pipeline preprocessing
from sklearn.compose import ColumnTransformer #______________________________"
from sklearn.impute import SimpleImputer #___________________________________"
from sklearn.preprocessing import (StandardScaler, MinMaxScaler, #___________Preprocessing data scalers
    OneHotEncoder, LabelEncoder, PolynomialFeatures) #_______________________Feature engineering
from scipy import stats #____________________________________________________Probability and stat tests
from sklearn.model_selection import (train_test_split, cross_val_score, #____Model training & validation
    GridSearchCV, RandomizedSearchCV, StratifiedKFold) #_____________________"
from sklearn.metrics import (log_loss, r2_score, mean_squared_error, #_______Model evaluation
    root_mean_squared_error, accuracy_score, f1_score, precision_score, #____"
    recall_score, confusion_matrix, classification_report) #_________________"

In [13]:
def load_models():
    base_path = 'ml_models/'
    return {
        "Logistic Regression": joblib.load(os.path.join(base_path, 'logistic_regression.pkl')),
        "Decision Tree": joblib.load(os.path.join(base_path, 'decision_tree.pkl')),
        "Random Forest": joblib.load(os.path.join(base_path, 'random_forest.pkl')),
        "XG Boosting": joblib.load(os.path.join(base_path, 'xgboost.pkl')),
        "LightGBM": joblib.load(os.path.join(base_path, 'lightgbm.pkl')),
        "CatBoost": joblib.load(os.path.join(base_path, 'catboost.pkl'))
    }


## Import and Inspect Data

In [15]:
#------------------------
# Load and inspect data
#------------------------

df = pd.read_csv("data/pdw_dataset.csv")
df.info()
print('\n\n\nEXAMPLE DATA:')
display(df.sample())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11997 entries, 0 to 11996
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Timestamp       11997 non-null  object 
 1   Emitter_ID      11997 non-null  int64  
 2   Radar_Function  11997 non-null  object 
 3   RF_MHz          11997 non-null  object 
 4   PW_us           11947 non-null  float64
 5   PRI_us          11946 non-null  float64
 6   Amplitude_dB    11945 non-null  float64
 7   DOA_deg         11951 non-null  float64
 8   Location_MGRS   11997 non-null  object 
dtypes: float64(4), int64(1), object(4)
memory usage: 843.7+ KB



EXAMPLE DATA:


Unnamed: 0,Timestamp,Emitter_ID,Radar_Function,RF_MHz,PW_us,PRI_us,Amplitude_dB,DOA_deg,Location_MGRS
7512,2025-12-13 16:56:57,7,Target Illumination,10458.54412226765,3.293423,56.033804,2.416301,216.378221,51STV518067


In [16]:
cols = ['RF_MHz', 'PW_us', 'PRI_us', 'Amplitude_dB', 'DOA_deg']
df[['RF_MHz', 'PW_us', 'PRI_us', 'Amplitude_dB', 'DOA_deg']].describe()

Unnamed: 0,PW_us,PRI_us,Amplitude_dB,DOA_deg
count,11947.0,11946.0,11945.0,11951.0
mean,2.200409,342.179367,11.433984,161.387265
std,1.575861,246.304471,6.519501,112.952144
min,0.01,36.389272,-9.927856,-42.229623
25%,1.018049,115.536763,6.958043,53.166461
50%,1.840423,352.676286,11.452169,175.61049
75%,3.131354,450.17077,15.952548,266.353439
max,6.744222,857.814994,34.055909,396.585883


In [17]:
#------------------------
# Data Cleaning
#------------------------

def clean_df(df):
    """
    First function to be called to clean the dataframe by converting feature columns to numeric, converting timestamps to datetime and reordering the columns.

    Parameters
    ----------
    df : pandas.DataFrame -> The dataframe to be cleaned.

    Returns
    -------
    pandas.DataFrame ->A cleaned version of the dataframe.
    """
    # Variables
    clean_df = df.copy()
    feature_cols = ['RF_MHz', 'PW_us', 'PRI_us', 'Amplitude_dB']
    
    # Dtype Conversions
    for col in feature_cols:
        clean_df[col] = pd.to_numeric(clean_df[col], errors='coerce')  # Convert strings or bad values to NaN
    
    date_time = pd.to_datetime(clean_df['Timestamp'])
    clean_df['Date'] = date_time.dt.date   # Convert to datetime
    clean_df['Time'] = date_time.dt.time   # Convert to datetime
    clean_df['DDHHMMZ'] = date_time.dt.strftime('%d%H%MZ')

    # Remove duplicate rows and missing data
    clean_df = clean_df.dropna()
    clean_df = clean_df.drop_duplicates(keep='last')

   # Generate Report
    julian_day = datetime.now().strftime('%j')
    dt_now = datetime.now().strftime('%d%H%M')

    header = 'EXERCISE EXERCISE EXERCISE'
    classification = 'U N C L A S S I F I E D'
    op_line = 'OPER/GALVANIZE//'
    msgid = f'MSGID/FAKEREP/BC/{julian_day}//'
    clean_df['SOI'] = clean_df.apply(lambda x: (f'SOI/-/{x['DDHHMMZ']}/{dt_now}Z/SYS_NOT/EMITTER_{x['Emitter_ID']})//'), axis=1)
    clean_df['NARR'] = clean_df.apply(lambda x: (f"""NARR: ON {x['Date']} {x['Radar_Function'].upper()} SYSTEM (EMITTER_{x['Emitter_ID']}) WAS OBSERVED IVO {x['Location_MGRS']}."""), axis=1)
    clean_df['PARAMS'] = clean_df.apply(lambda x: (f"""PRMS/FREQ:{x['RF_MHz']:.4f} MHZ/PRI:{x['PRI_us']:010.3f}/PW:{x['PW_us']:.3f}/AMP:{x['Amplitude_dB']:.3f}DB/\nDOA:{x['DOA_deg']:.3f} DEGREES"""), axis=1)
    ampl_line = 'AMPL/AUTOGENERATED WITH MACHINE LEARNING//'
    
    def char_limit(row):
        limit = textwrap.fill(row, width=69)
        return limit

    clean_df['NARR'] = clean_df['NARR'].apply(char_limit)

    clean_df['Report'] = clean_df.apply(lambda x: (f"""{header}\n{classification}\n{op_line}\n{msgid}\n{x['SOI']}\n{x['NARR'].replace(' / ', '/')}\n{x['PARAMS']}\n{ampl_line}"""), axis=1)

    # Reorder
    clean_df = clean_df[['Date', 'Time', 'Emitter_ID', 'Radar_Function', 'RF_MHz', 'PW_us', 'PRI_us', 'Amplitude_dB', 'DOA_deg', 'Location_MGRS', 'Report']] # Reorder
    clean_df = clean_df.sort_values(by='Date').reset_index(drop=True)  # Arrange by date

    return clean_df

In [18]:
#------------------------
# Apply Cleaning Functions
#------------------------

new_df = clean_df(df)
new_df.info()
display(new_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11643 entries, 0 to 11642
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            11643 non-null  object 
 1   Time            11643 non-null  object 
 2   Emitter_ID      11643 non-null  int64  
 3   Radar_Function  11643 non-null  object 
 4   RF_MHz          11643 non-null  float64
 5   PW_us           11643 non-null  float64
 6   PRI_us          11643 non-null  float64
 7   Amplitude_dB    11643 non-null  float64
 8   DOA_deg         11643 non-null  float64
 9   Location_MGRS   11643 non-null  object 
 10  Report          11643 non-null  object 
dtypes: float64(5), int64(1), object(5)
memory usage: 1000.7+ KB


Unnamed: 0,Date,Time,Emitter_ID,Radar_Function,RF_MHz,PW_us,PRI_us,Amplitude_dB,DOA_deg,Location_MGRS,Report
0,2025-11-23,15:46:27,8,Medium-Range Search / Ground Mapping,9755.074702,1.950935,446.24279,13.020642,93.62926,51RWL215712,EXERCISE EXERCISE EXERCISE\nU N C L A S S I F ...
1,2025-11-23,15:40:12,8,Medium-Range Search / Ground Mapping,9788.016668,2.263802,447.77872,20.299857,93.80828,51RWL215712,EXERCISE EXERCISE EXERCISE\nU N C L A S S I F ...
2,2025-11-23,15:36:20,8,Medium-Range Search / Ground Mapping,9715.207387,3.137078,343.786447,10.824631,82.818948,51RWL215712,EXERCISE EXERCISE EXERCISE\nU N C L A S S I F ...
3,2025-11-23,15:50:09,8,Medium-Range Search / Ground Mapping,9724.553986,1.979538,402.121175,23.705022,108.880139,51RWL215712,EXERCISE EXERCISE EXERCISE\nU N C L A S S I F ...
4,2025-11-23,15:16:33,8,Medium-Range Search / Ground Mapping,9765.702453,1.667303,353.210397,18.47169,100.283732,51RWL215712,EXERCISE EXERCISE EXERCISE\nU N C L A S S I F ...


In [20]:
print(new_df['Report'][0])

EXERCISE EXERCISE EXERCISE
U N C L A S S I F I E D
OPER/GALVANIZE//
MSGID/FAKEREP/BC/350//
SOI/-/231546Z/162305Z/SYS_NOT/EMITTER_8)//
NARR: ON 2025-11-23 MEDIUM-RANGE SEARCH/GROUND MAPPING SYSTEM
(EMITTER_8) WAS OBSERVED IVO 51RWL215712.
PRMS/FREQ:9755.0747 MHZ/PRI:000446.243/PW:1.951/AMP:13.021DB/
DOA:93.629 DEGREES
AMPL/AUTOGENERATED WITH MACHINE LEARNING//


## Data Prep for ML

In [21]:
#------------------------
# Split data into features and target
#------------------------

X = new_df.drop(columns=['Date', 'Time', 'Emitter_ID', 'Radar_Function', 'DOA_deg', 'Location_MGRS', 'Report'], axis=1)
y = new_df['Emitter_ID']

In [22]:
#------------------------
# Classify features
#------------------------

numeric_features = X.select_dtypes(include=['float', 'int']).columns
categorical_features = X.select_dtypes(include='object').columns

#------------------------
# Split into train and test sets
#------------------------

X_train, X_test, y_train, y_test = train_test_split(X, y, 
    test_size=0.2, random_state=42)

# Adjustments for XGBoost and LightGBM models because they require 0-indexing
if y.min() == 1:        
    y_train_adj = y_train - 1
    y_test_adj = y_test - 1
else:
    y_train_adj = y_train.copy()
    y_test_adj = y_test.copy()

## Pipeline

In [28]:
#------------------------------
# Data Transformers
# -----------------------------
  
num_xformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler())])
# PolynomialFeatures for Logistic Regression ONLY
num_xformer_poly = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
        ('scaler', MinMaxScaler())])  

cat_xformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))])
# Create pipelines for CatBoost without OneHotEncoder
cat_xformer_cb = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing"))])

#------------------------------
# Preprocessors
# -----------------------------

preprocessor = ColumnTransformer(transformers=[
        ("num", num_xformer, numeric_features),
        ("cat", cat_xformer, categorical_features)], 
        remainder="drop")
preprocessor.set_output(transform="pandas") # to retain column names

preprocessor_poly = ColumnTransformer(transformers=[
        ("num", num_xformer_poly, numeric_features),
        ("cat", cat_xformer, categorical_features)], 
        remainder="drop")
preprocessor_poly.set_output(transform="pandas") # to retain column names

# Preprocessing for CatBoost
cb_preprocessor = ColumnTransformer(transformers=[
        ("num", num_xformer, numeric_features),
        ("cat", cat_xformer_cb, categorical_features)], 
        remainder="drop")
cb_preprocessor.set_output(transform="pandas") # to retain column names

#------------------------------
# Model Pipes
# -----------------------------

logistic_pipe = Pipeline(steps=[
        ("preprocess", preprocessor_poly),
        ("model", LogisticRegression(max_iter=1000, random_state=42))])
    
tree_pipe = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", DecisionTreeClassifier(random_state=42))])

forest_pipe = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", RandomForestClassifier(max_depth=12, min_samples_split=7, n_estimators=63, random_state=42))])

xgb_pipe = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", XGBClassifier(random_state=42))])

lgbm_pipe = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", LGBMClassifier(verbose=-1, random_state=42))])

cat_pipe = Pipeline(steps=[
        ("preprocess", cb_preprocessor),
        ("model", CatBoostClassifier(verbose=0, random_state=42))])

## Model Training

In [29]:
#------------------------------
# Logistic Regression Pipe
# -----------------------------

logistic_pipe.fit(X_train, y_train)
y_pred = logistic_pipe.predict(X_test)

print(f'F1 Score: {f1_score(y_test, y_pred, average="weighted"):.4f}')
print(f'Accuracy Score: {accuracy_score(y_test, y_pred)*100:.2f}%')

F1 Score: 0.9405
Accuracy Score: 94.55%


In [30]:
#------------------------------
# Decision Tree Pipe
# -----------------------------

tree_pipe.fit(X_train, y_train)
y_pred = tree_pipe.predict(X_test)

print(f'F1 Score: {f1_score(y_test, y_pred, average="weighted"):.4f}')
print(f'Accuracy Score: {accuracy_score(y_test, y_pred)*100:.2f}%')

F1 Score: 0.9707
Accuracy Score: 97.04%


In [31]:
#------------------------------
# Random Forest Pipe
# -----------------------------

forest_pipe.fit(X_train, y_train)
y_pred = forest_pipe.predict(X_test)

print(f'F1 Score: {f1_score(y_test, y_pred, average="weighted"):.4f}')
print(f'Accuracy Score: {accuracy_score(y_test, y_pred)*100:.2f}%')

F1 Score: 0.9811
Accuracy Score: 98.11%


In [32]:
#------------------------------
# XGBoost Pipe
# -----------------------------

xgb_pipe.fit(X_train, y_train_adj)
y_pred = xgb_pipe.predict(X_test)

print(f'F1 Score: {f1_score(y_test_adj, y_pred, average="weighted"):.4f}')
print(f'Accuracy Score: {accuracy_score(y_test_adj, y_pred)*100:.2f}%')

F1 Score: 0.9689
Accuracy Score: 96.87%


In [33]:
#------------------------------
# LightGBM Pipe
# -----------------------------

lgbm_pipe.fit(X_train, y_train_adj)
y_pred = lgbm_pipe.predict(X_test)

print(f'F1 Score: {f1_score(y_test_adj, y_pred, average="weighted"):.4f}')
print(f'Accuracy Score: {accuracy_score(y_test_adj, y_pred)*100:.2f}%')

F1 Score: 0.9703
Accuracy Score: 96.99%


In [34]:
#------------------------------
# CatBoost Pipe
# -----------------------------

cat_pipe.fit(X_train, y_train)
y_pred = cat_pipe.predict(X_test)

print(f'F1 Score: {f1_score(y_test, y_pred, average="weighted"):.4f}')
print(f'Accuracy Score: {accuracy_score(y_test, y_pred)*100:.2f}%')

F1 Score: 0.9717
Accuracy Score: 97.17%


In [35]:
#------------------------------
# All Pipes
# -----------------------------

all_models = {
    'Logistic Regression': logistic_pipe,
    'Decision Tree': tree_pipe,
    'Random Forest': forest_pipe,
    'XGBoost': xgb_pipe,
    'LightGBM': lgbm_pipe,
    'CatBoost': cat_pipe
}

for name, model in all_models.items():
    
    if model == xgb_pipe or model == lgbm_pipe:
        y_true = y_test_adj
        model.fit(X_train, y_train_adj)
    else:
        y_true = y_test
        model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    f1 = f1_score(y_true, y_pred, average='weighted')

    print(f"{name} score: {f1 * 100:.2f}%")

Logistic Regression score: 94.05%
Decision Tree score: 97.07%
Random Forest score: 98.11%
XGBoost score: 96.89%
LightGBM score: 97.03%
CatBoost score: 97.17%


## Hyper Parameter Tuning

In [36]:
#------------------------------
# Parameter Dictionary for GridSearchCV
# -----------------------------

params_gridscv = {
   "Logistic Regression": {
       'pipe': logistic_pipe,
       'grid': {
            'model__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100], # Regularization - penalized for large coefficients
            'model__max_iter': [1000]                   # Increased number of passes for solver to run
    }},

    "Decision Tree": {
        'pipe': tree_pipe,
        'grid': {
            'model__max_depth': [2, 3, 5, 7, 10, None], # Depth of tree decisions
            'model__min_samples_split': [2, 3, 5, 7],   # Minimum num of samples required at each node
            'model__min_samples_leaf': [2, 5, 7, 10]    # Minimum num of samples required at each final grouping
    }},

    "Random Forest": {
        'pipe': forest_pipe,
        'grid': {    
            'model__n_estimators': [100, 200, 400],  # Number of trees
            'model__max_depth': [10, 15, None],      # Depth of tree decisions
            'model__min_samples_split': [2, 3, 5, 7] # minimum num of samples required at each node
    }}
}

#------------------------------
# Parameter Dictionary for RandomizedSearchCV
# -----------------------------

params_randomscv = {
    'XGBoost': {
        'pipe': xgb_pipe,
        'dist': {
            'model__learning_rate': stats.uniform(.03, .2),     # Range 0.03 to 0.23
            'model__n_estimators': stats.randint(100, 1000),    # Number of boosting stages
            'model__max_depth': stats.randint(3, 8)             # Tree complexity
    }},

    'LightGBM': {
        'pipe': lgbm_pipe,
        'dist': {
            'model__learning_rate': stats.uniform(.03, .2),     # Range 0.03 to 0.23
            'model__n_estimators': stats.randint(100, 500),     # Number of boosting stages
            'model__num_leaves': stats.randint(10, 100)         # Tree complexity

    }},

    'CatBoost': {
        'pipe': cat_pipe,
        'dist': {
            'model__learning_rate': stats.uniform(.03, .2),     # Range 0.03 to 0.23
            'model__iterations': stats.randint(100, 1000),      # Number of instance
            'model__max_depth': stats.randint(3, 10),           # Complexity
    }}
    
}

In [37]:
#------------------------------
# Running GridSearchCV on Logistic Regression, Decision Tree, and Random Forest
# -----------------------------

print("Starting GridSearchCV for selected models...")
results = {}

for name, params in params_gridscv.items():
    print(f"\nTuning {name}...")
    
    grid_scv = GridSearchCV(
        estimator=params['pipe'], 
        param_grid=params['grid'], 
        scoring='f1_weighted', # Metric: F1 Score Weighted
        cv=5,                 # 5-fold cross-validation
        n_jobs=-1,            # Use all available cores
        verbose=1             # Print progress
    )
    
    grid_scv.fit(X_train, y_train)
    y_preds = grid_scv.predict(X_test)
    
    prec = precision_score(y_test, y_preds, average='weighted')
    recall = recall_score(y_test, y_preds, average='weighted')
    f1 = f1_score(y_test, y_preds, average='weighted')

    # Store and print results
    results[name] = {
        'best_score': grid_scv.best_score_,
        'best_params': grid_scv.best_params_
    }
    print(f' -> Precision = {prec:.4f}, Recall = {recall:.4f}, F1 = {f1:.4f}')

print("\nGrudSearchCV complete.")

Starting GridSearchCV for selected models...

Tuning Logistic Regression...
Fitting 5 folds for each of 7 candidates, totalling 35 fits
 -> Precision = 0.9655, Recall = 0.9665, F1 = 0.9640

Tuning Decision Tree...
Fitting 5 folds for each of 96 candidates, totalling 480 fits
 -> Precision = 0.9737, Recall = 0.9721, F1 = 0.9726

Tuning Random Forest...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
 -> Precision = 0.9776, Recall = 0.9772, F1 = 0.9774

GrudSearchCV complete.


In [38]:
#------------------------------
# GridSearchCV Results
# -----------------------------

for name, result in results.items():
    print(f"{name} best score: {result['best_score']:.4f}")
    print(f"{name} best params: {result['best_params']}\n") 

Logistic Regression best score: 0.9642
Logistic Regression best params: {'model__C': 100, 'model__max_iter': 1000}

Decision Tree best score: 0.9723
Decision Tree best params: {'model__max_depth': None, 'model__min_samples_leaf': 10, 'model__min_samples_split': 2}

Random Forest best score: 0.9765
Random Forest best params: {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 100}



In [31]:
#------------------------------
# Running RandomSearchCV on XGBoost, LightGBM, and CatBoost
# -----------------------------

print("Starting RandomSearchCV for boost models...")
results2 = {}

for name, params in params_randomscv.items():
    print(f"\nTuning {name}...")
    
    rando_scv = RandomizedSearchCV(
        estimator=params['pipe'], 
        param_distributions=params['dist'], 
        scoring='f1_weighted', # Metric: F1 Score Weighted
        cv=3,                 # 5-fold cross-validation
        n_jobs=-1,            # Use all available cores
        verbose=1             # Print progress
    )
    
    if name == 'XGBoost' or name == 'LightGBM':
        rando_scv.fit(X_train, y_train_adj)
        y_true = y_test_adj
    else:
        rando_scv.fit(X_train, y_train)
        y_true = y_test

    y_preds2 = rando_scv.predict(X_test)
    
    prec2 = precision_score(y_test, y_preds, average='weighted')
    recall2 = recall_score(y_test, y_preds, average='weighted')
    f1_2 = f1_score(y_test, y_preds, average='weighted')

    # Store and print results
    results2[name] = {
        'best_score': rando_scv.best_score_,
        'best_params': rando_scv.best_params_
    }
    print(f' -> Precision = {prec2:.4f}, Recall = {recall2:.4f}, F1 = {f1_2:.4f}')

print("\nRandomSearchCV complete.")

Starting RandomSearchCV for boost models...

Tuning XGBoost...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
 -> Precision = 0.9776, Recall = 0.9772, F1 = 0.9774

Tuning LightGBM...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
 -> Precision = 0.9776, Recall = 0.9772, F1 = 0.9774

Tuning CatBoost...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
 -> Precision = 0.9776, Recall = 0.9772, F1 = 0.9774


In [32]:
#------------------------------
# RandomSearchCV Results
# -----------------------------

for name, result in results2.items():
    print(f"{name} best score: {result['best_score']:.4f}")
    print(f"{name} best params: {result['best_params']}\n") 

XGBoost best score: 0.9747
XGBoost best params: {'model__learning_rate': np.float64(0.06496044215937306), 'model__max_depth': 3, 'model__n_estimators': 238}

LightGBM best score: 0.9724
LightGBM best params: {'model__learning_rate': np.float64(0.1501089469576768), 'model__n_estimators': 106, 'model__num_leaves': 74}

CatBoost best score: 0.9765
CatBoost best params: {'model__iterations': 775, 'model__learning_rate': np.float64(0.030851196164042166), 'model__max_depth': 3}



### Mega-Tune!
Random Forest outperformed the other models in both the base testing and cross validation. Let's perform one more GridSearchCV with just Random Forest using a larger array of hyperparameters to get it just right.

In [71]:
#------------------------------
# Mega GridSearchCV
# -----------------------------

enhanced_gridscv = {
    "Decision Tree": {
        'pipe': tree_pipe,
        'grid': {    
            'model__min_samples_leaf': list(range(0, 101, 10)),  # Number of trees
            'model__max_depth': list(range(1, 16, 2)),        # Depth of tree decisions
            'model__min_samples_split': list(range(2, 10, 1)) # minimum num of samples required at each node
    }}
}

In [72]:
# #------------------------------
# # Running Mega GridSearchCV on Random Forest
# # -----------------------------

print("Starting comprehensive GridSearchCV...")
results3 = {}

for name, params in enhanced_gridscv.items():
    print(f"\nTuning {name}...")
    
    grid_scv2 = GridSearchCV(
        estimator=params['pipe'], 
        param_grid=params['grid'], 
        scoring='f1_weighted',   # Metric: F1 Score Weighted
        cv=5,                    # 5-fold cross-validation
        n_jobs=-1,               # Use all available cores
        verbose=0,               # Print progress dismissed
        error_score= np.nan      # Ignore errors
    )
    
    grid_scv2.fit(X_train, y_train)
    y_preds = grid_scv2.predict(X_test)
    
    prec = precision_score(y_test, y_preds, average='weighted')
    recall = recall_score(y_test, y_preds, average='weighted')
    f1 = f1_score(y_test, y_preds, average='weighted')

    # Store and print results
    results3[name] = {
        'best_score': grid_scv2.best_score_,
        'best_params': grid_scv2.best_params_
    }

    print(f' -> Precision = {prec:.4f}, Recall = {recall:.4f}, F1 = {f1:.4f}')

    for name, result in results3.items():
        print(f"{name} best score: {result['best_score']:.4f}")
        print(f"{name} best params: {result['best_params']}\n") 

Starting comprehensive GridSearchCV...

Tuning Decision Tree...
 -> Precision = 0.9737, Recall = 0.9721, F1 = 0.9726
Decision Tree best score: 0.9728
Decision Tree best params: {'model__max_depth': 13, 'model__min_samples_leaf': 10, 'model__min_samples_split': 2}



320 fits failed out of a total of 3520.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\burto\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\burto\anaconda3\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\burto\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
    ~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  F

## Reports Filter

In [27]:
sys_of_interest = ['Short-Range Fire Control / Tracking', 'Target Acquisition / Tracking', 'Target Illumination'] # Use in streamlit
filtered_df = new_df[new_df['Radar_Function'].isin(sys_of_interest)]

filtered_df = filtered_df.reset_index(drop=True)
filtered_df.info()    

NameError: name 'new_df' is not defined

In [26]:
display(filtered_df.head())

NameError: name 'filtered_df' is not defined

In [57]:
log_mod = joblib.load('ml_models/logistic_regression.pkl')
tree_mod = joblib.load('ml_models/decision_tree.pkl')
forest_mod = joblib.load('ml_models/random_forest.pkl')
xgb_mod = joblib.load('ml_models/xgboost.pkl')
lgb_mod = joblib.load('ml_models/lightgbm.pkl')
cb_mod = joblib.load('ml_models/catboost.pkl')

In [58]:
model_metadata = {'size (kb)' : {'Logistic Regression' : os.path.getsize('ml_models/logistic_regression.pkl') / 1024,
                              'Decision Tree' : os.path.getsize('ml_models/decision_tree.pkl') / 1024,
                              'Random Forest' : os.path.getsize('ml_models/random_forest.pkl') / 1024,
                              'XG Boost' : os.path.getsize('ml_models/xgboost.pkl') / 1024,
                              'LightGBM' : os.path.getsize('ml_models/lightgbm.pkl') / 1024,
                              'CatBoost' : os.path.getsize('ml_models/catboost.pkl') / 1024},   
                    'accuracy' : {'Logistic Regression' : log_mod['metadata']['accuracy'],
                                  'Decision Tree' : tree_mod['metadata']['accuracy'],
                                  'Random Forest' : forest_mod['metadata']['accuracy'],
                                  'XG Boost' : xgb_mod['metadata']['accuracy'],
                                  'LightGBM' : lgb_mod['metadata']['accuracy'],
                                  'CatBoost' : cb_mod['metadata']['accuracy']},
                    'f1_score' : {'Logistic Regression' : log_mod['metadata']['f1_score_weighted'],
                                  'Decision Tree' : tree_mod['metadata']['f1_score_weighted'],
                                  'Random Forest' : forest_mod['metadata']['f1_score_weighted'],
                                  'XG Boost' : xgb_mod['metadata']['f1_score_weighted'],
                                  'LightGBM' : lgb_mod['metadata']['f1_score_weighted'],
                                  'CatBoost' : cb_mod['metadata']['f1_score_weighted']},
                    'train_time' : {'Logistic Regression' : log_mod['metadata']['train_time_sec'],
                                  'Decision Tree' : tree_mod['metadata']['train_time_sec'],
                                  'Random Forest' : forest_mod['metadata']['train_time_sec'],
                                  'XG Boost' : xgb_mod['metadata']['train_time_sec'],
                                  'LightGBM' : lgb_mod['metadata']['train_time_sec'],
                                  'CatBoost' : cb_mod['metadata']['train_time_sec']}}

model_df = pd.DataFrame(model_metadata)

In [59]:
model_df.head(6)

Unnamed: 0,size (kb),accuracy,f1_score,train_time
Logistic Regression,7.612305,0.967368,0.965573,0.298234
Decision Tree,30.139648,0.972091,0.972565,0.038235
Random Forest,2068.443359,0.978961,0.979037,0.648392
XG Boost,2033.99707,0.973379,0.973575,0.860013
LightGBM,1954.504883,0.973379,0.973622,0.61532
CatBoost,329.891602,0.975955,0.975941,2.002023
