<a href="https://colab.research.google.com/github/ced-sys/SubTerra/blob/main/SubTerra_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [None]:
# XGBoost is an optional dependency: record in XGBOOST_AVAILABLE whether the
# import succeeded so that code using XGBClassifier can check the flag first.
try:
  from xgboost import XGBClassifier
  XGBOOST_AVAILABLE=True
except ImportError:
  # Keep the notebook usable without XGBoost; only tell the user how to get it.
  XGBOOST_AVAILABLE=False
  print("Warning: XGBoost not available. Install with: pip install xgboost")

In [None]:
# Shared notebook state, filled in by the pipeline functions.
# The registry is named `models` (plural) because train_models declares
# `global models` and indexes `models[...]`; binding it as `model` left that
# name undefined.
models={}              # model name -> estimator
X_train=None           # set by split_data
X_test=None            # set by split_data
y_train=None           # set by split_data
y_test=None            # set by split_data
feature_names=None     # set by load_and_preprocess_data
scaler=StandardScaler()  # fitted on the training split inside split_data

In [None]:
def load_and_preprocess_data(filepath='/content/training_dataset.csv',
                             drop_keywords=('fid', 'path', 'layer', 'id')):
  """Load the training CSV and return numeric features plus the label column.

  Steps: read the CSV, drop identifier/bookkeeping columns, split off the
  'label' column, coerce every remaining column to numeric, discard columns
  that contain no numeric value at all, and fill the remaining gaps with the
  column median. The surviving column names are stored in the module-level
  ``feature_names``.

  Args:
    filepath: Location of the dataset; passed straight to ``pd.read_csv``.
    drop_keywords: Case-insensitive words that mark a column as a
      non-feature. A column is dropped when its whole name, or one of its
      name tokens, equals a keyword. Names are tokenised on non-alphanumeric
      characters and camelCase boundaries, so 'fid', 'site_id' and
      'layerName' are dropped while 'width' and 'humidity' are kept. Fused
      names such as 'objectid' must be listed explicitly.

  Returns:
    Tuple ``(X, y)`` where ``X`` is the numeric feature DataFrame and ``y``
    is the 'label' column.

  Raises:
    FileNotFoundError: If ``filepath`` does not exist.
    ValueError: If the dataset has no 'label' column.
    Exception: If reading the file fails for any other reason.
  """
  global feature_names

  import re  # function-scope import: the block stays runnable on its own

  print(f"Loading dataset from: {filepath}")
  # Only the read is guarded, so later errors keep their own exception type.
  try:
    df = pd.read_csv(filepath)
  except FileNotFoundError as err:
    raise FileNotFoundError(f"Dataset file not found: {filepath}") from err
  except Exception as err:
    raise Exception(f"Error loading dataset: {str(err)}") from err
  print(f"Dataset shape: {df.shape}")

  # Remove non-feature columns. Matching whole name tokens (not substrings)
  # keeps genuine features such as 'width' or 'humidity', which merely
  # contain the letters "id".
  keywords = {str(word).lower() for word in drop_keywords}
  token_pattern = re.compile(r'[A-Z]+(?![a-z])|[A-Z]?[a-z]+|[0-9]+')

  def is_non_feature(column):
    name = str(column)
    if name.lower() in keywords:
      return True
    return any(token.lower() in keywords
               for token in token_pattern.findall(name))

  drop_cols = [col for col in df.columns if is_non_feature(col)]
  if drop_cols:
    print(f"Dropping columns: {drop_cols}")
  df_cleaned = df.drop(columns=drop_cols)

  # Separate features and labels
  if 'label' not in df_cleaned.columns:
    raise ValueError("No 'label' column found in dataset")

  X = df_cleaned.drop(columns=['label'])
  y = df_cleaned['label']

  # Non-numeric entries become NaN and are handled below
  X = X.apply(pd.to_numeric, errors='coerce')

  # A column with no numeric value has a NaN median, so the median fill below
  # could never repair it; drop it instead of passing NaNs downstream.
  if len(X) > 0:
    all_missing = X.isnull().all()
    empty_cols = list(all_missing[all_missing].index)
    if empty_cols:
      print(f"Dropping columns with no numeric values: {empty_cols}")
      X = X.drop(columns=empty_cols)

  # Fill the remaining gaps with each column's median
  missing_counts = X.isnull().sum()
  if missing_counts.any():
    print(f"Missing values found: {missing_counts[missing_counts>0]}")
    X = X.fillna(X.median())

  feature_names = list(X.columns)
  print(f"Features: {len(feature_names)}")
  print(f"Label distribution: {y.value_counts().to_dict()}")

  return X, y





In [1]:
def split_data(X, y, test_size=0.25, random_state=42):
  """Create a stratified train/test split plus standardised feature copies.

  The four parts of the split are also stored in the module-level X_train,
  X_test, y_train and y_test. The module-level scaler is fitted on the
  training features and then applied unchanged to the test features.

  Args:
    X: Feature table.
    y: Labels; also used to stratify the split.
    test_size: Share of the samples held out for testing.
    random_state: Seed forwarded to train_test_split.

  Returns:
    Tuple (X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled).
  """
  global X_train, X_test, y_train, y_test, scaler

  split_options = dict(
      test_size=test_size,
      random_state=random_state,
      shuffle=True,
      stratify=y,
  )
  X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

  # Scale features for better performance: fit on the training split only,
  # then reuse the same fitted scaler for the test split.
  scaled_train = scaler.fit_transform(X_train)
  scaled_test = scaler.transform(X_test)

  print(f"Training set: {X_train.shape[0]} samples")
  print(f"Testing set: {X_test.shape[0]} samples")

  return X_train, X_test, y_train, y_test, scaled_train, scaled_test

In [2]:
def train_models(X_train, y_train, X_train_scaled, random_state=42):
  """Build and fit every available classifier and return them keyed by name.

  A decision tree and a random forest are always trained on the unscaled
  features. When XGBOOST_AVAILABLE is true an XGBoost classifier is added
  under the key 'xgboost' and trained on the scaled features.

  Args:
    X_train: Unscaled training features.
    y_train: Training labels.
    X_train_scaled: Scaled training features (used for XGBoost only).
    random_state: Seed passed to every estimator.

  Returns:
    Dict mapping model name to fitted estimator. The same dict is bound to
    the module-level ``models``.
  """
  global models

  print("\nTraining models...")

  # Rebind the global registry on every call: the function then works even if
  # `models` was never created at module level, and no stale estimator from an
  # earlier run survives.
  models = {}

  # Decision tree
  models['decision_tree'] = DecisionTreeClassifier(
      max_depth=8,
      min_samples_split=10,
      min_samples_leaf=5,
      random_state=random_state
  )

  # Random forest
  models['random_forest'] = RandomForestClassifier(
      n_estimators=100,
      max_depth=10,
      min_samples_split=10,
      random_state=random_state,
      n_jobs=-1
  )

  # XGBoost (optional dependency)
  if XGBOOST_AVAILABLE:
    models['xgboost'] = XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=random_state,
        eval_metric='logloss',
        use_label_encoder=False
    )

  # Train all models; the key tested here must match the key stored above,
  # otherwise XGBoost silently falls through to the unscaled branch.
  for name, model in models.items():
    print(f"Training {name}...")
    if name == 'xgboost':
      model.fit(X_train_scaled, y_train)
    else:
      model.fit(X_train, y_train)

  print("All models trained successfully")
  return models