## Importing the Libraries

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

## Importing the Dataset

In [3]:
df = pd.read_csv('avalanche_data_clean.csv')

In [4]:
df.head()

Unnamed: 0,snow_type_dry,snow_type_unknown,snow_type_wet,trigger_type_explosive,trigger_type_natural,trigger_type_unknown,max_elevation,min_elevation,aspect_degrees,length,width,perimeter,area,aval_size_class,weight_AAI,risk_index
0,1,0,0,0,1,0,2562.0,2484.0,42.0,101.0,123.0,355.0,8762.0,2,2,2
1,1,0,0,0,1,0,2494.0,2356.0,21.0,127.0,351.0,834.0,30522.0,3,3,2
2,1,0,0,0,0,0,2115.0,2017.0,200.0,166.0,85.0,454.0,7837.0,2,2,2
3,1,0,0,0,1,0,2085.0,1986.0,42.0,128.0,18.0,265.0,1522.0,2,2,2
4,1,0,0,0,1,0,2605.0,2529.0,240.0,127.0,63.0,324.0,5929.0,2,2,2


In [5]:
df.shape

(13608, 16)

## Splitting the features and label from the dataset

In [6]:
X = df.drop(columns=['risk_index'])
y = df['risk_index']

## Splitting the dataset into training set and test set

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

## Feature Scaling

In [8]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## GridSearch on K-Nearest Neighbors

In [9]:
classifier = KNeighborsClassifier()

# Define the hyperparameter grid for the classifier
param_grid = {'n_neighbors': [3, 5, 7, 9],
              'weights': ['uniform', 'distance'],
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

# Perform grid search with cross-validation
grid_search = GridSearchCV(classifier, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the best model on the test data
best_classifier = grid_search.best_estimator_
y_pred = best_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Display the best hyperparameters and accuracy
print("Best Hyperparameters:", grid_search.best_params_)
print("Accuracy on Test Data:", accuracy)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Best Hyperparameters: {'algorithm': 'auto', 'n_neighbors': 9, 'weights': 'uniform'}
Accuracy on Test Data: 0.4588537839823659


## GridSearch on Gaussian Naive Bayes

In [10]:
classifier = GaussianNB()

# There are no hyperparameters to tune for GaussianNB, so no need for a param_grid

# Perform grid search with cross-validation (no hyperparameters to tune)
grid_search = GridSearchCV(classifier, param_grid={}, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the best model on the test data
best_classifier = grid_search.best_estimator_
y_pred = best_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Display the best model and accuracy
print("Best Model:", best_classifier)
print("Accuracy on Test Data:", accuracy)

Best Model: GaussianNB()
Accuracy on Test Data: 0.44930198383541514


In [12]:
df.shape

(13608, 16)