# Random Forest

This notebook trains a Random Forest classifier on the processed spectral embedding array and evaluates it on a held-out test set.

In [16]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_covtype
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.kernel_approximation import PolynomialCountSketch
from sklearn.linear_model import LogisticRegression
from scipy.stats import randint
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV
import time

## Loading arrays from CSV files

In [2]:
## Read the embedding matrix (one row per sample) from its CSV file
embedding_csv_path = '/mnt/scratch/ding013/MS2ChemClass/embedding_arrays_for_classification/X_spectral_embeddings.csv'
with open(embedding_csv_path) as embedding_file:
    # Comma-separated numeric values are parsed into a 2-D float array.
    X_embedding = np.loadtxt(embedding_file, delimiter=",")
print(X_embedding.shape)

(16360, 300)


In [3]:
## Loading class labels
import csv

# newline='' is what the csv module documentation asks for when a file object
# is handed to csv.reader, so that newlines embedded in quoted fields are
# interpreted correctly on every platform.
with open('/mnt/scratch/ding013/MS2ChemClass/embedding_arrays_for_classification/y_npc_class_results_labels.csv', newline='') as label_file:
    # Only the first column of each row is kept as that sample's class label.
    y_label_list = [row[0] for row in csv.reader(label_file)]
y_label = np.array(y_label_list)

# Labels are paired with embedding rows purely by position, so a count
# mismatch would silently misalign samples and classes downstream.
assert y_label.shape[0] == X_embedding.shape[0], (
    "number of class labels does not match number of embedding rows"
)
print(y_label.shape)

(16360,)


## Search for suitable hyperparameters

In [20]:
# Base estimator for the search: a small forest (10 trees) with a fixed seed.
clf = RandomForestClassifier(n_estimators=10, random_state=0)

# Search space. Lists are sampled as discrete choices; scipy's randint(low, high)
# draws integers from low up to high - 1.
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 11),
              "min_samples_split": randint(2, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Randomized search with successive halving; with factor=2 roughly half of the
# candidates are carried over to each subsequent iteration.
rsh = HalvingRandomSearchCV(estimator=clf, param_distributions=param_dist,
                            factor=2, random_state=0)

# Fit on the label array (y_label) so this cell uses the same target object as
# the train/test split, instead of the intermediate Python list.
# NOTE(review): the search is fitted on every sample, including rows that the
# train/test split further down holds out for evaluation. Also, the model
# trained later does not appear to reuse best_params_ -- confirm whether that
# is intended.
rsh.fit(X_embedding, y_label)
rsh.best_params_



{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 10,
 'min_samples_split': 5}

## Train and evaluate the Random Forest on a train/test split

In [5]:
# Hold out 25% of the samples as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_embedding,
    y_label,
    test_size=0.25,
    random_state=0,
)



In [7]:
# Sanity check: display the dimensions of the held-out test matrix
X_test.shape

(4090, 300)

In [21]:
# Feature Scaling

# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transform to both the training and test splits.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [35]:
## Train Model
# `time` is already imported in the notebook's import cell, so it is not
# re-imported here.
# process_time() reports CPU time consumed by this process (not wall-clock
# time); the measured span covers model construction and fitting.
t0 = time.process_time()
# Instantiate the model with 160 decision trees and a fixed seed
rf = RandomForestClassifier(n_estimators=160, random_state=0)
# Train the model on the training data
rf.fit(X_train, y_train)
t1 = time.process_time() - t0

# Predict class labels for the held-out test set (outside the timed span)
y_pred = rf.predict(X_test)

print('Processing time: %.3f' % t1)

Processing time: 267.768


In [36]:
## Evaluate the model

# Fraction of test samples whose predicted class matches the true class
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.3f' % test_accuracy)


Accuracy: 0.305
