In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import pandas as pd
import numpy as np

In [2]:
file_path = "sampelKeladiTikus.xlsx"
df = pd.read_excel(file_path)

In [3]:
df

Unnamed: 0,Retention Time,m/z,Real_m/z,Intensity,Label
0,5.022,1103.096680,1103.25903,191,0
1,5.022,1029.343994,1029.18005,265,0
2,5.022,958.753662,958.51367,191,0
3,5.022,914.539917,914.58844,191,0
4,5.022,844.898560,845.10999,191,0
...,...,...,...,...,...
12452,2401.000,306.806580,306.97055,20070,0
12453,2401.000,268.981384,269.11771,7801,0
12454,2401.000,206.614044,206.32800,22472,0
12455,2401.000,149.180481,149.19000,38204,0


In [3]:
df.columns

Index(['Retention Time', 'm/z', 'Real_m/z', 'Intensity', 'Label'], dtype='object')

### Input features (feature) & convert to Array

In [4]:
df.columns[0:4]

Index(['Retention Time', 'm/z', 'Real_m/z', 'Intensity'], dtype='object')

In [5]:
data = df[["Retention Time", 'm/z', 'Real_m/z', 'Intensity']].to_numpy()

### Output variable (the Class label) & convert to Array

In [6]:
df.columns[4]

'Label'

In [7]:
target = df["Label"].to_numpy()

In [8]:
X = data
Y = target

### Examine the data dimension

In [9]:
X.shape

(12457, 4)

In [10]:
Y.shape

(12457,)

## Build Classification Model using Random Forest

In [11]:
clf = RandomForestClassifier()

In [12]:
clf.fit(X, Y)

RandomForestClassifier()

## Feature Importance

In [13]:
print(clf.feature_importances_)

[0.08430181 0.39354032 0.3980748  0.12408306]


## Make Prediction

In [14]:
X[0]

array([   5.022     , 1103.09667969, 1103.25903   ,  191.        ])

In [15]:
print(clf.predict([[15, 121, 122, 144700]]))

[1]


In [16]:
print(clf.predict(X[[0]]))

[0]


In [17]:
print(clf.predict_proba(X[[0]]))

[[1. 0.]]


In [18]:
clf.fit(data, target)

RandomForestClassifier()

## Data split (80/20 ratio)

In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [20]:
X_train.shape, Y_train.shape

((9965, 4), (9965,))

In [21]:
X_test.shape, Y_test.shape

((2492, 4), (2492,))

## Rebuild the Random Forest Model

In [22]:
clf.fit(X_train, Y_train)

RandomForestClassifier()

### 9.1. Performs prediction on single sample from the data set

In [23]:
print(clf.predict([[15, 121, 122, 144700]]))

[1]


In [24]:
print(clf.predict_proba([[15, 121, 122, 144700]]))

[[0.42 0.58]]


### 9.2. Performs prediction on the test set

#### *Predicted class labels*

In [25]:
print(clf.predict(X_test))

[0 0 0 ... 0 0 0]


#### *Actual class labels*

In [26]:
print(Y_test)

[0 0 0 ... 0 0 0]


## 10. Model Performance

In [27]:
print(clf.score(X_test, Y_test))

0.9911717495987159
