In [8]:
import apd_utils as apd
import numpy as np 
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN

## 1. Generate a reference dataset and a random sample, then perform DBSCAN
The first step is to generate a reference dataset and set up a generic analytical workflow. To evaluate a sample, only 5 steps are required, as shown below. Note that in this step we generate a non-adulterated sample, so the label that DBSCAN predicts for the sample is expected to be identical to the labels of the reference samples.

In [12]:
# Workflow: cluster a reference dataset together with one candidate sample and
# compare the candidate's DBSCAN label with the labels of the reference rows.

# Number of reference samples; reused below to slice the reference labels so
# the two values cannot drift apart.
num_ref = 50

# 1. Generate a reference dataset to represent authentic products
data_ref = apd.generate_ref_data(num_sample=num_ref, num_analyte=200, seed=32)

# 2. Generate a non-adulterated random sample
sample_ref = apd.generate_random_sample(data_ref, adulterated=False)

# 3. Combine the reference data and the random sample
#    (the sample is appended last, so it is the final row of the combined data)
data_comb = np.concatenate((data_ref, sample_ref), axis=0)

# 4. Scale the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data_comb)

# 5. Perform DBSCAN on scaled data
# min_samples must be a positive integer; current scikit-learn releases reject
# non-positive values with a parameter-validation error. With min_samples=1
# every point is a core point, so no sample is ever labelled as noise (-1); a
# sample that is farther than eps from all others gets its own cluster label.
dbscan = DBSCAN(eps=20, min_samples=1).fit(scaled_data)

print(f"Labels for referenced data: {dbscan.labels_[:num_ref]}")
print(f"Label for the non-adulterated sample: {dbscan.labels_[-1]}")

Labels for referenced data: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
Label for the non-adulterated sample: 0
