## Import relevant libraries for data pre-processing
   - A couple of libraries that were not installed via conda were attempted with pip

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.cm as cmx
from mpl_toolkits import mplot3d

from tqdm import tqdm
from mpl_toolkits.mplot3d import Axes3D

import os
import struct
import re
import warnings

#import mpl_scatter_density
from scipy import stats
from scipy.optimize import curve_fit
from scipy import asarray as ar,exp
from scipy.stats import norm


import apav as ap

from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

## Read the 'raw files' obtained from atom probe tomography experiments
   - The file corresponds to measured 18 million time resolved events as data entries (.epos file)
   - The mass-spectrum range file must also be read (.rrng) 

In [2]:
"""
The function reads a user specfied .rrng file containing every labelled peak range in terms of mass/charge ratio.
"""
def read_rrng(f):
    rf = open(f,'r').readlines()
    patterns = re.compile(r'Ion([0-9]+)=([A-Za-z0-9]+).*|Range([0-9]+)=(\d+.\d+) +(\d+.\d+) +Vol:(\d+.\d+) +([A-Za-z:0-9 ]+) +Color:([A-Z0-9]{6})')
    ions = []
    rrngs = []
    for line in rf:
        m = patterns.search(line)
        
        if m:
            if m.groups()[0] is not None:
                ions.append(m.groups()[:2])
            else:
                rrngs.append(m.groups()[2:])
                
    ions = pd.DataFrame(ions, columns=['number','name'])
    ions.set_index('number',inplace=True)
    
    rrngs = pd.DataFrame(rrngs, columns=['number','lower','upper','vol','comp','colour'])
    rrngs.set_index('number',inplace=True) 
    
    rrngs[['lower','upper','vol']] = rrngs[['lower','upper','vol']].astype(float)
    rrngs[['comp','colour']] = rrngs[['comp','colour']].astype(str)
    
    return ions,rrngs

In [3]:
"""
The function reads a user specfied .epos file containing spatial information.
"""
def read_epos(file_name):
    
    f = open(file_name, 'rb')
    
    dt_type = np.dtype({'names':['x', 'y', 'z', 'm'], 
                  'formats':['>f4', '>f4', '>f4', '>f4']})
    
    epos_new = np.fromfile(f, dt_type, -1)
    f.close()
    
    return epos_new

In [4]:
read_epos(r"R76_22213-v11.epos")

array([( 4.30381626e-01,  4.2453604e+00, 3.270586e-01, 6.2941795e+01),
       ( 1.00510205e+03,  2.9097000e+03, 0.000000e+00, 2.6488397e-01),
       ( 1.60342274e+01,  1.8468962e-37, 1.401298e-45, 9.4592991e+00),
       ...,
       (-1.04061556e+01, -3.5342262e+00, 1.401298e-45, 1.4012985e-45),
       ( 3.84736347e+00, -8.6444101e+00, 9.564320e+01, 6.2904789e+01),
       ( 8.48302246e+02,  3.9956599e+03, 0.000000e+00, 4.5030274e+00)],
      dtype=[('x', '>f4'), ('y', '>f4'), ('z', '>f4'), ('m', '>f4')])

In [5]:
read_rrng('R76_22213.rrng')

(       name
 number     
 1        Cr
 2        Cu
 3        Fe
 4        Ni
 5        Co
 6        Mn
 7         B
 8        Be
 9         C
 10        V
 11       Zr
 12       Nb
 13       Zn
 14       Ga
 15       Ti,
          lower   upper      vol  comp  colour
 number                                       
 1       25.900  26.307  0.01201  Cr:1  FF33CC
 2       26.393  26.726  0.01201  Cr:1  FF33CC
 3       24.851  25.171  0.01201  Cr:1  FF33CC
 4       51.795  52.211  0.01201  Cr:1  FF33CC
 5       52.705  53.151  0.01201  Cr:1  FF33CC
 6       49.767  50.124  0.01201  Cr:1  FF33CC
 7       62.655  63.654  0.01181  Cu:1  FF6600
 8       64.687  65.521  0.01181  Cu:1  FF6600
 9       31.310  31.718  0.01181  Cu:1  FF6600
 10      32.450  32.651  0.01181  Cu:1  FF6600
 11      27.878  28.287  0.01177  Fe:1  FF00FF
 12      28.397  28.699  0.01177  Fe:1  FF00FF
 13      26.827  27.146  0.01177  Fe:1  FF00FF
 14      55.862  56.156  0.01177  Fe:1  FF00FF
 15      56.780  57.096  0

In [6]:
"""
The total number of data entries are 18.96 million.

Note: saving the data frame into excel is not even a remote possibility, 
as the latter can accomodate not beyond 2 million data entires. 
"""

print("The total number of data entires are: {:.2f} million".format(len(read_epos(r"R76_22213-v11.epos"))/1000000))

The total number of data entires are: 18.97 million


In [7]:
""" 
Label each field evaporated ion corresponding to a given atom located in the parent material,
before destructive APT test is conducted.
"""
def label_ions(pos,rrngs):

    count=0;
    for n,r in rrngs.iterrows():
        count= count+1;
        pos.loc[(pos.Da >= r.lower) & (pos.Da <= r.upper),['comp','colour', 'nature']] = [r['comp'],'#' + r['colour'],count]
    
    return pos

### Co-ordinates scaling and verification

- The relation between 'detector space' (40mm) and 'sample space' (40nm) must be specified.
- The detector range for 'x' and 'y' are `4000 units` each = `40 nm` each, in the sample material. 
    - Therefore, 'x' and 'y', both must be scaled down by a factor of 100. 
    - Eventually, we have 'x' and 'y' coordinates ranging across 40 nm each (40 nm diameter; 20 nm radius), while the sample depth 'z' can vary up to 150 nm. 
- Reference: [link](https://www.microscopy.org/communities/presentations/2016_Larson-APT%20Data%20Reconstruction.pdf)

In [8]:
rrange_file = 'R76_22213.rrng'
ions, rrngs = read_rrng(rrange_file)
epos_file = "R76_22213-v11.epos"
pos = read_epos(epos_file)


dpos = pd.DataFrame({'x':pos['x'],
                            'y': pos['y'],
                            'z': pos['z'],
                            'Da': pos['m']})

# co-ordinate scaling: This important step was earlier missed
dpos['x'] = dpos['x']/100
dpos['y'] = dpos['y']/100

In [9]:
lpos = label_ions(dpos,rrngs)

In [10]:
lpos.head()

Unnamed: 0,x,y,z,Da,comp,colour,nature
0,0.004304,0.0424536,0.3270586,62.941795,Cu:1,#FF6600,7.0
1,10.051021,29.097,0.0,0.264884,,,
2,0.160342,1.8468960000000002e-39,1.401298e-45,9.459299,,,
3,0.04034,0.01956822,62.91315,1019.15918,,,
4,29.097,0.0,16.64985,15.811012,,,


### Criteria definition for noise identification and removal from "Big dataset"

In [11]:
"""
Retaining only data entries those are labelled as elements, and removal of noisy data entires.
"""
lpos = lpos[lpos['nature'].notna()]

In [12]:
lpos.head()

Unnamed: 0,x,y,z,Da,comp,colour,nature
0,0.004304,0.0424536,0.3270586,62.941795,Cu:1,#FF6600,7.0
11,-0.130098,0.05683204,3.866135,62.952465,Cu:1,#FF6600,7.0
22,-0.013294,-0.1409252,3.84161,62.890915,Cu:1,#FF6600,7.0
33,-0.09271,-0.07092165,2.549284,64.947853,Cu:1,#FF6600,8.0
35,-0.046875,2.802597e-45,1.401298e-45,9.955543,B:1,#0033FF,32.0


In [13]:
"""
The function accepts a string, corresponding to a mass-spectrum peak label, and retains the element information only. 
If there are multiple peaks for a given element, only elemental information gets returned.
"""
def spectrumPeak_to_Element(input_string):
    if input_string[2] == ':':
        return input_string[:2]
    elif input_string[1] == ':':
        return input_string[:1]
    else:
        return 'check'

In [14]:
lpos['comp'] = lpos['comp'].apply(spectrumPeak_to_Element)
lpos.drop(columns=['Da','colour','nature'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lpos['comp'] = lpos['comp'].apply(spectrumPeak_to_Element)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [15]:
lpos.head(3)

Unnamed: 0,x,y,z,comp
0,0.004304,0.042454,0.327059,Cu
11,-0.130098,0.056832,3.866135,Cu
22,-0.013294,-0.140925,3.84161,Cu


In [16]:
"""
counts of each atom present in the material. 
"""
lpos['comp'].value_counts()

Cu    1636941
B      137507
Cr     102744
C      102294
Fe      89024
Be      61632
Ni      47063
Ti      24831
Mn      24142
Zr      20192
Zn      12834
V       11800
Co      10366
Ga       9816
Nb       7563
Name: comp, dtype: int64

### Data preparation for clustering algorithm &  scale-calibration verification

In [17]:
lpos.iloc[:,:3].values

array([[ 4.3038162e-03,  4.2453602e-02,  3.2705861e-01],
       [-1.3009785e-01,  5.6832042e-02,  3.8661351e+00],
       [-1.3293718e-02, -1.4092521e-01,  3.8416100e+00],
       ...,
       [ 4.4720151e-02, -1.1847691e-01,  9.6605995e+01],
       [ 3.8473636e-02, -8.6444102e-02,  9.5643204e+01],
       [ 8.4830227e+00,  3.9956600e+01,  0.0000000e+00]], dtype=float32)

In [18]:
points = lpos.iloc[:,:3].values

In [19]:
# x min and x max
print("x-coordinate range: {} to {}".format(points[:,0].min(),points[:,0].max()))

# y min and y max
print("y-coordinate range: {} to {}".format(points[:,1].min(),points[:,1].max()))

# z min and z max
print("z-coordinate range: {} to {}".format(points[:,2].min(),points[:,2].max()))

x-coordinate range: -0.20472830533981323 to 39.956600189208984
y-coordinate range: -0.20554803311824799 to 39.956600189208984
z-coordinate range: -28.229061126708984 to 101.36006927490234


- We may note here that 'x' and 'y' range are 40 each (in nm)

### Save processed data for clustering

In [20]:
lpos.iloc[:,:3].to_pickle('processed_Cu-Cr-data_clustering_x_y_z.pkl')