# Spectra generator for TSMC-Net-8-L



In [1]:
%matplotlib inline 
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from matplotlib.ticker import MaxNLocator
import numpy as np
from numpy import asarray
import pandas as pd
import math
import seaborn as sns  #heat map
import glob # batch processing of images


import matplotlib.font_manager as fm
import random
import sys
import os

from sklearn.datasets import make_regression
import tensorflow as tf
from sklearn.metrics import confusion_matrix    #confusion matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Collect the names of all fonts registered with matplotlib's font manager.
# NOTE(review): `font_names` is not referenced anywhere else in the visible part of
# this notebook; it only feeds the optional debug print below.
font_names = [f.name for f in fm.fontManager.ttflist]
# print(font_names)

from scipy import signal
from scipy import interpolate

from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_curve 
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score


from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.gaussian_process.kernels import RBF

#Sklearn model saving and loading
from joblib import dump, load

# Add the directory two levels above the current working directory to the import
# path (only once); the `aimos` imports that follow are presumably resolved from
# there -- verify against the repository layout.
if '../../' not in sys.path:
    sys.path.append('../../')

from aimos.spectral_datasets.THz_datasets import THz_data

from aimos.misc.utils import simple_plotter


# Set random seeds.
# Only NumPy's global generator and Python's `random` module are seeded here;
# the hash-seed / TensorFlow settings below are left disabled.
# os.environ['PYTHONHASHSEED'] = str(42)
# os.environ['TF_DETERMINISTIC_OPS'] = '1'
# tf.random.set_seed(42)
# tf.random.get_global_generator().reset_from_seed(42)
np.random.seed(42)
random.seed(42)



2022-10-20 12:18:26.606701: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [2]:
from oneida import THz_mixture_data
from oneida_utils import concentrations_to_one_hot_encode, create_mixture_names

# Create training (D) + validation (V) mixtures

In [3]:
# Threshold settings handed to the mixture generator and embedded in the name of
# the saved dataset file.
# NOTE(review): the acronyms are not expanded in this notebook. Judging from the
# generator log, TAAT relates to weak mixture spectra, ASAT to weak components and
# RSAT to relatively weak components -- confirm against the `oneida` module.
TAAT = 0.001 
ASAT=0.01
# Alternative value tried for ASAT:
# ASAT=0.005
RSAT=0.01
# Alternative value tried for RSAT:
# RSAT=0.005

In [4]:
# Create the mixture generator and initialise it with the threshold settings.
m = THz_mixture_data(
    resolution=0.016,
    pressure='1 Torr',
    verbosity=False,
)
m.initiate_THz_mixture_data(TAAT=TAAT, ASAT=ASAT, RSAT=RSAT)

# Drop the placeholder / diluent entries so that only compound labels remain.
# NOTE(review): `reduced_labels` is bound to the object stored in `m.labels`, so
# these removals presumably change `m.labels` as well -- confirm that is intended.
reduced_labels = m.labels
for placeholder in ('', ' ', 'Diluent'):
    reduced_labels.remove(placeholder)
print(reduced_labels)


Components :  [[0 1 2 3 4 5 6 7 8]]
Components shape :  (1, 9)
TAAT =  0.001
ASAT =  0.01
RSAT =  0.01
['$C_2H_5OH$', '$CH_3CHO$', '$CH_3Cl$', '$CH_3CN$', '$CH_3OH$', '$H_2CO$', '$HCOOH$', '$HNO_3$']


In [5]:



# Generate the controlled simulated mixtures with the thresholds defined earlier.
# NOTE: the recorded log of this cell reports tens of millions of generation
# attempts, so expect a long run before re-executing it.
m.make_controlled_sim_mixtures(
    equal_amount=1000,
    TAAT=TAAT,
    ASAT=ASAT,
    RSAT=RSAT,
    save_to_file=False,
    debug=False,
    eight_component_exception=True,
)


 ...generating 1-component mixtures data...

total_attempts: 10774
Generator Time elasped: 2956.3039999999996

 ...generating 2-component mixtures data...

total_attempts: 88497
Generator Time elasped: 23691.744

 ...generating 3-component mixtures data...

total_attempts: 451447
Generator Time elasped: 117829.39

 ...generating 4-component mixtures data...

total_attempts: 1918168
Generator Time elasped: 500906.20999999996

 ...generating 5-component mixtures data...

total_attempts: 5874992
Generator Time elasped: 1533152.193

 ...generating 6-component mixtures data...

total_attempts: 15417996
Generator Time elasped: 4036728.3189999997

 ...generating 7-component mixtures data...

total_attempts: 47877717
Generator Time elasped: 12442903.155

 ...generating 8-component mixtures data...

total_attempts: 75314589
Generator Time elasped: 19577811.333
Time elasped: 38235978.866
Total number of test mixtures :  3270200

Combined test simulated mixtures

No. of test mixtures:  3270200
t

100%|██████████████████████████████| 3270200/3270200 [04:38<00:00, 11759.16it/s]


Time elaspsed:  279809.574
numpy random state:  165488460

reimplementing TAAT


100%|█████████████████████████████| 3270200/3270200 [00:20<00:00, 160866.52it/s]


Total spectra with weak mixture spectra: (0,)
test_mixtures data type:  object
test_mixtures shape:  (3270200, 1, 229)
test_targets data type:  object
test_targets shape:  (3270200, 8)
Adjusted n_test_mixtures:  3270200
TAAT Time elasped: 25251.597
reimplementing ASAT


100%|██████████████████████████████| 3270200/3270200 [02:57<00:00, 18456.79it/s]


Total spectra with weak components in testing dataset: (0,)
test_mixtures data type:  object
test_mixtures shape:  (3270200, 8)
test_targets data type:  object
test_targets shape:  (3270200, 8)
Adjusted n_test_mixtures after removing spectra with weak components:  3270200
ASAT Time elasped: 184624.261
reimplementing RSAT


100%|███████████████████████████████| 3270200/3270200 [06:36<00:00, 8238.75it/s]


Total spectra with weak relative components in testing dataset: (0,)
test_mixtures data type:  object
test_mixtures shape:  (3270200, 8)
test_targets data type:  object
test_targets shape:  (3270200, 8)
Adjusted n_test_mixtures after removing spectra with relatively weak speices:  3270200
RSAT Time elasped: 404505.266


In [6]:
import operator as op
from functools import reduce

def ncr(n, r):
    """Return the binomial coefficient C(n, r) as an exact integer.

    Parameters
    ----------
    n : int
        Size of the set to choose from (expected to be non-negative).
    r : int
        Number of items chosen.

    Returns
    -------
    int
        The number of ways to choose ``r`` items out of ``n``; ``0`` when
        ``r`` is negative or larger than ``n``.
    """
    # Without this guard both ranges below are empty for r > n (or r < 0) and the
    # function would report 1 combination instead of 0.
    if r < 0 or r > n:
        return 0
    # C(n, r) == C(n, n - r); use the smaller of the two to shorten the products.
    r = min(r, n - r)
    numer = reduce(op.mul, range(n, n - r, -1), 1)
    denom = reduce(op.mul, range(1, r + 1), 1)
    # Integer division is exact here and keeps the result an int.
    return numer // denom

n_compounds = m.n_compounds

# Number of distinct compound subsets for each mixture size from 1 to 8 components.
combination_counts = [ncr(n_compounds, size) for size in range(1, 9)]

# Keep the per-size counts available under their individual names.
(unique_1C_mixture_numbers,
 unique_2C_mixture_numbers,
 unique_3C_mixture_numbers,
 unique_4C_mixture_numbers,
 unique_5C_mixture_numbers,
 unique_6C_mixture_numbers,
 unique_7C_mixture_numbers,
 unique_8C_mixture_numbers) = combination_counts

# Total number of distinct mixture types across all sizes.
unique_mixture_numbers = sum(combination_counts)

# Variant for up to 12 components, kept for reference:
# unique_mixture_numbers = ncr(n_compounds,1) + ncr(n_compounds,2) + ncr(n_compounds,3) + ncr(n_compounds,4) + ncr(n_compounds,5) + ncr(n_compounds,6) + ncr(n_compounds,7) +ncr(n_compounds,8) + ncr(n_compounds,9) + ncr(n_compounds,10) + ncr(n_compounds,11) + ncr(n_compounds,12)

# Report every per-size count, then the grand total.
for caption, count in (
    ('Total 1-C combinations:', unique_1C_mixture_numbers),
    ('Total 2-C combinations:', unique_2C_mixture_numbers),
    ('Total 3-C combinations:', unique_3C_mixture_numbers),
    ('Total 4-C combinations:', unique_4C_mixture_numbers),
    ('Total 5-C combinations:', unique_5C_mixture_numbers),
    ('Total 6-C combinations:', unique_6C_mixture_numbers),
    ('Total 7-C combinations:', unique_7C_mixture_numbers),
    ('Total 8-C combinations:', unique_8C_mixture_numbers),
):
    print(caption, count)

print('Total combinations:', unique_mixture_numbers)

Total 1-C combinations: 8
Total 2-C combinations: 28
Total 3-C combinations: 56
Total 4-C combinations: 70
Total 5-C combinations: 56
Total 6-C combinations: 28
Total 7-C combinations: 8
Total 8-C combinations: 1
Total combinations: 255


# view attributes of the controlled test mixtures data

In [7]:
# Print a summary of the generated mixtures and of the generator's settings;
# every value is read from an attribute of `m`.

# Size and array shapes of the generated data.
print('number of test mixtures: ',m.n_test_mixtures)
print('test mixtures shape: ',m.test_mixtures.shape)
print('test targets shape: ',m.test_targets.shape)
print('test dilutions shape: ',m.test_dilution.shape)

# Frequency axis of the spectra.
print('resolution: ',m.resolution)
print('frequencies in the data', m.frequencies)

# Measurement pressure and labelling information.
print('pressure: ',m.pressure) 
print('labels: ',m.labels) 
print('label_id: ',m.label_id) 

# Dimensions of the underlying pure-compound data.
print('number of features: ',m.n_features) 
print('no. of compounds: ',m.n_compounds)
print('no. of spectrum per compound in pure THz data: ' ,m.n_spectrum)
print('no. of spectra in pure THz data: ',m.n_spectra) 

# Mixture-composition settings.
print('number of maximum mixture components: ',m.n_mixture_component_max)
print('integer indices for each of the mixture components',m.components)


0number of test mixtures:  3270200
test mixtures shape:  (3270200, 229)
test targets shape:  (3270200, 8)
test dilutions shape:  (3270200,)
resolution:  0.016
frequencies in the data [ 7.352  7.368  7.384  7.4    7.416  7.432  7.448  7.464  7.48   7.496
  7.512  7.528  7.544  7.56   7.576  7.592  7.608  7.624  7.64   7.656
  7.672  7.688  7.704  7.72   7.736  7.752  7.768  7.784  7.8    7.816
  7.832  7.848  7.864  7.88   7.896  7.912  7.928  7.944  7.96   7.976
  7.992  8.008  8.024  8.04   8.056  8.072  8.088  8.104  8.12   8.136
  8.152  8.168  8.184  8.2    8.216  8.232  8.248  8.264  8.28   8.296
  8.312  8.328  8.344  8.36   8.376  8.392  8.408  8.424  8.44   8.456
  8.472  8.488  8.504  8.52   8.536  8.552  8.568  8.584  8.6    8.616
  8.632  8.648  8.664  8.68   8.696  8.712  8.728  8.744  8.76   8.776
  8.792  8.808  8.824  8.84   8.856  8.872  8.888  8.904  8.92   8.936
  8.952  8.968  8.984  9.     9.016  9.032  9.048  9.064  9.08   9.096
  9.112  9.128  9.144  9.16   9.176 

# Create one-hot encodings and mixture labels

In [8]:
# Take the generated spectra (X) and their concentration targets from the generator.
X = m.test_mixtures
y_concentrations = m.test_targets

print('X shape:', X.shape)
print('y_concentrations shape:', y_concentrations.shape)

print('X dtype:', X.dtype)
print('y_concentrations dtype:', y_concentrations.dtype)

# Convert to a numeric dtype. `np.float` was only an alias of the builtin `float`;
# it is deprecated since NumPy 1.20 and removed in 1.24, so the concrete
# `np.float64` type is used instead (same resulting dtype).
X = X.astype(np.float64)
y_concentrations = y_concentrations.astype(np.float64)

print('After data type conversion')
print('X dtype:', X.dtype)
print('y_concentrations dtype:', y_concentrations.dtype)




X shape: (3270200, 229)
y_concentrations shape: (3270200, 8)
X dtype: object
y_concentrations dtype: object
After data type conversion
X dtype: float64
y_concentrations dtype: float64


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X=X.astype(np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_concentrations=y_concentrations.astype(np.float)


In [9]:
# Derive the one-hot style labels from the concentration targets.
# NOTE(review): presumably one 0/1 presence flag per compound -- confirm in oneida_utils.
y_ohe = concentrations_to_one_hot_encode(y_concentrations)

In [10]:
from tqdm import tqdm

# Build one mixture name per row of `y_ohe`.
# The loop index gets a real name because it is used in the body: `_` is
# conventionally reserved for ignored values and is also IPython's last-output
# variable, which an assignment in a cell would overwrite.
mixture_names = []
for row_idx in tqdm(range(y_ohe.shape[0])):
    mixture_names.append(
        create_mixture_names(y_ohe[row_idx], m.n_mixture_component_max, reduced_labels)
    )

# for row_idx in range(y_ohe.shape[0]):
#     print(mixture_names[row_idx])

100%|█████████████████████████████| 3270200/3270200 [00:11<00:00, 283775.59it/s]


In [11]:
# Sanity check: one mixture name was appended per row of y_ohe.
len(mixture_names)

3270200

# encode mixture names as integer index

In [12]:


from sklearn import preprocessing

# Fit the encoder on the mixture names and map each name to its integer class
# index in a single step.
le = preprocessing.LabelEncoder()
y = le.fit_transform(mixture_names)

# The fitted classes are the distinct mixture types.
# le.inverse_transform(y)
mixture_types = le.classes_
print(mixture_types)

['$CH_3CHO$' '$CH_3CHO$+$CH_3CN$' '$CH_3CHO$+$CH_3CN$+$CH_3OH$'
 '$CH_3CHO$+$CH_3CN$+$CH_3OH$+$HCOOH$'
 '$CH_3CHO$+$CH_3CN$+$CH_3OH$+$HCOOH$+$HNO_3$'
 '$CH_3CHO$+$CH_3CN$+$CH_3OH$+$HNO_3$'
 '$CH_3CHO$+$CH_3CN$+$CH_3OH$+$H_2CO$'
 '$CH_3CHO$+$CH_3CN$+$CH_3OH$+$H_2CO$+$HCOOH$'
 '$CH_3CHO$+$CH_3CN$+$CH_3OH$+$H_2CO$+$HCOOH$+$HNO_3$'
 '$CH_3CHO$+$CH_3CN$+$CH_3OH$+$H_2CO$+$HNO_3$'
 '$CH_3CHO$+$CH_3CN$+$HCOOH$' '$CH_3CHO$+$CH_3CN$+$HCOOH$+$HNO_3$'
 '$CH_3CHO$+$CH_3CN$+$HNO_3$' '$CH_3CHO$+$CH_3CN$+$H_2CO$'
 '$CH_3CHO$+$CH_3CN$+$H_2CO$+$HCOOH$'
 '$CH_3CHO$+$CH_3CN$+$H_2CO$+$HCOOH$+$HNO_3$'
 '$CH_3CHO$+$CH_3CN$+$H_2CO$+$HNO_3$' '$CH_3CHO$+$CH_3Cl$'
 '$CH_3CHO$+$CH_3Cl$+$CH_3CN$' '$CH_3CHO$+$CH_3Cl$+$CH_3CN$+$CH_3OH$'
 '$CH_3CHO$+$CH_3Cl$+$CH_3CN$+$CH_3OH$+$HCOOH$'
 '$CH_3CHO$+$CH_3Cl$+$CH_3CN$+$CH_3OH$+$HCOOH$+$HNO_3$'
 '$CH_3CHO$+$CH_3Cl$+$CH_3CN$+$CH_3OH$+$HNO_3$'
 '$CH_3CHO$+$CH_3Cl$+$CH_3CN$+$CH_3OH$+$H_2CO$'
 '$CH_3CHO$+$CH_3Cl$+$CH_3CN$+$CH_3OH$+$H_2CO$+$HCOOH$'
 '$CH_3CHO$+$CH_3Cl$+$CH_3CN

In [13]:
# Number of distinct mixture types found by the label encoder.
print(mixture_types.shape)

(255,)


In [14]:
# for _ in range(y_ohe.shape[0]):
#     print(mixture_names[_],integers[_])

In [15]:
# Compare the number of class indices with the number of spectra.
print(y.shape)

print(X.shape)

(3270200,)
(3270200, 229)


In [16]:
# Count the distinct class indices present in `y`.
# NOTE(review): the name suggests this sizes a classifier's output layer -- confirm.
final_neuron_number = len(np.unique(y, axis=0))
print(final_neuron_number)

255


# Preview one test mixture spectrum using simple_plotter

In [17]:
# simple_plotter(m.frequencies,m.test_mixtures[0],linewidth=0.5,color='black',label='$test-mixture$', 
#                    majorsize=6,minorsize=2,width=1, labelsize=8,legendsize=3, legendloc=2,  
#                    labelpad=4,fontsize='medium',fontweight='bold',
#                   xmajormplloc=0.5,xminormplloc=0.2, tickdirection='out')

# Select an equal number of spectra for each mixture type

In [18]:
# #see distribution of counts of different types of mixtures.
# plt.barh(mixture_types,np.bincount(y))
# # plt.xticks(rotation=90);
# plt.xticks(rotation=0);

In [19]:
# Shape of the first column of the concentration targets.
y_concentrations[:,0].shape

(3270200,)

In [20]:
# Assemble a single table: the spectral features (integer-named columns coming
# from X), the integer class index, the mixture name and one target column per
# compound.
spectra_df = pd.DataFrame(X)
spectra_df['y'] = y
spectra_df['mixture_names'] = mixture_names

# One 'y_c<i>' column per target column, driven by the actual width of the target
# array instead of eight hand-written assignments.
# NOTE(review): the values are taken from m.test_targets (reported above as object
# dtype), not from the float-converted y_concentrations -- confirm this is intended.
for compound_idx in range(m.test_targets.shape[1]):
    spectra_df['y_c' + str(compound_idx)] = m.test_targets[:, compound_idx]


In [21]:
# Display the assembled table (the notebook shows an abbreviated view of rows and columns).
spectra_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,y,mixture_names,y_c0,y_c1,y_c2,y_c3,y_c4,y_c5,y_c6,y_c7
0,0.256783,0.368530,0.060259,2.168760e-02,1.097927e-02,0.006601,0.004399,0.003139,0.002352,0.001827,...,64,$CH_3CN$,0.0,0.0,0.0,0.398885,0.0,0.0,0.0,0.0
1,0.305939,0.439078,0.071794,2.583926e-02,1.308103e-02,0.007865,0.005242,0.003740,0.002802,0.002177,...,64,$CH_3CN$,0.0,0.0,0.0,0.475244,0.0,0.0,0.0,0.0
2,0.515211,0.739422,0.120903,4.351419e-02,2.202889e-02,0.013245,0.008827,0.006299,0.004719,0.003666,...,64,$CH_3CN$,0.0,0.0,0.0,0.800326,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,...,80,$CH_3Cl$,0.0,0.0,0.191603,0.0,0.0,0.0,0.0,0.0
4,0.000028,0.000002,0.000000,1.255538e-07,1.414462e-10,0.000000,0.000000,0.000000,0.000000,0.000237,...,251,$H_2CO$,0.0,0.0,0.0,0.0,0.0,0.74122,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3270195,0.078775,0.108049,0.019521,7.920825e-03,4.838596e-03,0.003992,0.003562,0.005230,0.003731,0.004684,...,146,$C_2H_5OH$+$CH_3CHO$+$CH_3Cl$+$CH_3CN$+$CH_3OH...,0.393651,0.023135,0.057636,0.113995,0.063093,0.068957,0.128024,0.107504
3270196,0.037216,0.050914,0.009577,3.759751e-03,2.437007e-03,0.002494,0.001970,0.005701,0.003400,0.006084,...,146,$C_2H_5OH$+$CH_3CHO$+$CH_3Cl$+$CH_3CN$+$CH_3OH...,0.422942,0.071482,0.007147,0.053415,0.08788,0.023464,0.174425,0.033699
3270197,0.013880,0.018697,0.003768,1.488650e-03,1.021004e-03,0.001248,0.000914,0.002795,0.001867,0.003608,...,146,$C_2H_5OH$+$CH_3CHO$+$CH_3Cl$+$CH_3CN$+$CH_3OH...,0.390722,0.02881,0.255971,0.019338,0.048891,0.053229,0.155323,0.013345
3270198,0.058426,0.079474,0.014346,5.769751e-03,3.580779e-03,0.003485,0.002561,0.003793,0.002880,0.004558,...,146,$C_2H_5OH$+$CH_3CHO$+$CH_3Cl$+$CH_3CN$+$CH_3OH...,0.39349,0.014279,0.060891,0.083851,0.154919,0.027617,0.142614,0.067553


In [22]:
# NOTE(review): the next cell builds this list from scratch, so this empty
# initialisation looks redundant.
mixtures_df_list = []

In [23]:
# One sub-frame per mixture type, in the same order as `mixture_types`.
mixtures_df_list = [
    spectra_df[spectra_df['mixture_names'] == label]
    for label in mixture_types
]

In [43]:
# Number of spectra kept for every mixture type.
class_cnt = 90

# Dedicated, explicitly seeded generator: without it `DataFrame.sample` draws from
# NumPy's global random state, so the selected rows would depend on everything
# executed before this cell and on how many times the cell has been re-run.
sampling_rng = np.random.RandomState(42)

# Draw `class_cnt` rows (without replacement) from each per-type frame.
# `sample` raises ValueError if a frame holds fewer than `class_cnt` rows.
filtered_mixtures_df_list = [
    df.sample(n=class_cnt, random_state=sampling_rng)
    for df in mixtures_df_list
]

# Stack the per-type samples into one balanced table.
filtered_spectra_df = pd.concat(filtered_mixtures_df_list)

In [None]:
# Display the per-type sampled subset built in the previous cell.
filtered_spectra_df

In [32]:
from datetime import datetime

# Timestamp (dd-mm-YYYY_time_HH-MM-SS) that goes into the saved file's name.
now = datetime.now()
# print("now =", now)
dt_string = now.strftime("%d-%m-%Y_time_%H-%M-%S")
# print("date and time =", dt_string)

# TSMCN-8-L-229_DV_ = THz Spectra of Mixture Classifier Network, 8 compounds, low res,
# 229 features, training (D) and validation (V) data combined.
# The thresholds, the timestamp and the per-class sample count are appended to the name.
data_identifier = 'datasets' + '/' + 'TSMCN-8-L-229_DV_' + '_TAAT_'+ str(TAAT) + '_ASAT_'+ str(ASAT) + '_RSAT_'+ str(RSAT) + '_' + dt_string + '_' + 'class_cnt_' + str(class_cnt)

# `to_pickle` does not create missing directories, so make sure the target exists.
os.makedirs('datasets', exist_ok=True)
filtered_spectra_df.to_pickle(data_identifier + '.pkl')

# notebook ends!