# Featurize previously written spectra using polynomials

### Goals:
- Generate polynomial featurized XY files

## Inputs:
- {}\_{}\_XY.json

## Outputs:
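- {}\_{}\_polynomials\_XY.json (written when `write_pre_post` is False)
- {}\_{}\_polynomials\_pre-post\_XY.json (written when `write_pre_post` is True)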

In [4]:
# Element pairs to process, in processing order: each target metal with oxygen.
target_elements_groups = [
    (metal, 'O') for metal in ('Co', 'Fe', 'V', 'Cu', 'Ni', 'Cr', 'Mn', 'Ti')
]
# The metals on their own, as a set.
target_metals = {'Co', 'Ni', 'Fe', 'Cr', 'V', 'Mn', 'Cu', 'Ti'}
# The same pairs as unordered sets, one per entry of target_elements_groups.
target_elements_sets = [set(group) for group in target_elements_groups]
# True  -> only the pre/post-edge cell does work (the whole-spectrum cell skips).
# False -> only the whole-spectrum cell does work.
write_pre_post = True

In [5]:
%load_ext autoreload
%autoreload 2
from pymatgen.ext.matproj import MPRester
from pymatgen.core import Structure
from pymatgen.analysis.structure_matcher import StructureMatcher, ElementComparator
from trixs.spectra.core import XAS_Spectrum, XAS_Collation
from trixs.spectra.util import NumpyEncoder
import matplotlib
from tqdm import tqdm, tqdm_notebook
from pprint import pprint
import json
import os
import numpy as np
import matplotlib.pyplot as plt
from trixs.spectra.spectrum_featurize import polynomialize_by_idx
from pandas import DataFrame
# Directory the later cells read the per-pair '{}_{}_XY.json' files from and
# write the polynomial-featurized files into.
# NOTE(review): absolute, machine-specific path; change it before running elsewhere.
storage_directory = '/Users/steventorrisi/Documents/TRIXS/data/MP_OQMD_combined'


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Load every pair's records: the '<A>_<B>_XY.json' file in storage_directory
# holds one JSON document per line, and each line becomes one list entry.
data_by_pair = {group: [] for group in target_elements_groups}
for pair in target_elements_groups:
    source_path = storage_directory + '/{}_{}_XY.json'.format(*pair[:2])
    with open(source_path, 'r') as handle:
        data_by_pair[pair] = [json.loads(record) for record in handle]


## Don't split by pre-/post-edge region

In [7]:
polynomials_by_pair = {group: [] for group in target_elements_groups}

# Whole-spectrum featurization: fit polynomials over the full spectrum of every
# record and write one JSON line per record. Every pair is skipped when
# write_pre_post is set.
for pair in target_elements_groups:
    if write_pre_post:
        continue

    for dat in tqdm_notebook(data_by_pair[pair]):
        spectrum = XAS_Spectrum(dat['E'], dat['mu'])
        spectrum.normalize('max')

        coefficients_by_label = {}
        for n_pieces in (1, 2, 4, 5, 10, 20):
            fitted = polynomialize_by_idx(spectrum.x, spectrum.y, N=n_pieces, deg=3)
            for poly in fitted:
                # Each coefficient is keyed by region, the polynomial's own
                # label, and its position in poly.coef.
                prefix = 'loc:all,' + poly.label + ',coef:'
                coefficients_by_label.update(
                    (prefix + str(position), value)
                    for position, value in enumerate(poly.coef)
                )
        dat['labeled_coefficients'] = coefficients_by_label

    target_file = storage_directory + '/{}_{}_polynomials_XY.json'.format(pair[0],pair[1])
    exported_keys = ('labeled_coefficients', 'coordination', 'one_hot_coord', 'bader')
    with open(target_file, 'w') as out:
        for dat in data_by_pair[pair]:
            # Only the features and the target quantities are exported.
            out.write(json.dumps({key: dat[key] for key in exported_keys}) + '\n')
        

# Pre vs. Post-Edge Polynomials

In [8]:
polynomials_by_pair = {pair:[] for pair in target_elements_groups}


def _labeled_coefficients(x, y, loc, n_values=(1, 2, 4, 5, 10)):
    """Return ``{label: coefficient}`` for polynomial fits of one region.

    For every ``n`` in ``n_values`` this calls
    ``polynomialize_by_idx(x, y, N=n, deg=3, label_type='frac')`` and stores
    each entry of every returned polynomial's ``coef`` under the key
    ``'loc:<loc>,<poly.label>,coef:<i>'``.

    Parameters:
        x, y: the region's abscissa and ordinate values.
        loc: region tag written into each label (``'pre'`` or ``'post'``).
        n_values: the ``N`` values passed to ``polynomialize_by_idx``
            (presumably the number of pieces the region is split into;
            TODO confirm against trixs).

    NOTE(review): the recorded output of this cell shows a warning raised
    from a ``_fit`` call; short regions may be under-determined for
    ``deg=3``. Confirm before relying on those coefficients.
    """
    labeled = {}
    for n in n_values:
        for poly in polynomialize_by_idx(x, y, N=n, deg=3, label_type='frac'):
            for i, coef in enumerate(poly.coef):
                labeled['loc:' + loc + ',' + poly.label + ',coef:' + str(i)] = coef
    return labeled


# Pre/post-edge featurization: split each spectrum at its peak index, fit both
# sides separately, and write one JSON line per featurized record. Every pair
# is skipped unless write_pre_post is set.
for pair in target_elements_groups:
    if not write_pre_post:
        continue
    for dat in tqdm_notebook(data_by_pair[pair]):
        # Clear coefficients left by an earlier run of this or the
        # whole-spectrum cell. Otherwise a spectrum rejected by the peak
        # filter below would still be exported with its stale features.
        dat.pop('labeled_coefficients', None)

        cur_spec = XAS_Spectrum(dat['E'], dat['mu'])
        cur_spec.normalize('max')

        peak = cur_spec.get_peak_idx()
        # Only spectra whose peak index lies in [20, 80] are featurized.
        # NOTE(review): presumably assumes ~100-point spectra; confirm.
        if peak < 20 or peak > 80:
            continue

        # Pre-edge labels are inserted first, then post-edge labels.
        labeled_coefficients = _labeled_coefficients(
            cur_spec.x[:peak], cur_spec.y[:peak], 'pre')
        labeled_coefficients.update(_labeled_coefficients(
            cur_spec.x[peak:], cur_spec.y[peak:], 'post'))

        dat['labeled_coefficients'] = labeled_coefficients

    target_file = storage_directory + '/{}_{}_polynomials_pre-post_XY.json'.format(pair[0],pair[1])
    with open(target_file, 'w') as f:
        for dat in data_by_pair[pair]:
            # Records rejected by the peak filter have no coefficients.
            if not dat.get('labeled_coefficients'):
                continue
            write_data = {
                key: dat[key]
                for key in ('labeled_coefficients', 'coordination',
                            'one_hot_coord', 'bader')
            }
            f.write(json.dumps(write_data) + '\n')
                    

HBox(children=(IntProgress(value=0, max=3174), HTML(value='')))

  res = cls._fit(xnew, y, deg, w=w, rcond=rcond, full=full)





HBox(children=(IntProgress(value=0, max=7064), HTML(value='')))




HBox(children=(IntProgress(value=0, max=7076), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2879), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3068), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2534), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9421), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6345), HTML(value='')))


