# **Full feature extraction**

In [1]:
import pandas
import numpy as np
from scipy.interpolate import griddata
from scipy.interpolate import interp1d
from scipy.interpolate import make_splrep
from scipy.integrate import quad
from scipy.stats import kurtosis
t_df = pandas.read_csv('/kaggle/input/sample-1/HistoryOutput.csv')
c_df = pandas.read_csv('/kaggle/input/sample-1/cd_query_1.csv')
out_df = pandas.DataFrame(columns=['t_max', 't*', 'FWHM @ t*', 'FWHM @ 2t*',
                                   'Intg @ t*', 'Intg @ 2t*', 'Derv_1 @ t*', 
                                   'Derv_2 @ t*', 'Derv_1 @ 2t*', 'Derv_2 @ 2t*',
                                   'Depth at t*', 'Depth at 2t*', 'Depth/width at t*', 
                                   'Depth/width at 2t*', 'Kurtosis at t*', 'Kurtosis at 2t*'])

In [2]:
def max_temp_difference(y):
    #Returns the maximum difference in temperature between above defect and not above defect
    max_diff = np.max(y)
    return max_diff

def max_temp_difference_time(x, y):
    #Returns timestep of max_temp_difference
    max_index = np.argmax(y)
    max_x = x[max_index]
    return max_x

def calculate_fwhm(x, y, baseline):
    #returns the width at half max amplitude, as well as 2 x values of half max 
    min_y = np.nanmin(y)
    min_idx = np.argmin(y)

    half_trough_value = min_y + ((baseline - min_y) / 2.0)

    left_indices = np.where(y[:min_idx] <= half_trough_value)[0]
    
    right_indices = np.where(y[min_idx:] <= half_trough_value)[0] + min_idx
    
    x_left = x[left_indices[0]]
    x_right = x[right_indices[-1]]

    fwhm = x_right - x_left
    return fwhm, x_right, x_left

def intersection(x, y, baseline):
    #returns values where curve intersects with baseline
    min_y = np.nanmin(y)
    min_idx = np.argmin(y)
    
    left_intersection = np.where(y[:min_idx] <= baseline)[0]
    right_intersection = np.where(y[min_idx:] <= baseline)[0] + min_idx
    
    x_left = []
    if x[left_intersection[0]] != -50:
        x_left = x[left_intersection]
        x_left = x_left[0]
    elif len(left_intersection) > 1:
        for i in range(len(left_intersection) - 1):
            if abs(x[left_intersection[-i-2]]) - abs(x[left_intersection[-i-1]]) > 2:
                x_left.append(x[left_intersection[-i-1]])
        x_left = x_left[0]
    
    x_right = []
    if x[right_intersection[-1]] != 50:
        x_right = x[right_intersection]
        x_right = x_right[-1]
    elif len(right_intersection) > 1:
        for i in range(len(right_intersection) - 1):
            if abs(x[right_intersection[i+1]]) - abs(x[right_intersection[i]]) > 2:
                x_right.append(x[right_intersection[i]])
        x_right = x_right[0]

    return x_left, x_right

def curve_depth(x, y, baseline):
    #returns depth of curve from baseline to minimum 
    min_y = np.nanmin(y)
    depth = baseline - min_y
    return depth

def d_w_ratio(x, y, baseline):
    #returns ratio of depth and width @ half max amplitude
    fwhm = calculate_fwhm(x, y, baseline)
    w = fwhm[0]
    d = curve_depth(x, y, baseline)
    ratio = d/w
    return ratio
    
def calculate_i_d(x, y, baseline, fwhm, spl):
    #returns integral between curve and baseline and derivative at start and end of curve 
    def integrand(z):
        return abs(baseline-spl(z))

    point = intersection(x, y, baseline)
    result_1, error_1 = quad(integrand, point[0], fwhm[2]/2)
    result_2, error_2 = quad(integrand, fwhm[2]/2, fwhm[2])
    result_3, error_3 = quad(integrand, fwhm[2], fwhm[1])
    result_4, error_4 = quad(integrand, fwhm[1], point[1])
    total_integral = result_1 + result_2 + result_3 + result_4
    
    first_derivative_spline = spl.derivative(nu=1)
    derivative_at_start = first_derivative_spline(point[0])
    derivative_at_end = first_derivative_spline(point[1])
    
    return total_integral, derivative_at_start, derivative_at_end

def calculate_kurtosis(x, y, baseline):
    #returns kurtosis of trough
    limit = intersection(x, y, baseline)
    trough_mask = (x >= limit[0]) & (x <= limit[1])
    trough_data = y[trough_mask]
    k = kurtosis(trough_data, fisher=True)
    return k

In [3]:
def feature_extraction(t_df, c_df, out_df, n_d, n1, n2, n3, n4):
    #data extraction
    x = t_df['Time'].to_numpy()
    y_d = t_df[n_d].to_numpy()
    y_r_1 = t_df[n1].to_numpy()
    y_r_2 = t_df[n2].to_numpy()
    y_r_3 = t_df[n3].to_numpy()
    y_r_4 = t_df[n4].to_numpy()
    arrays_to_average = [y_r_1, y_r_2, y_r_3, y_r_4]
    y_r = np.mean(arrays_to_average, axis=0)
    
    #MAX DIFFERENCE & MAX DIFFERENCE TIMESTAMP
    y = (y_r - y_d)
    
    max_x = max_temp_difference_time(x, y)
    max_diff = max_temp_difference(y)
    
    #common code for both t* and 2t*
    t_df_6 = t_df.T
    c_df_filtered = c_df[(c_df['y'] < 4.5) & (c_df['z'] == 2.5) & (c_df['y'] > -4.5)]
    reference = c_df_filtered[(c_df_filtered['x'] < -20) | (c_df_filtered['x'] > 20)]
    r_x = reference['x'].to_numpy()
    r_id = reference['Node ID'].to_numpy()
    n_id = c_df_filtered['Node ID'].to_numpy()
    x_6 = c_df_filtered['x'].to_numpy()
    sorted_indices = np.argsort(x_6)
    x_6 = x_6[sorted_indices]
    
    #ANALYSIS @ t*
    temp_1 = []
    r_temp_1 = []
    
    for i in range(len(n_id)):
        temp_1.append(t_df.loc[3, str(n_id[i])])
    temp_1 = np.array(temp_1)
    
    for i in range(len(r_id)):
        r_temp_1.append(t_df.loc[3, str(r_id[i])])
    r_temp_1 = np.array(r_temp_1)
    r_value_1 = np.mean(r_temp_1)
    
    temp_1 = temp_1[sorted_indices]
    
    spl_1 = make_splrep(x_6, temp_1, s=0.1)
    x_smooth_1 = np.linspace(-50, 50, 100)
    temp_smoothed_1 = spl_1(x_smooth_1)
    
    fwhm_1 = calculate_fwhm(x_smooth_1, temp_smoothed_1, r_value_1)
    i1, d1_1, d2_1 = calculate_i_d(x_smooth_1, temp_smoothed_1, r_value_1, fwhm_1, spl_1)
    depth_1 = curve_depth(x_smooth_1, temp_smoothed_1, r_value_1)
    d_w_1 = d_w_ratio(x_smooth_1, temp_smoothed_1, r_value_1)
    kurt_1 = calculate_kurtosis(x_smooth_1, temp_smoothed_1, r_value_1)
    
    #ANALYSIS @ 2t*
    temp_2 = []
    r_temp_2 = []
    
    for i in range(len(n_id)):
        temp_2.append(t_df.loc[5, str(n_id[i])])
    temp_2 = np.array(temp_2)
    
    for i in range(len(r_id)):
        r_temp_2.append(t_df.loc[5, str(r_id[i])])
    r_temp_2 = np.array(r_temp_2)
    r_value_2 = np.mean(r_temp_2)
    
    temp_2 = temp_2[sorted_indices]
    
    spl_2 = make_splrep(x_6, temp_2, s=0.005)
    x_smoothed_2 = np.linspace(-50, 50, 100)
    temp_smoothed_2 = spl_2(x_smoothed_2)
    
    fwhm_2 = calculate_fwhm(x_smoothed_2, temp_smoothed_2, r_value_2)
    i2, d1_2, d2_2 = calculate_i_d(x_smoothed_2, temp_smoothed_2, r_value_2, fwhm_2, spl_2)
    depth_2 = curve_depth(x_smoothed_2, temp_smoothed_2, r_value_2)
    d_w_2 = d_w_ratio(x_smoothed_2, temp_smoothed_2, r_value_2)
    kurt_2 = calculate_kurtosis(x_smoothed_2, temp_smoothed_2, r_value_2)
    
    #APPLY TO DATAFRAME
    new_row = pandas.DataFrame([[max_diff, max_x, fwhm_1[0], fwhm_2[0],
                                 i1, i2, d1_1, d2_1, d2_1, d2_2, depth_1, 
                                 depth_2, d_w_1, d_w_2, kurt_1, kurt_2]], 
                               columns=['t_max', 't*', 'FWHM @ t*', 'FWHM @ 2t*', 
                                        'Intg @ t*', 'Intg @ 2t*', 'Derv_1 @ t*', 
                                        'Derv_2 @ t*', 'Derv_1 @ 2t*', 'Derv_2 @ 2t*', 
                                        'Depth at t*', 'Depth at 2t*', 'Depth/width at t*',
                                        'Depth/width at 2t*', 'Kurtosis at t*', 
                                        'Kurtosis at 2t*'])
    out_df = pandas.concat([out_df, new_row], ignore_index=True)
    return out_df

In [4]:
feature_extraction(t_df, c_df, out_df, '191', '142', '1714', '2563', '2566')

  out_df = pandas.concat([out_df, new_row], ignore_index=True)


Unnamed: 0,t_max,t*,FWHM @ t*,FWHM @ 2t*,Intg @ t*,Intg @ 2t*,Derv_1 @ t*,Derv_2 @ t*,Derv_1 @ 2t*,Derv_2 @ 2t*,Depth at t*,Depth at 2t*,Depth/width at t*,Depth/width at 2t*,Kurtosis at t*,Kurtosis at 2t*
0,0.75816,1.1,17.171717,15.151515,12.862717,4.222032,-0.0148603588692515,0.0204365716205486,0.0204365716205486,0.0007167949664321,0.737714,0.224266,0.042961,0.014802,-1.575709,-0.615702
