In [1]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
%matplotlib inline

# Silence all warnings notebook-wide (note: this also hides pandas chained-assignment and deprecation warnings)
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [5]:
def filtering_features_pipeline(data, target_file, temp_increase_vars, extra_rows_to_drop=(337, 2533)):
    '''
    Function to run all of our preprocessing functionalities, with a few different configuration options.
    Will save a new dataframe as csv file in a specified address.

    The input frame is never modified: every step works on a copy, so the same raw
    frame can be passed to several calls with different options.

    Steps, in order:
      1. drop the 'Unnamed: 0', 'Date.Time' and 'Pressure' columns
      2. drop rows where Main_Mass_Flow == 0 and the rows labelled in extra_rows_to_drop
      3. replace T_Zone_1 values <= 50 with the median of the values > 50
      4. replace Blending values <= 20 with the median of the values > 20
      5. keep only rows with Main_Mass_Flow > 15000
      6. optionally add the binary T_Increase_1 ... T_Increase_9 columns
      7. write the result (including its index) to target_file

    INPUTS:
    data: a pandas dataframe containing the raw data. Must contain the columns
          'Unnamed: 0', 'Date.Time', 'Pressure', 'Main_Mass_Flow', 'T_Zone_1' and 'Blending'
          (and 'T_Zone_1' ... 'T_Zone_10' when temp_increase_vars is True)
    target_file: a file name and address to save the preprocessed data frames. expects .csv extension
    temp_increase_vars: a boolean value (True or False) on whether or not to create the binary temperature increase variables
    extra_rows_to_drop: index labels of additional rows to remove in step 2.
          Labels that are not present in the frame are ignored. Defaults to (337, 2533)

    OUTPUTS:
    data: a data frame with the preprocessed data (original index labels are kept)
    file saved... the preprocessed data frame saved as a csv file at the specified file extension

    RAISES:
    KeyError: if one of the required columns is missing
    '''

    # drop() without inplace returns a new frame, so the caller's data stays untouched
    cleaned = data.drop(columns=['Unnamed: 0', 'Date.Time', 'Pressure'])

    # Select rows by boolean mask on the index labels; mixing positional indices from
    # np.where with the label-based drop() only works on a default RangeIndex.
    keep = (cleaned['Main_Mass_Flow'] != 0) & ~cleaned.index.isin(list(extra_rows_to_drop))
    cleaned = cleaned.loc[keep].copy()

    # Replace implausibly low readings with the median of the plausible ones.
    # A single .loc[mask, column] assignment is used instead of chained indexing
    # (frame.col.iloc[...] = value), which does not write back under pandas Copy-on-Write.
    for column, threshold in (('T_Zone_1', 50), ('Blending', 20)):
        plausible_median = cleaned.loc[cleaned[column] > threshold, column].median()
        cleaned.loc[cleaned[column] <= threshold, column] = plausible_median

    # .copy() so the new columns below are added to an independent frame, not a slice
    cleaned = cleaned[cleaned['Main_Mass_Flow'] > 15000].copy()

    if temp_increase_vars:
        # T_Increase_i is 1 when zone i+1 is at least as hot as zone i, else 0
        for i in range(1, 10):
            current_zone = "T_Zone_" + str(i)
            next_zone = "T_Zone_" + str(i + 1)
            cleaned["T_Increase_" + str(i)] = (cleaned[current_zone] <= cleaned[next_zone]).astype(int)

    cleaned.to_csv(target_file)
    return cleaned

        
    


In [6]:
def scale_training(data, scaler):
    '''
    A function that will perform scaling on training data given that all features are already created.
    Every column of the frame is scaled; the input frame itself is left unchanged.

    INPUTS:
    data: a pandas dataframe to be scaled
    scaler: a scaling protocol: "standard" (StandardScaler) or "minmax" (MinMaxScaler)

    RETURNS:
    scaler: the fitted scaler object, which can be used to scale testing data
    scaled_features: a scaled pandas dataframe object with the same index and columns as data

    RAISES:
    ValueError: if scaler is not "standard" or "minmax"
    '''
    # Validate the protocol name first so a bad value fails before any fitting work.
    if scaler == "standard":
        scaler_class = StandardScaler
    elif scaler == "minmax":
        scaler_class = MinMaxScaler
    else:
        raise ValueError('Unknown scaler {!r}; expected "standard" or "minmax"'.format(scaler))

    # Fit on the raw values and rebuild a frame so index and column labels are preserved.
    fitted_scaler = scaler_class().fit(data.values)
    scaled_features = pd.DataFrame(
        fitted_scaler.transform(data.values),
        index=data.index,
        columns=data.columns,
    )

    return fitted_scaler, scaled_features

In [7]:
# Load the raw CSV into a DataFrame; the path is relative to the kernel's working directory.
data = pd.read_csv("../../../datasets/anonymized_SAP_data.csv")


In [8]:
# Run the preprocessing with the binary T_Increase_* columns enabled and save it to CSV;
# the returned frame is this cell's displayed output.
filtering_features_pipeline(
    data=data,
    target_file='./testcsv/with_tempinc.csv',
    temp_increase_vars=True,
)

Unnamed: 0,Main_Mass_Flow,Additive_1_Ratio,Additive_2_Ratio,Additive_3_Ratio,Additive_4_Ratio,Additive_5_Ratio,Additive_6_Ratio,Flow_Gas_Ratio,T_Zone_1,T_Zone_2,...,Quality,T_Increase_1,T_Increase_2,T_Increase_3,T_Increase_4,T_Increase_5,T_Increase_6,T_Increase_7,T_Increase_8,T_Increase_9
0,17947.958984,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.534770,200.130234,187.366165,...,45.0750,0,1,0,1,1,0,1,1,0
1,17942.625000,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.524599,200.140137,187.729889,...,44.6825,0,1,0,1,1,0,1,1,0
2,17955.152344,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.528997,199.947128,187.754242,...,44.2900,0,1,0,1,1,0,1,1,0
3,17965.117188,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.519700,199.723190,187.727036,...,44.4275,0,1,0,0,1,0,1,1,0
4,17949.132812,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.525926,199.627060,187.351547,...,44.5650,0,1,0,0,1,0,1,1,0
5,17958.937500,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.521689,199.841858,187.415497,...,44.7025,0,1,0,1,1,0,1,1,0
6,17957.164062,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.510841,199.586136,187.919907,...,44.8400,0,1,0,1,1,0,1,1,0
7,17964.480469,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.503754,199.705261,187.216400,...,44.6050,0,1,0,1,1,0,1,1,0
8,17945.482422,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.526086,199.523193,187.685547,...,44.3700,0,1,0,1,1,0,1,1,0
9,17957.333984,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.524996,198.965714,187.423965,...,44.1350,0,1,0,1,1,0,1,1,0


In [10]:
# Start again from a freshly loaded raw frame, then run the preprocessing
# without the T_Increase_* columns and save it to a second CSV.
data = pd.read_csv("../../../datasets/anonymized_SAP_data.csv")
filtering_features_pipeline(
    data=data,
    target_file='./testcsv/no_tempinc.csv',
    temp_increase_vars=False,
)

Unnamed: 0,Main_Mass_Flow,Additive_1_Ratio,Additive_2_Ratio,Additive_3_Ratio,Additive_4_Ratio,Additive_5_Ratio,Additive_6_Ratio,Flow_Gas_Ratio,T_Zone_1,T_Zone_2,T_Zone_3,T_Zone_4,T_Zone_5,T_Zone_6,T_Zone_7,T_Zone_8,T_Zone_9,T_Zone_10,Blending,Quality
0,17947.958984,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.534770,200.130234,187.366165,197.865295,175.733353,177.881012,180.187149,175.288040,181.461487,184.349884,178.632538,24.0,45.0750
1,17942.625000,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.524599,200.140137,187.729889,198.242508,175.395889,176.092880,178.021576,173.222076,180.226273,183.611710,177.633362,24.0,44.6825
2,17955.152344,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.528997,199.947128,187.754242,197.993622,175.388733,176.331268,178.255264,173.343567,179.927109,183.406296,177.319366,24.0,44.2900
3,17965.117188,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.519700,199.723190,187.727036,198.104874,175.564926,175.403198,177.945908,172.337830,178.773697,182.526581,176.517807,24.0,44.4275
4,17949.132812,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.525926,199.627060,187.351547,197.611877,174.955109,174.076019,176.105621,170.777313,177.310883,181.188675,175.847260,24.0,44.5650
5,17958.937500,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.521689,199.841858,187.415497,197.982666,175.351517,177.013184,179.443481,173.695526,180.406509,183.772873,177.445740,24.0,44.7025
6,17957.164062,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.510841,199.586136,187.919907,197.968781,175.379974,175.603760,177.872345,172.558044,179.445633,182.891022,176.639999,24.0,44.8400
7,17964.480469,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.503754,199.705261,187.216400,197.755157,175.412949,177.414581,179.766891,174.333267,180.639801,183.522079,177.869278,24.0,44.6050
8,17945.482422,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.526086,199.523193,187.685547,198.015335,175.507095,176.613068,178.894928,173.799133,180.573303,183.926483,177.449921,24.0,44.3700
9,17957.333984,0.000626,0.053146,0.000876,0.001,0.000910,0.00525,2.524996,198.965714,187.423965,197.850113,175.680801,176.342468,178.755920,173.923035,180.565399,183.530136,177.962219,24.0,44.1350
