# CREATE A DATA SET

This program organize the data retrieved by the different sensors. As each sensor creates its own file, it is necessary to merge all the data in one only file so there only exists one dataset per experiment.


- **HR.txt:**  this file contains the data retrieved from the Intel RealSense camera.
- **BR.csv:**  this file contains the data retrieved from the Xethru X2m200 radar.
- **_date_.json:**  this file contains the data retrieved from the Windows Emotion API corresponding to one picture. There is one of this for each picture taken during the test.

Each of these files has, for each row, i.e., for each sample, a timestamp. The different measurements are joined according to the timestamp. Also, each sample is labeled according to the moment where it was taken. There is one label for each block of the test:

- ['pre',1,2,3,4,5,6,'post']

#### _Needed packages_

In [5]:
import datetime
import glob, os
import json
from collections import OrderedDict
import csv
import pandas as pd
import numpy as np

#### _Functions: parsing the datafiles_


_Functions to copy the data in the wanted format_

- **HR(file):**

    Its input is a .txt file with the data in format:
        hr;timestamp
    where:
        hr: heart rate
        timestamp: secs since 1st January 1970
    
    outputs a file:
        date;HR
        AAAAMMDDHHMMSS;hr

As the frequency in which the heart rate is higher than the rest, the samples corresponding to the same second are averaged.
   
        
- **BR(file):**

    Its input is a .csv file with the data in format:
    
        TimeStamp;State;RPM;ObjectDistance;ObjectMovement;SignalQuality
        
    where only TimeStamp and RPM are interesting for this project.
        
        - TimeStamp: time at the moment when the sample was taken. It has this format: 2017-04-10T16:17:20.956
        - RPM: breathing rate
        
    output file:
        date;BR
        AAAAMMDDHHMMSS;br

    
    
- **emotions(file):**
    
    Its input is a .json file containing the scores for each emotion. This function extracts the scores corresponding to these emotions. The name of the .json file is the timestamp already in the common format. The program takes the name of the file to write the timestamp.
    
    output file:
        date;anger;contempt;disgust;fear;happiness;neutral;sadness;urprise

    
---------------------------------------------------------------------------------------------------------------------
For the three of them the input is a file to analyze. The function writes the content of the file to a .csv with the date written in the wanted format:

      date: common format for the date AAAAMMDDHHMMSS

In [2]:
def HR(file,folder):
    with open(file) as current_file:
        for line in current_file:
            #From the .txt file where the data is stored as hr;date, split both data in two different str.
            #Convert the date to the common format
            hr, date = line.split(";", 1);                           
            date = datetime.datetime.fromtimestamp(int(date)).strftime('%Y%m%d%H%M%S')
            #Discard the rows where the camera was not able to detect the pulse.
            if (not line.startswith("0")) and (not line.startswith("-1")):                
                #Write the formatted information to a new .csv file: date(common format);hr.
                HR_file = open(folder+'/HR.csv',"a");
                HR_file.write(date + ";" + hr + '\n');
                HR_file.close();
            else:
                HR_file = open(folder+'/HR.csv',"a");
                HR_file.write(date + ";" + '0' + '\n');
                HR_file.close();   
                
    avg_duplicates_HR('/HR.csv',folder)
                 

def BR(file,folder):
    with open(file) as current_file:
        for line in current_file:
            #The first lines of this file are comments not useful for the dataset, so they are discarded.
            if (line[0].isdigit()):
                #Each line has a 6 columns with different data. For the dataset only the date and the BR are needed.
                data = line.split(";",5);                 
                #Retrieve the date and give it the common format
                time= data[0];
                time = time.replace("-","");
                time = time.replace("T", "");
                time = time.replace(":","");
                time, rest = time.split(".",1)
                #Retrieve the rpm                                
                rpm= data[2];
                #Filter the data: if no respiration was detected, the row is discarded.
                #if rpm!='0':
                 #   BR_file = open(folder+'/BR.csv',"a");
                 #   BR_file.write(time + ";" + rpm + '\n');
                 #   BR_file.close();
                #rpm is a string                   


def emotions(file,folder):
    #The date is retrieved from the file's name and it is compared as a string to the vector of times c. The label
    #is then calculated
    date=os.path.splitext(file)[0];
    date = int(float(date));
#    lbl=label(c,date);
    
    with open(file) as json_file:
        
        data = json.load(json_file);
        mstr = ''.join(map(str, data));
        #if the Windows API fails to detect emotions, the file is discarded. When it fails to detect emotions it
        #sends a file with the string "[]" in it.
        if (mstr.startswith("{u'faceRectangle")):
            emotions=data[0]['scores'];
            #scores is a list
            scores = emotions.values();
            scores.insert(0, date);
#            scores.insert(len(scores),lbl)
            out = csv.writer(open(folder+'/emotions.csv','ab'), delimiter=';')
            out.writerow(scores);
        
        else:
            scores = [0] * 8
            scores.insert(0, date);
#            scores.insert(len(scores),lbl)
            out = csv.writer(open(folder+'/emotions.csv','ab'), delimiter=';')
            out.writerow(scores);

#### _Functions: auxiliar functions_

- **c_file(file):** this function retrieves the data from the times of the stroop test. This timestamps will be used to label the data.



- **openFile_writeHeader(folder):** open three new files in the folder specified as input with the correspondant headers for the data. The folder will be by default created in 'c:/users/Anana/Desktop/Ana/sensores/Data/'. The name of the files are also set by default as:


    - HR.cvs
    - BR.cvs
    - emotions.cvs
    

- **load_cvs(folder):** loads the three files with the data already stored in the right format. 

In [1]:
def c_file(folder):
    c=[0,0,0,0,0,0,0,0];
    #The time file from  the stroop test is loaded.
    with open(folder+'/c.txt') as c_f:
        i = 0;
        #The time is stored as a cell of strings. It is needed to give the common format to each time stored.
        for line in c_f:
            
            a = line + '\n';
            a = a.replace(" ","");
            a = a.replace(";","");
            a = a[:-1];
            c[i]=int(a);
            
            i=i+1;  
    return c;


def label(c,date):
    
    if date<c[0]:
        label='pre';
    elif c[0]<=date<c[1]:
        label='1';
    elif c[1]<=date<c[2]:
        label='2';
    elif c[2]<=date<c[3]:
        label='3';
    elif c[3]<=date<c[4]:
        label='4';
    elif c[4]<=date<c[5]:
        label='5';
    elif c[5]<=date<c[6]:
        label='6';
    elif c[6]<=date:
        label='post';
   
    return label
    
    
def openFile_writeHeader(folder):
    with open(folder+'/HR.csv', "wb") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames = ["date", "HR"], delimiter = ';')
        writer.writeheader()
    with open(folder+'/BR.csv', "wb") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames = ["date", "BR"], delimiter = ';')
        writer.writeheader()
    with open(folder+'/emotions.csv', "wb") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames = ["date", "sadness", "neutral", "contempt", "disgust", "anger",
                                                       "surprise", "fear", "happiness"], delimiter = ';')
        writer.writeheader()
        
def load_csv(folder):
    
    first = pd.read_csv(folder+'/HR.csv', delimiter = ',');
    second = pd.read_csv(folder+'/BR.csv', delimiter = ';');
    third = pd.read_csv(folder+'/emotions.csv', delimiter = ';');
    
    return first, second, third


##### Combine samples with same dates by averaging the value of HR.

In [5]:
def avg_duplicates_HR(file,folder):
    df = pd.read_csv(folder+file, delimiter = ';');
    grouped = df.groupby('date')['HR'].mean().reset_index()
    grouped.to_csv(folder+file,index=False)

#### _Function: main function_

This function is the main function:

1. Creates 3 .csv files in the folder specified (c:/users/Anana/Desktop/Ana/sensores/Data/'+folder), one for each sensor.


2. Reads every file on the specifiec folder and, depending on if they are emotion, HR or BR (the classification is made either by exention or by name) it calls one of the three functions to parse each of them.


3. Reloads the 3 .csv files and merges them.

In [6]:
def create_DataSet(folder, ident,id):
#    folder='c:/users/Anana/Desktop/Ana/sensores/Data/SSST/abhiSSST'
    #create the three initial files where to copy all the sensor info
    openFile_writeHeader(folder);
    #get the times of the test and format them
    c=c_file(folder);
    
    
    #--------------------------------------------------------------
    os.chdir(folder)
    #for each file in the folder where all data from a subject are stored, format it.
    for file in glob.glob("*.*"):
        #Separate name and extension, as according to them they will be treated in a different way.
        ext = os.path.splitext(file)[-1].lower();
        name = os.path.splitext(file)[0];
    
        if name.startswith('xethru'):
            BR(file,folder);
            #print 1;
    
    
        elif (ext == ".txt") and (name!='c'):
            HR(file,folder);
            #print 1;
    
    
        elif ext == ".json":
            emotions(file,folder);
            #print 1;
    #--------------------------------------------------------------
    #Load the three files as csv.
    
    first, second, third=load_csv(folder);
    #--------------------------------------------------------------
    #Merge the files and delete the rows without data.
    
    merged = pd.merge(first, second, how='left', on='date')
    #merged.replace(["NaN"], np.nan, inplace = True)
    merged.fillna(0)
    #merged = merged.dropna()
    merged.to_csv(folder+'/dataSet.csv', index=False)
    
    merged = pd.read_csv(folder+'/dataSet.csv', delimiter = ',');
    merged = pd.merge(merged, third, how='left', on='date')
    #merged.replace(["NaN"], np.nan, inplace = True)
    merged.fillna(0)
    #merged = merged.dropna()
     
    labels = []
    s = merged['date']
    
    for j in range(0,len(s)):
        date = s[j]
        labels.append(label(c,date))
    
        
    merged['label'] = labels
    merged['id']= str(id)
    merged['subject'] = ident
    merged.to_csv('dataSet.csv', index=False)

#    df = pd.read_csv(folder + 'dataSet.csv')



#### Join the new data to the main dataset with all of the data

In [8]:
def join_main_ds(file_ds,new_ds):
    #Abrir el ds que ya existe    
    main_ds = open(file_ds, 'a')
    #Para cada línea del nuevo DS append la linea al anterior.
    with open(new_ds) as new:
        for line in new:
            if (line[0].isdigit()):
                main_ds.write(line);
    #cerrarlos los dos.
    main_ds.close();

## Main Program

### Here the function 'create_DataSet' is called

In [10]:
dir_data = 'c:/users/Anana/Desktop/Ana/sensores/Data/SSST/zHECHOS/'

a = get_immediate_subdirectories(dir_data)
i = 0;
for name in a:
    if(not name.startswith("z")):
        create_DataSet(dir_data+name,name,i);
        join_main_ds(dir_data+'main_dataSet_SSST_zeros.csv',dir_data+name+'/dataSet.csv');
        i= i+1;

NOT FILLING UP WITH ZEROS ANYMORE. INTERPOLATION COMES LATER

In [122]:
file = 'c:/users/Anana/Desktop/Ana/sensores/data/SSST/main_dataSet_SSST.csv'

df = pd.read_csv(file, delimiter = ',');
df_aux = df.fillna(value=0, method=None, inplace=False)
df_aux.to_csv('c:/users/Anana/Desktop/Ana/sensores/data/SSST/main_dataSet_SSST.csv', index=False)

In [22]:
dir_data = 'c:/users/Anana/Desktop/Ana/sensores/Data/SSST/'

a = get_immediate_subdirectories(dir_data)
i = 0;
for name in a:
    if(not name.startswith("z")):
        create_DataSet(dir_data+name,name,i);
        join_main_ds(dir_data+'main_dataSet_SSST.csv',dir_data+name+'/dataSet.csv');
        i= i+1;

# PRE-PROCESSING THE DATA

The dataset created is loaded. Then different processing tasks are held:

 - Take the data corresponding to the calm and stressed part. After analyzing all of the blocks of the test, the 2nd block is the one with the most changes compared to the stress part, so it is the one chosen
 - Emotions: label as 1 the emotion with the highest score and as 0 the rest of them
 - Heart Rate and Breating rate: normalize the data
 - Add new features:
 
        - delta: substracts the mean value of the feature when the person is calm to the value
        - variability: difference between a sample and the previous one

In [None]:
file_path = 'c:/users/Anana/Desktop/Ana/sensores/Data/SSST/main_dataSet_SSST_zeros.csv'
dataframe = pd.read_csv(file_path, delimiter=',')

#### Take the labeled data.

In [None]:
df = pd.DataFrame()
aux1 = dataframe.loc[dataframe['label']=='2']
aux2 = dataframe.loc[dataframe['label']=='6']
df = df.append(aux1)
df = df.append(aux2)

#### Emotions feature transformation

In [None]:
#takes only the emotions
emo = df[['sadness', 'neutral', 'contempt', 'disgust', 'anger', 'surprise', 'fear', 'happiness']]
emo_aux = emo.values
#array with the index of the highest emotion value
emo_max = emo_aux.argmax(axis=1)
#create an array with all zeros
emo_todo = np.zeros_like(emo_aux)

#put a 1 where the highest emotion
for row,value in enumerate(emo_max):
    emo_todo[row,value]=1

#add the columns to the dataset
df_emo = pd.DataFrame(emo_todo)
df[['sad', 'neu', 'con', 'dis', 'ang', 'sur', 'fea', 'hap']]=df_emo

#### Normalize BR and HR

In [None]:
tamano = len(df)
max_listHR = []
min_listHR = []
max_listBR = []
min_listBR = []

arrayHR = np.zeros(tamano)
arrayBR = np.zeros(tamano)

posicion = 0

#for each person take the min and the max values of their calm part, for BR and HR
for id in df.id.unique():
    df_id = df.loc[df['id']==id]
    df_rest = df_id.loc[df['label']=='2']

    max_calm_HR = df_rest['HR'].max()
    max_listHR.append(max_calm_HR)
    
    min_calm_HR = df_rest['HR'].min()
    min_listHR.append(min_calm_HR)
    
    max_calm_BR = df_rest['BR'].max()
    max_listBR.append(max_calm_BR)
    
    min_calm_BR = df_rest['BR'].min()
    min_listBR.append(min_calm_BR)


#this has to be ordered by labels as the dataset structure is by labels and no by ids.
for label in df.label.unique():
    df_label = df.loc[df['label']==label]
    
    for i,id in enumerate(df.id.unique()):
        df_id = df_label.loc[df['id']==id]
        c = df_id['HR'].values
        #print(c)
        max_calm_HR = max_listHR[i]
        min_calm_HR = min_listHR[i]
        deltaHR = max_calm_HR-min_calm_HR
        arrayHR[posicion:(len(df_id)+posicion)]=(c-min_calm_HR)/(deltaHR)

        d = df_id['BR'].values
        print(d)
        max_calm_BR = max_listBR[i]
        min_calm_BR = min_listBR[i]
        deltaBR = max_calm_BR-min_calm_BR
        print (deltaBR)
        arrayBR[posicion:(len(df_id)+posicion)]=(d-min_calm_BR)/(deltaBR)
        print (arrayBR)

        posicion = len(df_id) + posicion
        #print(deltaHR)

df['HRbias']=arrayHR
df['BRbias']=arrayBR

#### Adding the new features

In [None]:
def add_RV(df, nam):
        
    #new list
    delta = []
    
    #for each person
    for j in df.id.unique():
        #take only that person's data
        aux = df.loc[df['id'] == j]
        #put it as an array
        aux_arr = np.asarray(aux[nam])
        init = aux_arr[0]
        delta_j = np.zeros((len(aux_arr)))
        for y in range(0,len(aux_arr)):
            delta_j[y] = aux_arr[y]-init
            init = aux_arr[y]
        delta = delta + delta_j.tolist()

    new_column = nam+'V'  
    df[new_column] = delta

    return df

df = add_RV(df,'HRbias')
df = add_RV(df,'BRbias')

def add_delta(df, nam):
    #Declare new list delta, which will be the new column of the dataset
    delta = []
    #a: groub the column which delta we are calculating by label and id and we perform the mean
    a = df[nam].groupby([df['label'], df['id']]).mean().unstack()
    #for every person in the dataset
    for j in list(a):
        #we take the mean of the pre State so we have a reference
        HR_pre = a[j][0]
        #we take the subset corresponding to the person
        aux = df.loc[df['id'] == j]
        #as an array
        df_arr = np.asarray(aux[nam])
        #we substract so we have the delta
        delta_j = df_arr-HR_pre
        #we add it to the new column
        delta = delta + delta_j.tolist()
    #the new column name
    new_column = 'delta' + nam
    #put the new column to the dataframe
    df[new_column] = delta
    return df

df = add_delta(df,'HRbias')
df = add_delta(df,'BRbias')

#### Save de dataframe to a csv file

In [None]:
df.to_csv('c:/users/Anana/Desktop/Ana/sensores/data/SSST/work_main_complete_zeros.csv',index = False)

In [8]:
df = pd.read_csv('c:/users/Anana/Desktop/Ana/sensores/data/SSST/work_main_complete_zeros.csv', delimiter=',')
df

Unnamed: 0,id,label,HR,BR,sadness,neutral,contempt,disgust,anger,surprise,...,con,dis,ang,sur,fea,hap,HRbiasV,BRbiasV,deltaHRbias,deltaBRbias
0,0,2,56.575800,24.0,4.391070e-03,9.465286e-01,3.752844e-02,1.016570e-04,9.620000e-05,4.670000e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,-0.426074,0.678571
1,0,2,55.020467,23.0,1.915034e-03,8.085518e-01,1.631041e-01,1.865650e-04,3.070870e-04,1.797370e-04,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.029994,-0.083333,-0.456068,0.595238
2,0,2,83.097700,12.0,1.745249e-03,9.503377e-01,3.192503e-02,8.510000e-05,1.444720e-04,1.905270e-04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.541461,-0.916667,0.085393,-0.321429
3,0,2,67.000000,13.0,3.007700e-04,9.614010e-01,2.724849e-02,1.110000e-05,3.180000e-05,2.700000e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.310440,0.083333,-0.225046,-0.238095
4,0,2,91.500000,13.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.472476,0.000000,0.247429,-0.238095
5,0,2,90.618733,13.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.016995,0.000000,0.230434,-0.238095
6,0,2,106.875000,13.0,1.061392e-03,9.599540e-01,2.837119e-02,2.850000e-05,5.330000e-05,5.440000e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.313498,0.000000,0.543932,-0.238095
7,1,2,75.732733,13.0,2.675438e-02,9.718252e-01,1.036257e-03,1.460000e-05,2.670000e-05,8.450000e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.886786,0.583333,-0.342855,0.345238
8,1,2,70.614833,12.0,1.894785e-02,9.791439e-01,1.330475e-03,3.080000e-05,7.150000e-05,1.920080e-04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.104860,0.000000,-0.237995,0.345238
9,1,2,88.040900,13.0,3.538721e-03,9.962018e-01,1.644520e-04,2.370000e-06,8.150000e-06,6.250000e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.032395,0.000000,-0.205600,0.345238
