### References



In [1]:
#Mount Google Drive inside the Colab runtime at /content/drive;
#every dataset path used in this notebook lives under that mount point
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sklearn
import numpy as np
import pandas as pd
import h5py

TRAIN FILE

In [7]:
#Open the emotion-label training file (HDF5) in read-only mode and keep the
#handle in dataMosei_train_ey, then print the type of the returned object
#NOTE(review): no close() call for this handle is visible in the notebook
ey_train_h5_path = "/content/drive/MyDrive/Dissertation/dataset/data/ey_train.h5"
dataMosei_train_ey = h5py.File(ey_train_h5_path, "r")
print(type(dataMosei_train_ey))

<class 'h5py._hl.files.File'>


In [8]:
#An h5py File behaves like a dictionary, so its contents are reached through keys
#Collect the top-level keys into a list in one step; having them in a list
#makes it easy to see which objects the h5 file holds
ey_trainKeys = list(dataMosei_train_ey.keys())

print(ey_trainKeys)

['d1']


In [11]:
#The labels are going to be written to a CSV file, so they are first turned into a DataFrame
#Step 1: read the "d1" entry of the h5 file into a NumPy array
ey_train_array = np.array(dataMosei_train_ey.get("d1"))
#Step 2: wrap the array in a pandas DataFrame (columns get default integer labels)
df_eyTrain = pd.DataFrame(ey_train_array)          #[1]

#Preview the first rows
display(df_eyTrain.head())

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.333333,0.333333,0.333333,0.0,0.0
1,0.333333,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
#Show the column labels of the freshly built DataFrame
ey_train_columns = df_eyTrain.columns
print(ey_train_columns)

RangeIndex(start=0, stop=6, step=1)


In [13]:
#According to [5], the six columns of the ey_train, ey_test and ey_valid h5 files
#hold the emotions in this order:
ey_train_emotion_names = ["Anger", "Disgust", "Fear", "Happy", "Sad", "Surprise"]
#Replace the default integer column labels with those emotion names
df_eyTrain.columns = ey_train_emotion_names     #[2]

#Show the column labels after renaming
print(df_eyTrain.columns)

Index(['Anger', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise'], dtype='object')


In [15]:
#Preview the first rows again, now with the emotion names as column headers
ey_train_preview = df_eyTrain.head()
display(ey_train_preview)

Unnamed: 0,Anger,Disgust,Fear,Happy,Sad,Surprise
0,0.0,0.333333,0.333333,0.333333,0.0,0.0
1,0.333333,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
#Print out the min and max of each column
#The aggregations are given by name ("min"/"max") instead of passing the Python
#builtins min/max: handing builtins to DataFrame.agg is deprecated in recent
#pandas releases (FutureWarning), while the string names produce the same table
print(df_eyTrain.agg(["min", "max"]))            #[6]

     Anger  Disgust      Fear     Happy  Sad  Surprise
min    0.0      0.0  0.000000  0.000000  0.0       0.0
max    3.0      3.0  1.666667  3.333333  3.0       3.0


In [17]:
#Emotion scores in the dataset are expected to range from 0.0 to 3.0:
#0.0 means the emotion is not present,
#3.0 means the emotion is strongly present

#The min/max summary of df_eyTrain reports a maximum of 3.333333 for "Happy",
#which is above the 3.0 ceiling, so the out-of-range "Happy" scores are capped at 3.0

#Boolean mask selecting the rows whose "Happy" score is greater than 3
happy_over_ceiling_train = df_eyTrain["Happy"] > 3
#For exactly those rows, overwrite the "Happy" score with 3
df_eyTrain.loc[happy_over_ceiling_train, "Happy"] = 3           #[4]

In [19]:
#Print out the min and max of each column again
#("min"/"max" are passed as strings because giving the Python builtins to
#DataFrame.agg is deprecated in recent pandas releases; the result is the same)
print(df_eyTrain.agg(["min", "max"]))

print(" ")

#Printing out the number of rows and columns in the DataFrame
print(df_eyTrain.shape)

     Anger  Disgust      Fear  Happy  Sad  Surprise
min    0.0      0.0  0.000000    0.0  0.0       0.0
max    3.0      3.0  1.666667    3.0  3.0       3.0
 
(15290, 6)


In [20]:
#Each of the 6 columns ("Anger", "Disgust", "Fear", "Happy", "Sad", "Surprise")
#holds a score in [0, 3]: 0.0 means the emotion is absent, 3.0 means it is strongly present

#To give the ML model a single target emotion per row, the column with the highest
#score in each row is taken as that row's emotion

#idxmax(axis=1) returns, for every row, the name of the column holding the row maximum
#The names are stored in a list called emoListTrain
#NOTE(review): idxmax picks the FIRST column on ties, so a row whose scores are all 0.0
#is labelled "Anger" - confirm that this is the intended handling of emotion-free rows
emoListTrain = list(df_eyTrain.idxmax(axis=1))     #[10]

#One-hot encode the rows: 1 in the winning emotion's column, 0 in all other columns
#This is done for the whole frame at once and sized from the DataFrame itself; the
#previous row-by-row .loc loop ([7], [8], [9], [11]) used a hard-coded row count,
#which would silently leave rows un-encoded if the dataset size ever changed
winner_positions_train = df_eyTrain.columns.get_indexer(emoListTrain)
one_hot_train = np.zeros(df_eyTrain.shape, dtype=float)
one_hot_train[np.arange(len(df_eyTrain)), winner_positions_train] = 1
#Write the encoded values back into the existing DataFrame (labels are kept)
df_eyTrain.iloc[:, :] = one_hot_train

#Store the one-hot encoded labels as a CSV file (without the index column)
df_eyTrain.to_csv('/content/drive/MyDrive/Dissertation/dataset/csv_files/eTrain.csv', index = False)

print("ey_train.h5 converted to CSV")

ey_train.h5 converted to CSV


TEST FILE

In [21]:
#Open the emotion-label test file (HDF5) in read-only mode
ey_test_h5_path = "/content/drive/MyDrive/Dissertation/dataset/data/ey_test.h5"
dataMosei_test_ey = h5py.File(ey_test_h5_path, "r")

In [22]:
#An h5py File behaves like a dictionary, so its contents are reached through keys
#Collect the top-level keys into a list in one step; having them in a list
#makes it easy to see which objects the h5 file holds
ey_testKeys = list(dataMosei_test_ey.keys())

print(ey_testKeys)

['d1']


In [25]:
#The labels are going to be written to a CSV file, so they are first turned into a DataFrame
#Step 1: read the "d1" entry of the h5 file into a NumPy array
ey_test_array = np.array(dataMosei_test_ey.get("d1"))
#Step 2: wrap the array in a pandas DataFrame (columns get default integer labels)
df_eyTest = pd.DataFrame(ey_test_array)

#Preview the first rows
display(df_eyTest.head())

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.0,0.0,0.666667,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.666667,0.0,0.0


In [26]:
#According to [5], the six columns of the ey_train, ey_test and ey_valid h5 files
#hold the emotions in this order:
ey_test_emotion_names = ["Anger", "Disgust", "Fear", "Happy", "Sad", "Surprise"]
#Replace the default integer column labels with those emotion names
df_eyTest.columns = ey_test_emotion_names

#Preview the first rows with the new column headers
ey_test_preview = df_eyTest.head()
display(ey_test_preview)

Unnamed: 0,Anger,Disgust,Fear,Happy,Sad,Surprise
0,0.0,0.0,0.0,0.666667,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.666667,0.0,0.0


In [27]:
#Print out the min and max of each column
#The aggregations are given by name ("min"/"max") instead of passing the Python
#builtins min/max: handing builtins to DataFrame.agg is deprecated in recent
#pandas releases (FutureWarning), while the string names produce the same table
print(df_eyTest.agg(["min", "max"]))

        Anger  Disgust  Fear     Happy       Sad  Surprise
min  0.000000      0.0   0.0  0.000000  0.000000       0.0
max  2.333333      3.0   1.0  4.333333  2.666667       2.0


In [28]:
#Emotion scores in the dataset are expected to range from 0.0 to 3.0:
#0.0 means the emotion is not present,
#3.0 means the emotion is strongly present

#The min/max summary of df_eyTest reports a maximum of 4.333333 for "Happy",
#which is above the 3.0 ceiling, so the out-of-range "Happy" scores are capped at 3.0

#Boolean mask selecting the rows whose "Happy" score is greater than 3
happy_over_ceiling_test = df_eyTest["Happy"] > 3
#For exactly those rows, overwrite the "Happy" score with 3
df_eyTest.loc[happy_over_ceiling_test, "Happy"] = 3

In [29]:
#Print out the min and max of each column again
#("min"/"max" are passed as strings because giving the Python builtins to
#DataFrame.agg is deprecated in recent pandas releases; the result is the same)
print(df_eyTest.agg(["min", "max"]))

print(" ")

#Printing out the number of rows and columns in the DataFrame
print(df_eyTest.shape)

        Anger  Disgust  Fear  Happy       Sad  Surprise
min  0.000000      0.0   0.0    0.0  0.000000       0.0
max  2.333333      3.0   1.0    3.0  2.666667       2.0
 
(4832, 6)


In [30]:
#Each of the 6 columns ("Anger", "Disgust", "Fear", "Happy", "Sad", "Surprise")
#holds a score in [0, 3]: 0.0 means the emotion is absent, 3.0 means it is strongly present

#To give the ML model a single target emotion per row, the column with the highest
#score in each row is taken as that row's emotion

#idxmax(axis=1) returns, for every row, the name of the column holding the row maximum
#The names are stored in a list called emoListTest
#NOTE(review): idxmax picks the FIRST column on ties, so a row whose scores are all 0.0
#is labelled "Anger" - confirm that this is the intended handling of emotion-free rows
emoListTest = list(df_eyTest.idxmax(axis=1))

#One-hot encode the rows: 1 in the winning emotion's column, 0 in all other columns
#This is done for the whole frame at once and sized from the DataFrame itself; the
#previous row-by-row .loc loop ([7], [11]) used a hard-coded row count, which would
#silently leave rows un-encoded if the dataset size ever changed
winner_positions_test = df_eyTest.columns.get_indexer(emoListTest)
one_hot_test = np.zeros(df_eyTest.shape, dtype=float)
one_hot_test[np.arange(len(df_eyTest)), winner_positions_test] = 1
#Write the encoded values back into the existing DataFrame (labels are kept)
df_eyTest.iloc[:, :] = one_hot_test

#Store the one-hot encoded labels as a CSV file (without the index column)
df_eyTest.to_csv('/content/drive/MyDrive/Dissertation/dataset/csv_files/eTest.csv', index = False)

print("ey_test.h5 converted to CSV")

ey_test.h5 converted to CSV


VALID FILE

In [31]:
#Open the emotion-label validation file (HDF5) in read-only mode
ey_valid_h5_path = "/content/drive/MyDrive/Dissertation/dataset/data/ey_valid.h5"
dataMosei_valid_ey = h5py.File(ey_valid_h5_path, "r")

In [32]:
#An h5py File behaves like a dictionary, so its contents are reached through keys
#Collect the top-level keys into a list in one step; having them in a list
#makes it easy to see which objects the h5 file holds
ey_validKeys = list(dataMosei_valid_ey.keys())

print(ey_validKeys)

['d1']


In [33]:
#The labels are going to be written to a CSV file, so they are first turned into a DataFrame
#Step 1: read the "d1" entry of the h5 file into a NumPy array
ey_valid_array = np.array(dataMosei_valid_ey.get("d1"))
#Step 2: wrap the array in a pandas DataFrame (columns get default integer labels)
df_eyValid = pd.DataFrame(ey_valid_array)

#Preview the first rows
display(df_eyValid.head())

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.0,0.0,0.333333,0.0,0.0
1,0.0,0.0,0.0,0.0,0.333333,0.0
2,0.0,0.0,0.0,0.666667,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,2.666667,0.0,0.0


In [34]:
#According to [5], the six columns of the ey_train, ey_test and ey_valid h5 files
#hold the emotions in this order:
ey_valid_emotion_names = ["Anger", "Disgust", "Fear", "Happy", "Sad", "Surprise"]
#Replace the default integer column labels with those emotion names
df_eyValid.columns = ey_valid_emotion_names

#Preview the first rows with the new column headers
ey_valid_preview = df_eyValid.head()
display(ey_valid_preview)

Unnamed: 0,Anger,Disgust,Fear,Happy,Sad,Surprise
0,0.0,0.0,0.0,0.333333,0.0,0.0
1,0.0,0.0,0.0,0.0,0.333333,0.0
2,0.0,0.0,0.0,0.666667,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,2.666667,0.0,0.0


In [35]:
#Print out the min and max of each column
#The aggregations are given by name ("min"/"max") instead of passing the Python
#builtins min/max: handing builtins to DataFrame.agg is deprecated in recent
#pandas releases (FutureWarning), while the string names produce the same table
print(df_eyValid.agg(["min", "max"]))

print(" ")

#Printing out the number of rows and columns in the DataFrame
print(df_eyValid.shape)

        Anger   Disgust      Fear  Happy  Sad  Surprise
min  0.000000  0.000000  0.000000    0.0  0.0  0.000000
max  2.666667  2.666667  1.666667    3.0  3.0  2.333333
 
(2291, 6)


In [36]:
#Each of the 6 columns ("Anger", "Disgust", "Fear", "Happy", "Sad", "Surprise")
#holds a score in [0, 3]: 0.0 means the emotion is absent, 3.0 means it is strongly present

#To give the ML model a single target emotion per row, the column with the highest
#score in each row is taken as that row's emotion

#idxmax(axis=1) returns, for every row, the name of the column holding the row maximum
#The names are stored in a list called emoListValid
#NOTE(review): idxmax picks the FIRST column on ties, so a row whose scores are all 0.0
#is labelled "Anger" - confirm that this is the intended handling of emotion-free rows
emoListValid = list(df_eyValid.idxmax(axis=1))

#One-hot encode the rows: 1 in the winning emotion's column, 0 in all other columns
#This is done for the whole frame at once and sized from the DataFrame itself; the
#previous row-by-row .loc loop used a hard-coded row count, which would silently
#leave rows un-encoded if the dataset size ever changed
winner_positions_valid = df_eyValid.columns.get_indexer(emoListValid)
one_hot_valid = np.zeros(df_eyValid.shape, dtype=float)
one_hot_valid[np.arange(len(df_eyValid)), winner_positions_valid] = 1
#Write the encoded values back into the existing DataFrame (labels are kept)
df_eyValid.iloc[:, :] = one_hot_valid

#Store the one-hot encoded labels as a CSV file (without the index column)
df_eyValid.to_csv("/content/drive/MyDrive/Dissertation/dataset/csv_files/eValid.csv", index = False)

print("ey_valid.h5 converted to CSV")

ey_valid.h5 converted to CSV
