In [49]:
import tensorflow as tf
import pandas as pd
from pathlib import Path
from sklearn.metrics import cohen_kappa_score, accuracy_score
import pingouin as pg

In [50]:
# Read the raw ratings CSV into the working DataFrame.
ratings_csv = '../../DataFolder/ProsodyData/data_ratings_final.csv'
data_table = pd.read_csv(ratings_csv)

In [51]:
# Point every row at its feature file: swap '.wav' in the name for '.tfs'.
data_table['filename'] = data_table['filename'].map(
    lambda wav_name: wav_name.replace('.wav', '.tfs')
)

In [52]:
# For every row, check that the student's feature file (name.tfs) and the
# passage's reference file (<passage_id>.tfs) exist on disk, and record the
# student / passage ids parsed from the filename.
data_folder = Path('../../DataFolder/ProsodyData/Student_Feature/Frame_Feature_Atten_5')
reference_folder = Path('../../DataFolder/ProsodyData/Siri_Feature/')

# Split each filename once; field 1 is the student id, field 3 the passage id.
name_fields = data_table['filename'].str.split('_')
parsed_student_id = name_fields.str[1].astype(int)
# A passage id has only 5 digits; any leading digits are removed by the modulo.
parsed_passage_id = name_fields.str[3].astype(int) % 100000

# One filesystem check per row for the student feature file ...
feature_on_disk = data_table['filename'].map(lambda name: (data_folder / name).exists())
# ... but only one check per distinct passage, since many rows share a passage.
passage_on_disk = {
    pid: (reference_folder / f'{pid}.tfs').exists()
    for pid in parsed_passage_id.unique()
}

data_table['file_exists'] = feature_on_disk.astype(bool)
data_table['passage_file_exists'] = parsed_passage_id.map(passage_on_disk).astype(bool)
# passage_id is recorded only when its reference file exists (NaN otherwise),
# so the column is float here and is cast to int after the missing rows are dropped.
data_table['passage_id'] = parsed_passage_id.where(data_table['passage_file_exists'])
# Student id is recorded for every row.
data_table['student_id'] = parsed_student_id

In [53]:
# Keep only rows whose feature file and passage reference file both exist and
# whose rating is present (not NaN).
usable_rows = (
    (data_table['file_exists'] == True)
    & (data_table['passage_file_exists'] == True)
    & data_table['rating'].notna()
)
# .copy() detaches the filtered frame from the original, so the column
# assignments below write to a real frame instead of a slice (this is what
# triggers pandas' SettingWithCopyWarning otherwise).
data_table = data_table[usable_rows].copy()
# Correct the data type of the id columns now that no NaN passage ids remain.
data_table['passage_id'] = data_table['passage_id'].astype(int)
data_table['student_id'] = data_table['student_id'].astype(int)
# Convert rating to a number; values that cannot be converted become NaN.
data_table['rating'] = pd.to_numeric(data_table['rating'], errors='coerce')
# The existence flags have served their purpose.
data_table = data_table.drop(columns=['file_exists', 'passage_file_exists'])
# dropna() without a subset removes a row if ANY remaining column is NaN,
# which includes ratings that failed the numeric conversion above.
data_table = data_table.dropna()

In [54]:
# (rows, columns) left after dropping missing files and unusable ratings.
data_table.shape

In [55]:
# Collapse the table to one row per filename: mean/min/max/count of the
# ratings, the first passage id (plus how many records had one), and the
# first student id.
aggregations = {
    'rating': ['mean', 'min', 'max', 'count'],
    'passage_id': ['first', 'count'],
    'student_id': 'first',
}
data_table = (
    data_table
    .groupby('filename')
    .agg(aggregations)
    .reset_index()
)

In [56]:
# (rows, columns) after aggregation: one row per filename.
data_table.shape

In [57]:
# Display the aggregated table; its columns are still (name, statistic) pairs here.
data_table

In [58]:
# Flatten the two-level (name, statistic) column index left by groupby/agg into
# single strings such as 'rating_mean'. The guard makes the cell safe to re-run:
# joining an already-flat label would put '_' between its characters.
if isinstance(data_table.columns, pd.MultiIndex):
    data_table.columns = ['_'.join(col).strip() for col in data_table.columns.values]

# Give the columns used downstream their short names. 'filename_' comes from
# joining ('filename', '') after reset_index().
data_table = data_table.rename(
    columns={
        'rating_mean': 'rating',
        'passage_id_first': 'passage_id',
        'student_id_first': 'student_id',
        "filename_": "filename",
        "rating_count": "count"
    }
)

In [59]:
# Ratings start at 1; shift them to start at 0. The mean rating is then put on
# an integer scale by multiplying by 4 (ratings come in 0.25 steps);
# astype(int) truncates any fractional part that remains.
data_table['rating'] = data_table['rating'].sub(1).mul(4).astype(int)
# Max and min are only shifted to start at 0; they keep their 0.25-step scale.
data_table['rating_max'] = data_table['rating_max'].sub(1)
data_table['rating_min'] = data_table['rating_min'].sub(1)

In [63]:
# Show files whose highest and lowest rating differ by more than 1.
rating_spread = data_table['rating_max'] - data_table['rating_min']
data_table[rating_spread > 1]

In [27]:
# Keep only files rated more than once whose ratings are not abnormal,
# i.e. max - min is at most 1.
multiply_rated = data_table['count'] > 1
within_one_point = (data_table['rating_max'] - data_table['rating_min']) <= 1
data_table = data_table[multiply_rated & within_one_point]

In [28]:
# 0.8 train/val split keyed on student id: a student may have multiple
# recordings, but a student in the train set must not appear in the val set.
# The list holds one id per row of data_table, so split_id is the id found
# 80% of the way through the sorted list.
student_ids = sorted(data_table['student_id'].tolist())
cut_position = int(len(student_ids) * 0.8)
split_id = student_ids[cut_position]
print(f'Split id is {split_id}')

In [29]:
# Build a coarse-scale view of the ratings: max, min and mean are each rounded
# after subtracting 0.15, and the unrounded values are kept next to them.
mean_on_rating_scale = data_table['rating'].astype(float)
newDf = pd.DataFrame({
    'rating_max': (data_table['rating_max'] - 0.15).apply(round),
    'rating_min': (data_table['rating_min'] - 0.15).apply(round),
    # 'rating' was multiplied by 4 earlier, so divide it back before rounding.
    'rating': mean_on_rating_scale.apply(lambda x: round( x / 4- 0.15)),
    'original_rating_max': data_table['rating_max'],
    'original_rating_min': data_table['rating_min'],
    'original_rating': mean_on_rating_scale / 4,
    'count': data_table['count'],
    'student_id': data_table['student_id'],
})

In [30]:
# Agreement between the lowest and highest rating on the rounded scale,
# restricted to files rated more than once from students above the split id.
held_out = (newDf['count'] > 1) & (newDf['student_id'] > split_id)
newDf = newDf[held_out]
lowest, highest = newDf['rating_min'], newDf['rating_max']
q_kappa = cohen_kappa_score(lowest, highest, weights='quadratic')
l_kappa = cohen_kappa_score(lowest, highest, weights='linear')
accuracy = accuracy_score(lowest, highest)
print(f'4 Class: Quadratic Kappa: {q_kappa}, Linear Kappa: {l_kappa}, Accuracy: {accuracy}')

In [31]:
# Put rating_min and rating_max on the integer (x4) scale for the Cohen kappa
# computation. assign() returns a new frame, so data_table itself is untouched.
data_table_scaled = data_table.assign(
    rating_max=(data_table['rating_max'] * 4).astype(int),
    rating_min=(data_table['rating_min'] * 4).astype(int),
)

In [32]:
# Same agreement metrics on the fine (x4) scale, again restricted to files
# rated more than once from students above the split id.
held_out = (data_table_scaled['count'] > 1) & (data_table_scaled['student_id'] > split_id)
data_table_scaled = data_table_scaled[held_out]
lowest, highest = data_table_scaled['rating_min'], data_table_scaled['rating_max']
q_kappa = cohen_kappa_score(lowest, highest, weights='quadratic')
l_kappa = cohen_kappa_score(lowest, highest, weights='linear')
accuracy = accuracy_score(lowest, highest)
print(f'13 Class: Quadratic Kappa: {q_kappa}, Linear Kappa: {l_kappa}, Accuracy: {accuracy}')

In [33]:
# Shape of the subset whose student id is at or below split_id.
data_table.loc[data_table['student_id'] <= split_id].shape

In [34]:
# Shape of the subset whose student id is strictly above split_id. The strict
# '>' matches the filter used in the kappa cells and keeps the boundary student
# (already counted on the '<=' side) from being counted twice.
data_table[data_table['student_id'] > split_id].shape

In [35]:
# Ratio of rows whose student id is at or below split_id.
rows_at_or_below_split = data_table[data_table['student_id'] <= split_id]
rows_at_or_below_split.shape[0] / data_table.shape[0]

In [36]:
# Build a serialized tf.train.Example for one row of the table.
# rating, passage_id and student_id are stored as int64; rating_max and
# rating_min as float; filename as bytes.
def serialize_example(filename, rating, rating_max, rating_min, passage_id, student_id, **kwargs):
    """Pack one record into a serialized ``tf.train.Example``.

    Args:
        filename: Feature file name as ``str``; stored as its encoded bytes.
        rating: Integer rating, stored in an int64 list.
        rating_max: Highest rating, stored in a float list.
        rating_min: Lowest rating, stored in a float list.
        passage_id: Integer passage id, stored in an int64 list.
        student_id: Integer student id, stored in an int64 list.
        **kwargs: Accepted and ignored, so a whole table row can be unpacked
            into the call even though it has extra columns.

    Returns:
        The serialized example as returned by ``SerializeToString()``.
    """
    def _bytes_feature(value):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    def _int64_feature(value):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    def _float_feature(value):
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

    features = tf.train.Features(feature={
        'filename': _bytes_feature(filename.encode()),
        'rating': _int64_feature(rating),
        'rating_max': _float_feature(rating_max),
        'rating_min': _float_feature(rating_min),
        'passage_id': _int64_feature(passage_id),
        'student_id': _int64_feature(student_id),
    })
    return tf.train.Example(features=features).SerializeToString()

In [37]:
# Take the first row of data_table as a sample record for a serialization test.
data_sample = data_table.iloc[0]

In [38]:
# Unpack the sample row as keyword arguments; columns that serialize_example
# does not name (e.g. 'count') are absorbed by its **kwargs.
exp = serialize_example(**data_sample)

In [39]:
# Schema used to parse a serialized example back into its fields: every
# feature is a single fixed-length value of the listed dtype.
feature_description = dict(
    filename=tf.io.FixedLenFeature([], tf.string),
    rating=tf.io.FixedLenFeature([], tf.int64),
    rating_max=tf.io.FixedLenFeature([], tf.float32),
    rating_min=tf.io.FixedLenFeature([], tf.float32),
    passage_id=tf.io.FixedLenFeature([], tf.int64),
    student_id=tf.io.FixedLenFeature([], tf.int64),
)

In [40]:
def _parse_function(example_proto):
    """Parse one serialized ``tf.train.Example``.

    Args:
        example_proto: A serialized example, such as the output of
            ``serialize_example``.

    Returns:
        The result of ``tf.io.parse_single_example`` using the module-level
        ``feature_description`` as the schema.
    """
    parsed_features = tf.io.parse_single_example(example_proto, feature_description)
    return parsed_features

In [41]:
# Round-trip check: parse the serialized sample back and display the result.
_parse_function(exp)

In [42]:
# Save the filtered, aggregated table to CSV without the index column.
data_table.to_csv('../../DataFolder/ProsodyData/data_valid.csv', index=False)

In [43]:
# Passage ids tested for membership by the (currently commented-out) TFRecord
# writer cell to decide which rows go to the eval file.
passage_eval_set = {
    22048, 22084,
    23041, 23043,
    32035, 32038, 32045, 32052,
    33006, 33018, 33033,
    42086, 42145,
    43001, 43017,
}

In [44]:
# Output locations of the eval and train TFRecord files; both live in the
# same directory.
record_dir = Path('../../DataFolder/ProsodyData/Score_Record/leave_out_H')
eval_path = record_dir / 'eval.tfrecord'
train_path = record_dir / 'train.tfrecord'

In [45]:
# Make sure the directory of each output file exists (no error if it already does).
for record_path in (eval_path, train_path):
    record_path.parent.mkdir(parents=True, exist_ok=True)

In [46]:
# # Iterate over data_table and write to TFRecord: open both the train and eval writers and write each row to the corresponding file
# count_train = 0
# count_eval = 0
# with tf.io.TFRecordWriter(eval_path.as_posix()) as eval_writer, tf.io.TFRecordWriter(train_path.as_posix()) as train_writer:
#     for index, row in data_table.iterrows():
#         example = serialize_example(**row)
#         if row['student_id'] <= split_id and row['passage_id'] not in passage_eval_set:
#             train_writer.write(example)
#             count_train += 1
#         elif row['student_id'] > split_id and row['passage_id'] in passage_eval_set:
#             eval_writer.write(example)
#             count_eval += 1
# 
# print(f'Write {count_train} train records and {count_eval} eval records to tfrecord file')

In [47]:
# Final look at the per-file table (the same frame that was saved to data_valid.csv).
data_table