In this document, we first extract learners' mind-wandering from the json data. Then we map the data on the timeline. We process the mind-wandering data generated by reporting and questions seperately.

## 1. Data Reading
In this step, we read gaze data of Tobii from a tsv file and mind-wandering reports from a json file.

In [11]:
import pandas as pd
import numpy as np
from pandas import DataFrame

# folderpath_webgazerdata_rprocessed = "../Data_Publish/Data_WebGazer/Data_ProcessedByScript02"
folderpath_webgazerdata_rprocessed = "../Data_Publish/Data_Tobii/Data_ProcessedByScript02"
folderpath_webgazerdata = "../Data_Publish/Data_Event"

### 1.1 Reading Mind-Wandering Data
We only read some important data columns from the json file.

1. Rating
2. Bell Rings

In [12]:
import json
import os

list_filepath_report = []
list_id_report = []
for file in os.listdir(folderpath_webgazerdata):
    if file.endswith(".json"):
        list_filepath_report.append(os.path.join(folderpath_webgazerdata, file))
        list_id_report.append(file[0:-5])
        
print list_filepath_report
print list_id_report

['../Data_Publish/Data_Event/Anon01.json', '../Data_Publish/Data_Event/Anon02.json', '../Data_Publish/Data_Event/Anon03.json', '../Data_Publish/Data_Event/Anon04.json', '../Data_Publish/Data_Event/Anon05.json', '../Data_Publish/Data_Event/Anon06.json', '../Data_Publish/Data_Event/Anon07.json', '../Data_Publish/Data_Event/Anon08.json', '../Data_Publish/Data_Event/Anon09.json', '../Data_Publish/Data_Event/Anon10.json', '../Data_Publish/Data_Event/Anon11.json', '../Data_Publish/Data_Event/Anon12.json', '../Data_Publish/Data_Event/Anon13.json']
['Anon01', 'Anon02', 'Anon03', 'Anon04', 'Anon05', 'Anon06', 'Anon07', 'Anon08', 'Anon09', 'Anon10', 'Anon11', 'Anon12', 'Anon13']


#### Data Format 
id, starttime_iso, endtime_iso, starttime_video, endtime_video, video_length, label

In [13]:
import datetime

df_reports = pd.DataFrame()

for i in range(0, len(list_id_report)):
    # id
    id_str = list_id_report[i]
    print id_str
    # starttime_iso, endtime_iso, starttime_video, endtime_video, video_length, label
    with open(list_filepath_report[i]) as file_json_data:
        json_data = json.load(file_json_data)
        # print(json_data['activity'])
        
        # TODO: full screen playing info
        fullscreen_list = []
        fullscreen_temp = {'enter': '', 'exit': ''}
        for videostatus in json_data['user']['videostatus']:
            if videostatus['status'] == 'Fullscreen_enter':
                fullscreen_temp = {'enter': '', 'exit': ''}
                fullscreen_temp['enter'] = videostatus['time']
            elif videostatus['status'] == 'Fullscreen_exit':
                if (fullscreen_temp['enter'] != ''):
                    fullscreen_temp['exit'] = videostatus['time']
                    fullscreen_list.append(fullscreen_temp)
                    fullscreen_temp = {'enter': '', 'exit': ''}
            elif videostatus['status'] == 'ENDED':
                if (fullscreen_temp['enter'] != ''):
                    fullscreen_temp['exit'] = videostatus['time']
                    fullscreen_list.append(fullscreen_temp)
                    fullscreen_temp = {'enter': '', 'exit': ''}           
        
        print fullscreen_list
        
        pre_video_id = ""
        video_order = 1
        
        for bell in json_data['user']['ratingbells']:
            ## End time is the time when bell rings
            endtime_iso = bell['time']
            # print endtime_iso
            endtime_iso_datetime = datetime.datetime.strptime(endtime_iso, "%Y-%m-%dT%H:%M:%S.%fZ")
            ## Start time is 30 sec before the end time
            starttime_iso_datetime = endtime_iso_datetime - datetime.timedelta(seconds=30)
            (dt, micro) = starttime_iso_datetime.strftime('%Y-%m-%dT%H:%M:%S.%f').split('.')
            starttime_iso = "%s.%03dZ" % (dt, int(micro) / 1000)
            # print starttime_iso
            
            endtime_video = bell['videoTime']
            # print endtime_video
            ## There is no stop in last 30 sec before the bell rings. 
            ## Since each time the video playing starts, they will ring the bell after 30 sec.
            starttime_video = str(float(endtime_video) - 30)
            # print starttime_video
            
            video_length = bell['videoDuration']
            # print video_length

            video_id = ""           
            if float(video_length) < 420:
                video_id = "Nuclear"  
            else:
                video_id = "Solar"
            
            if pre_video_id == "":
                video_order = 1
                pre_video_id = video_id
            elif video_id != pre_video_id:
                video_order = video_order + 1
                pre_video_id = video_id
            
            label = 0
            for rating in json_data['user']['ratings']:
                ratingtime_iso = rating['time']
                # print ratingtime_iso
                ratingtime_iso_datetime = datetime.datetime.strptime(ratingtime_iso, "%Y-%m-%dT%H:%M:%S.%fZ")
                if ratingtime_iso_datetime > endtime_iso_datetime and ratingtime_iso_datetime < endtime_iso_datetime + datetime.timedelta(seconds=10):
                    label = 1
                    break
            # print label          
            
            fullscreen_flag = 0
            for fullscreen_play in fullscreen_list:
                if (starttime_iso > fullscreen_play['enter']) and (endtime_iso < fullscreen_play['exit']):
                    fullscreen_flag = 1
                    break
            
            ## Add data into dataframe
            df_reports = df_reports.append({'id': id_str,
                                            'video_id': video_id,
                                            'video_order': video_order,
                                            'starttime_iso': starttime_iso, 
                                            'endtime_iso': endtime_iso,
                                            'starttime_video': starttime_video,
                                            'endtime_video': endtime_video,
                                            'video_length': video_length,
                                            'label': label,
                                            'fullscreen': fullscreen_flag
                                           }, 
                                           ignore_index=True)
print df_reports.head(10)
print df_reports.shape
print df_reports[df_reports['label'] == 1].shape
print df_reports[df_reports['label'] == 0].shape
print df_reports.video_order.values
print df_reports.video_id.values

Anon01
[]
Anon02
[{'exit': u'2017-04-12T11:28:33.178Z', 'enter': u'2017-04-12T11:20:41.610Z'}, {'exit': u'2017-04-12T11:40:20.780Z', 'enter': u'2017-04-12T11:33:36.025Z'}]
Anon03
[]
Anon04
[{'exit': u'2017-04-06T11:53:39.676Z', 'enter': u'2017-04-06T11:45:46.491Z'}, {'exit': u'2017-04-06T12:05:42.023Z', 'enter': u'2017-04-06T11:58:57.059Z'}]
Anon05
[]
Anon06
[]
Anon07
[{'exit': u'2017-04-13T12:45:32.365Z', 'enter': u'2017-04-13T12:38:52.824Z'}, {'exit': u'2017-04-13T12:59:46.088Z', 'enter': u'2017-04-13T12:51:58.970Z'}]
Anon08
[]
Anon09
[]
Anon10
[]
Anon11
[]
Anon12
[{'exit': u'2017-04-12T15:37:00.416Z', 'enter': u'2017-04-12T15:30:23.180Z'}, {'exit': u'2017-04-12T15:48:18.222Z', 'enter': u'2017-04-12T15:40:29.354Z'}]
Anon13
[{'exit': u'2017-04-12T14:32:31.031Z', 'enter': u'2017-04-12T14:24:43.184Z'}, {'exit': u'2017-04-12T14:43:27.772Z', 'enter': u'2017-04-12T14:36:46.908Z'}]
                endtime_iso       endtime_video  fullscreen      id  label  \
0  2017-04-12T09:28:47.772Z     

### 1.1 Reading WebGazer Gaze Data
We only read some important data columns from the tsv file.

## Feature Extraction

In [14]:
# Data format: id, starttime_iso, endtime_iso, feature 1, feature 2.......
import math
from scipy.stats import kurtosis
from scipy.stats import skew

df_features = pd.DataFrame()

for i in range(0, len(list_id_report)):
    # id
    id_str = list_id_report[i]
    print id_str
    df_reports_withid = df_reports.loc[df_reports['id'] == id_str]
    
    ## Read the tsv file based on id_str
    path_gazedata_rprocessed = os.path.join(folderpath_webgazerdata_rprocessed, id_str + ".csv")
    print path_gazedata_rprocessed
    df_GazeData_rprocessed = DataFrame.from_csv(path_gazedata_rprocessed, sep=",")
    # print df_GazeData_Tobii.head(5)
    df_GazeData_rprocessed = df_GazeData_rprocessed.reset_index()
    
    ## Remove unnessesary data   
    df_GazeData_rprocessed = df_GazeData_rprocessed[['Timestamp_utc',
                                                     'FixationIndex',
                                                     'GazeEventDuration',
                                                     'FixationPointX..MCSpx.',
                                                     'FixationPointY..MCSpx.',
                                                     'AbsoluteSaccadicDirection']]
    df_GazeData_rprocessed.columns = ["Timestamp_utc", 
                                      "FixationIndex", 
                                      "GazeEventDuration", 
                                      "FixationPointX (MCSpx)", 
                                      "FixationPointY (MCSpx)",
                                      "AbsoluteSaccadicDirection"]
    
    # TODO:change the format of Timestamp_utc
    def timestamp_trans(row):
        temp_str = row['Timestamp_utc']
        temp_str = temp_str+"Z"
        temp_str = temp_str.replace(" ", "T")
        return temp_str
    
    df_GazeData_rprocessed['Timestamp_utc'] = df_GazeData_rprocessed.apply(timestamp_trans,axis=1)

    df_GazeData_Tobii = df_GazeData_rprocessed
    
    for index, row in df_reports_withid.iterrows():
        starttime_iso = row['starttime_iso']
        print starttime_iso
        endtime_iso = row['endtime_iso']
        print endtime_iso
        
        ## Select Data from df_GazeData_Tobii based on starttime_iso and endtime_iso
        df_GazeData_Tobii_selected = df_GazeData_Tobii.loc[((df_GazeData_Tobii['Timestamp_utc'] >= starttime_iso) &
                                                           (df_GazeData_Tobii['Timestamp_utc'] <= endtime_iso))
                                                          ]
        
        # print df_GazeData_Tobii_selected.head(20)
        # print df_GazeData_Tobii_selected.shape
        # print df_GazeData_Tobii_selected.columns
        
        ## Global Features: Feature Selection based on selected data
        temp_fixationindex = 0
        temp_timestamp = ""
        temp_FixationPointX = 0
        temp_FixationPointY = 0
        list_fixationduration = []
        list_saccadeduration = []
        list_saccadedistance = []
        list_saccadeangel = []
        
        ## Local Features: Feature Selection based on selected data
        
        # TODO: get the info about the video and fullscreen playing
        video_length = row['video_length']
        fullscreen_flag = row['fullscreen']
        
        face_topleft_x = 0
        face_topleft_y = 0
        face_bottomright_x = 0
        face_bottomright_y = 0
                
        slide_topleft_x = 0
        slide_topleft_y = 0
        slide_bottomright_x = 0
        slide_bottomright_y = 0
                
        subtitle_topleft_x = 0
        subtitle_topleft_y = 0
        subtitle_bottomright_x = 0
        subtitle_bottomright_y = 0
        
        # Select coordinate boundaries for the 
        if video_length <= 410: # Nucl should be replaced by video_id
            if fullscreen_flag:
                
                face_topleft_x = 1191
                face_topleft_y = 239
                face_bottomright_x = 1191+331
                face_bottomright_y = 239+280
                
                slide_topleft_x = 137
                slide_topleft_y = 132
                slide_bottomright_x = 137+929
                slide_bottomright_y = 132+557
                
                subtitle_topleft_x = 402
                subtitle_topleft_y = 892
                subtitle_bottomright_x = 402+1134
                subtitle_bottomright_y = 892+134
            
            else:
                
                face_topleft_x = 1089
                face_topleft_y = 297
                face_bottomright_x = 1089+139
                face_bottomright_y = 297+141
                
                slide_topleft_x = 558
                slide_topleft_y = 246
                slide_bottomright_x = 558+456
                slide_bottomright_y = 246+273
                
                subtitle_topleft_x = 721
                subtitle_topleft_y = 617
                subtitle_bottomright_x = 721+478
                subtitle_bottomright_y = 617+67
        
        elif video_length > 450: # Solar
            
            if fullscreen_flag:
                
                face_topleft_x = 501
                face_topleft_y = 189
                face_bottomright_x = 501+267
                face_bottomright_y = 189+260
                
                slide_topleft_x = 861
                slide_topleft_y = 313
                slide_bottomright_x = 861+811
                slide_bottomright_y = 313+483
                
                subtitle_topleft_x = 458
                subtitle_topleft_y = 900
                subtitle_bottomright_x = 458+1010
                subtitle_bottomright_y = 900+128
            
            else:
                
                face_topleft_x = 721
                face_topleft_y = 263
                face_bottomright_x = 721+146
                face_bottomright_y = 263+139
                
                slide_topleft_x = 913
                slide_topleft_y = 330
                slide_bottomright_x = 913+401
                slide_bottomright_y = 330+241
                
                subtitle_topleft_x = 709
                subtitle_topleft_y = 622
                subtitle_bottomright_x = 709+499
                subtitle_bottomright_y = 622+60
        
        ## Define a basic funtion for calculating whether fixations in aoi or not.
        def isinaoi(fixation_x, fixation_y):            
            if ((fixation_x >= face_topleft_x and fixation_x <= face_bottomright_x) and 
               (fixation_y >= face_topleft_y and fixation_y <= face_bottomright_y)):
                return "face"
            elif ((fixation_x >= subtitle_topleft_x and fixation_x <= subtitle_bottomright_x) and 
               (fixation_y >= subtitle_topleft_y and fixation_y <= subtitle_bottomright_y)):
                return "subtitle"
            elif ((fixation_x >= slide_topleft_x and fixation_x <= slide_bottomright_x) and 
               (fixation_y >= slide_topleft_y and fixation_y <= slide_bottomright_y)): 
                return "slide"
            else:
                return "out"
        
        ## Define local features
        
        # num of saccade jump from one area to another
        num_saccade_aoi_face_out2in = 0
        num_saccade_aoi_face_aoi2in = 0
        num_saccade_aoi_face_in2out = 0
        num_saccade_aoi_face_in2aoi = 0
        num_saccade_aoi_face_within = 0
        
        num_saccade_aoi_slide_out2in = 0
        num_saccade_aoi_slide_aoi2in = 0
        num_saccade_aoi_slide_in2out = 0
        num_saccade_aoi_slide_in2aoi = 0
        num_saccade_aoi_slide_within = 0
        
        num_saccade_aoi_subtitle_out2in = 0
        num_saccade_aoi_subtitle_aoi2in = 0
        num_saccade_aoi_subtitle_in2out = 0
        num_saccade_aoi_subtitle_in2aoi = 0
        num_saccade_aoi_subtitle_within = 0
        
        temp_aoi = "out"
        
        # numbers and durations of fixations in AOIs.
        list_duration_fixation_aoi_face = []
        list_duration_fixation_aoi_subtitle = []
        list_duration_fixation_aoi_slide = []
        # fixations out of AOIs
        list_duration_fixation_aoi_out = []
        
        for index, row in df_GazeData_Tobii_selected.iterrows():
            if np.isnan(row['FixationIndex']):
                continue
            
            if temp_fixationindex == 0:
                temp_fixationindex = row['FixationIndex']
                temp_timestamp = row['Timestamp_utc']
                temp_FixationPointX = row['FixationPointX (MCSpx)']
                temp_FixationPointY = row['FixationPointY (MCSpx)']
                
                list_fixationduration.append(row['GazeEventDuration'])
                list_saccadeangel.append(row['AbsoluteSaccadicDirection'])
                
                ## calculate local features
                current_aoi = isinaoi(row['FixationPointX (MCSpx)'], row['FixationPointY (MCSpx)'])
                if current_aoi == "face":
                    list_duration_fixation_aoi_face.append(row['GazeEventDuration'])
                elif current_aoi == "subtitle":
                    list_duration_fixation_aoi_subtitle.append(row['GazeEventDuration'])
                elif current_aoi == "slide":
                    list_duration_fixation_aoi_slide.append(row['GazeEventDuration'])
                else:
                    list_duration_fixation_aoi_out.append(row['GazeEventDuration'])                
                temp_aoi = current_aoi
            
            elif temp_fixationindex != row['FixationIndex']:
                
                # Global features
                temp_fixationindex = row['FixationIndex']
                list_fixationduration.append(row['GazeEventDuration'])
                list_saccadeangel.append(row['AbsoluteSaccadicDirection'])
                
                datetime_previous = datetime.datetime.strptime(temp_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")
                datetime_current = datetime.datetime.strptime(row['Timestamp_utc'], "%Y-%m-%dT%H:%M:%S.%fZ")
                saccadeduration = datetime_current - datetime_previous
                list_saccadeduration.append(float(saccadeduration.total_seconds() * 1000))
                
                FixationPointX_current = row['FixationPointX (MCSpx)']
                FixationPointY_current = row['FixationPointY (MCSpx)']
                saccadedistance = math.sqrt(math.pow((FixationPointX_current - temp_FixationPointX), 2) + 
                                            math.pow((FixationPointY_current - temp_FixationPointY), 2))
                list_saccadedistance.append(saccadedistance)
                
                temp_timestamp = row['Timestamp_utc']
                temp_FixationPointX = row['FixationPointX (MCSpx)']
                temp_FixationPointY = row['FixationPointY (MCSpx)']
                
                # Local features
                current_aoi = isinaoi(row['FixationPointX (MCSpx)'], row['FixationPointY (MCSpx)'])
                
                if current_aoi == "face":
                    
                    list_duration_fixation_aoi_face.append(row['GazeEventDuration'])
                    
                    if temp_aoi == "face":
                        num_saccade_aoi_face_within = num_saccade_aoi_face_within + 1
                    elif temp_aoi == "out":
                        num_saccade_aoi_face_out2in = num_saccade_aoi_face_out2in + 1
                    else:
                        num_saccade_aoi_face_aoi2in = num_saccade_aoi_face_aoi2in + 1
                        if temp_aoi == "slide":
                            num_saccade_aoi_slide_in2aoi = num_saccade_aoi_slide_in2aoi + 1
                        else:
                            num_saccade_aoi_subtitle_in2aoi = num_saccade_aoi_subtitle_in2aoi + 1
                        
                elif current_aoi == "subtitle":
                    
                    list_duration_fixation_aoi_subtitle.append(row['GazeEventDuration'])
                    
                    if temp_aoi == "subtitle":
                        num_saccade_aoi_subtitle_within = num_saccade_aoi_subtitle_within + 1
                    elif temp_aoi == "out":
                        num_saccade_aoi_subtitle_out2in = num_saccade_aoi_subtitle_out2in + 1
                    else:
                        num_saccade_aoi_subtitle_aoi2in = num_saccade_aoi_subtitle_aoi2in + 1
                        if temp_aoi == "face":
                            num_saccade_aoi_face_in2aoi = num_saccade_aoi_face_in2aoi + 1
                        else:
                            num_saccade_aoi_slide_in2aoi = num_saccade_aoi_slide_in2aoi + 1
                    
                elif current_aoi == "slide":
                    list_duration_fixation_aoi_slide.append(row['GazeEventDuration'])
                    
                    if temp_aoi == "slide":
                        num_saccade_aoi_slide_within = num_saccade_aoi_slide_within + 1
                    elif temp_aoi == "out":
                        num_saccade_aoi_slide_out2in = num_saccade_aoi_slide_out2in + 1
                    else:
                        num_saccade_aoi_slide_aoi2in = num_saccade_aoi_slide_aoi2in + 1
                        if temp_aoi == "face":
                            num_saccade_aoi_face_in2aoi = num_saccade_aoi_face_in2aoi + 1
                        else:
                            num_saccade_aoi_subtitle_in2aoi = num_saccade_aoi_subtitle_in2aoi + 1
                    
                else:
                    list_duration_fixation_aoi_out.append(row['GazeEventDuration'])
                    if temp_aoi == "slide":
                        num_saccade_aoi_slide_in2out = num_saccade_aoi_slide_in2out + 1
                    elif temp_aoi == "face":
                        num_saccade_aoi_face_in2out = num_saccade_aoi_face_in2out + 1
                    elif temp_aoi == "subtitle":
                        num_saccade_aoi_subtitle_in2out = num_saccade_aoi_subtitle_in2out + 1
                 
                temp_aoi = current_aoi
            
            else:
                temp_timestamp = row['Timestamp_utc']
    
        
        num_saccade_horizon = sum(1 for i in list_saccadeangel if ((i <= 30 and i >= -30) or (i >= 150 and i <= 210) or (i >= 330)))
#         print num_saccade_horizon
#         print len(list_fixationduration)
#         print len(list_saccadeduration)
#         print len(list_saccadedistance)
#         print len(list_saccadeangel)
#         print len(list_duration_fixation_aoi_face)
#         print len(list_duration_fixation_aoi_subtitle)
#         print len(list_duration_fixation_aoi_slide)
#         print len(list_duration_fixation_aoi_out)
        
        duration_fixation_aoi_face = 0
        duration_fixation_aoi_face_max = 0
        duration_fixation_aoi_subtitle = 0
        duration_fixation_aoi_subtitle_max = 0
        duration_fixation_aoi_slide = 0
        duration_fixation_aoi_slide_max = 0
        duration_fixation_aoi_out = 0
        duration_fixation_aoi_out_max = 0 
        
        if len(list_duration_fixation_aoi_face) != 0:
            duration_fixation_aoi_face = sum(list_duration_fixation_aoi_face)/sum(list_fixationduration)
            duration_fixation_aoi_face_max = np.max(list_duration_fixation_aoi_face)
        if len(list_duration_fixation_aoi_subtitle) != 0:
            duration_fixation_aoi_subtitle = sum(list_duration_fixation_aoi_subtitle)/sum(list_fixationduration)
            duration_fixation_aoi_subtitle_max = np.max(list_duration_fixation_aoi_subtitle)
        if len(list_duration_fixation_aoi_slide) != 0:
            duration_fixation_aoi_slide = sum(list_duration_fixation_aoi_slide)/sum(list_fixationduration)
            duration_fixation_aoi_slide_max = np.max(list_duration_fixation_aoi_slide)
        if len(list_duration_fixation_aoi_out) != 0:
            duration_fixation_aoi_out = sum(list_duration_fixation_aoi_out)/sum(list_fixationduration)
            duration_fixation_aoi_out_max = np.max(list_duration_fixation_aoi_out)
        
        ## Add features into df_features
        df_features = df_features.append({
                'id': id_str, 
                'starttime_iso': starttime_iso, 
                'endtime_iso': endtime_iso,
                'fixationduration_min': np.min(list_fixationduration),
                'fixationduration_max': np.max(list_fixationduration),
                'fixationduration_mean': np.mean(list_fixationduration),
                'fixationduration_median': np.median(list_fixationduration),
                'fixationduration_stddev': np.std(list_fixationduration),
                'fixationduration_range': np.max(list_fixationduration) - np.min(list_fixationduration),
                'fixationduration_kurtosis': kurtosis(list_fixationduration),
                'fixationduration_skew': skew(list_fixationduration),
                'saccadeduration_min': np.min(list_saccadeduration),
                'saccadeduration_max': np.max(list_saccadeduration),
                'saccadeduration_mean': np.mean(list_saccadeduration),
                'saccadeduration_median': np.median(list_saccadeduration),
                'saccadeduration_stddev': np.std(list_saccadeduration),
                'saccadeduration_range': np.max(list_saccadeduration) - np.min(list_saccadeduration),
                'saccadeduration_kurtosis': kurtosis(list_saccadeduration),
                'saccadeduration_skew': skew(list_saccadeduration),
                'saccadedistance_min': np.min(list_saccadedistance),
                'saccadedistance_max': np.max(list_saccadedistance),
                'saccadedistance_mean': np.mean(list_saccadedistance),
                'saccadedistance_median': np.median(list_saccadedistance),
                'saccadedistance_stddev': np.std(list_saccadedistance),
                'saccadedistance_range': np.max(list_saccadedistance) - np.min(list_saccadedistance),
                'saccadedistance_kurtosis': kurtosis(list_saccadedistance),
                'saccadedistance_skew': skew(list_saccadedistance),
                'saccadeangel_min': np.min(list_saccadeangel),
                'saccadeangel_max': np.max(list_saccadeangel),
                'saccadeangel_mean': np.mean(list_saccadeangel),
                'saccadeangel_median': np.median(list_saccadeangel),
                'saccadeangel_stddev': np.std(list_saccadeangel),
                'saccadeangel_range': np.max(list_saccadeangel) - np.min(list_saccadeangel),
                'saccadeangel_kurtosis': kurtosis(list_saccadeangel),
                'saccadeangel_skew': skew(list_saccadeangel),
                'saccade_num': len(list_saccadeduration),
                'saccade_horizonratio': float(num_saccade_horizon)/len(list_saccadeangel),
                'fixation_saccade_ratio': sum(list_fixationduration)/sum(list_saccadeduration),
                ## LOCAL FEATURES
                'num_saccade_aoi_face_out2in': float(num_saccade_aoi_face_out2in)/len(list_saccadeduration),
                'num_saccade_aoi_face_aoi2in': float(num_saccade_aoi_face_aoi2in)/len(list_saccadeduration),
                'num_saccade_aoi_face_in2out': float(num_saccade_aoi_face_in2out)/len(list_saccadeduration),
                'num_saccade_aoi_face_in2aoi': float(num_saccade_aoi_face_in2aoi)/len(list_saccadeduration),
                'num_saccade_aoi_face_within': float(num_saccade_aoi_face_within)/len(list_saccadeduration),
                'num_saccade_aoi_slide_out2in': float(num_saccade_aoi_slide_out2in)/len(list_saccadeduration),
                'num_saccade_aoi_slide_aoi2in': float(num_saccade_aoi_slide_aoi2in)/len(list_saccadeduration),
                'num_saccade_aoi_slide_in2out': float(num_saccade_aoi_slide_in2out)/len(list_saccadeduration),
                'num_saccade_aoi_slide_in2aoi': float(num_saccade_aoi_slide_in2aoi)/len(list_saccadeduration),
                'num_saccade_aoi_slide_within': float(num_saccade_aoi_slide_within)/len(list_saccadeduration),
                'num_saccade_aoi_subtitle_out2in': float(num_saccade_aoi_subtitle_out2in)/len(list_saccadeduration),
                'num_saccade_aoi_subtitle_aoi2in': float(num_saccade_aoi_subtitle_aoi2in)/len(list_saccadeduration),
                'num_saccade_aoi_subtitle_in2out': float(num_saccade_aoi_subtitle_in2out)/len(list_saccadeduration),
                'num_saccade_aoi_subtitle_in2aoi': float(num_saccade_aoi_subtitle_in2aoi)/len(list_saccadeduration),
                'num_saccade_aoi_subtitle_within': float(num_saccade_aoi_subtitle_within)/len(list_saccadeduration),
                'duration_fixation_aoi_face': duration_fixation_aoi_face,
                'duration_fixation_aoi_face_max': duration_fixation_aoi_face_max,
                'duration_fixation_aoi_subtitle': duration_fixation_aoi_subtitle,
                'duration_fixation_aoi_subtitle_max': duration_fixation_aoi_subtitle_max,
                'duration_fixation_aoi_slide': duration_fixation_aoi_slide,
                'duration_fixation_aoi_slide_max': duration_fixation_aoi_slide_max,
                'duration_fixation_aoi_out': duration_fixation_aoi_out,
                'duration_fixation_aoi_out_max': duration_fixation_aoi_out_max
            }, ignore_index=True)
        
        # print df_features.head(1)
print df_features.shape

Anon01
../Data_Publish/Data_Tobii/Data_ProcessedByScript02/Anon01.csv
2017-04-12T09:28:17.772Z
2017-04-12T09:28:47.772Z
2017-04-12T09:29:04.224Z
2017-04-12T09:29:34.224Z
2017-04-12T09:30:05.276Z
2017-04-12T09:30:35.276Z
2017-04-12T09:31:06.727Z
2017-04-12T09:31:36.727Z
2017-04-12T09:32:03.103Z
2017-04-12T09:32:33.103Z
2017-04-12T09:33:04.025Z
2017-04-12T09:33:34.025Z
2017-04-12T09:33:41.537Z
2017-04-12T09:34:11.537Z
2017-04-12T09:39:59.560Z
2017-04-12T09:40:29.560Z
2017-04-12T09:40:58.162Z
2017-04-12T09:41:28.162Z
2017-04-12T09:41:57.429Z
2017-04-12T09:42:27.429Z
2017-04-12T09:42:59.020Z
2017-04-12T09:43:29.020Z
2017-04-12T09:43:50.584Z
2017-04-12T09:44:20.584Z
2017-04-12T09:44:34.770Z
2017-04-12T09:45:04.770Z
2017-04-12T09:45:16.779Z
2017-04-12T09:45:46.779Z
2017-04-12T09:46:05.371Z
2017-04-12T09:46:35.371Z
2017-04-12T09:46:38.563Z
2017-04-12T09:47:08.563Z
Anon02
../Data_Publish/Data_Tobii/Data_ProcessedByScript02/Anon02.csv
2017-04-12T11:20:52.331Z
2017-04-12T11:21:22.331Z
2017-04-12

2017-04-11T11:01:22.793Z
2017-04-11T11:01:52.793Z
2017-04-11T11:02:05.365Z
2017-04-11T11:02:35.365Z
2017-04-11T11:02:58.654Z
2017-04-11T11:03:28.654Z
2017-04-11T11:04:07.884Z
2017-04-11T11:04:37.884Z
Anon11
../Data_Publish/Data_Tobii/Data_ProcessedByScript02/Anon11.csv
2017-04-06T09:44:01.073Z
2017-04-06T09:44:31.073Z
2017-04-06T09:45:01.704Z
2017-04-06T09:45:31.704Z
2017-04-06T09:45:57.036Z
2017-04-06T09:46:27.036Z
2017-04-06T09:46:34.970Z
2017-04-06T09:47:04.970Z
2017-04-06T09:47:14.614Z
2017-04-06T09:47:44.614Z
2017-04-06T09:47:56.285Z
2017-04-06T09:48:26.285Z
2017-04-06T09:49:03.751Z
2017-04-06T09:49:33.751Z
2017-04-06T09:50:06.501Z
2017-04-06T09:50:36.501Z
2017-04-06T09:50:50.318Z
2017-04-06T09:51:20.318Z
2017-04-06T09:55:47.437Z
2017-04-06T09:56:17.437Z
2017-04-06T09:56:32.515Z
2017-04-06T09:57:02.515Z
2017-04-06T09:57:26.172Z
2017-04-06T09:57:56.172Z
2017-04-06T09:58:21.711Z
2017-04-06T09:58:51.711Z
2017-04-06T09:58:53.203Z
2017-04-06T09:59:23.203Z
2017-04-06T10:00:01.508Z
2017-

In [15]:
## Merge features with labels
df_merge = pd.merge(df_reports, df_features)
# df_merge.to_csv("features_labels_webgazer.csv", index=False)
df_merge.to_csv("features_labels_tobii_sd.csv", index=False)

In [16]:
print df_merge.columns
print df_merge.columns[10:18]
print df_merge.columns[18:27]
print df_merge.columns[27:42]
print df_merge.columns[42:]
print len(df_merge.columns)

Index([u'endtime_iso', u'endtime_video', u'fullscreen', u'id', u'label',
       u'starttime_iso', u'starttime_video', u'video_id', u'video_length',
       u'video_order', u'duration_fixation_aoi_face',
       u'duration_fixation_aoi_face_max', u'duration_fixation_aoi_out',
       u'duration_fixation_aoi_out_max', u'duration_fixation_aoi_slide',
       u'duration_fixation_aoi_slide_max', u'duration_fixation_aoi_subtitle',
       u'duration_fixation_aoi_subtitle_max', u'fixation_saccade_ratio',
       u'fixationduration_kurtosis', u'fixationduration_max',
       u'fixationduration_mean', u'fixationduration_median',
       u'fixationduration_min', u'fixationduration_range',
       u'fixationduration_skew', u'fixationduration_stddev',
       u'num_saccade_aoi_face_aoi2in', u'num_saccade_aoi_face_in2aoi',
       u'num_saccade_aoi_face_in2out', u'num_saccade_aoi_face_out2in',
       u'num_saccade_aoi_face_within', u'num_saccade_aoi_slide_aoi2in',
       u'num_saccade_aoi_slide_in2aoi', u'num

## Generate the results of the baseline
Assumin that we know the mind-wandering rate is 0.29, the baseline is the precision, recall and f1 when we totally guess the result with 0.29 mind-wandering rate.

In [7]:
from random import random, seed
from sklearn.metrics import (precision_score, recall_score, f1_score, accuracy_score)

# read the data
df_merge = pd.read_csv("features_labels_webgazer.csv")
# read the label
y_test = df_merge.ix[:, 4]
print df_merge.shape

repeat_parameter = 10000
seed_parameter = 48 # same as the example in SMOTE

list_test = list()
for i in range(0, repeat_parameter):
    list_test = list_test + list(y_test)

seed(seed_parameter)
list_guess = list()
for i in range(0, len(list_test)):
    temp_random = random()
    if temp_random > 0.29:
        list_guess.append(0)
    else:
        list_guess.append(1)

print "Baseline with 0.29 mind-wandering rate"
print("\tPrecision: %1.3f" % precision_score(list_test, list_guess))
print("\tRecall: %1.3f" % recall_score(list_test, list_guess))
print("\tF1: %1.3f" % f1_score(list_test, list_guess))
print("\tAccuracy: %1.3f\n" % accuracy_score(list_test, list_guess))

(200, 68)
Baseline with 0.29 mind-wandering rate
	Precision: 0.290
	Recall: 0.291
	F1: 0.290
	Accuracy: 0.588



## Prediction With Different Features and Models

Nested cross validation is used in our experiment.
Each time we select data of one participant as test data and the remaining as training data. Then, for the training data we use leave-one-paticipant-out cross validation to selection model and predict the result of the test data.

In [7]:
import pandas as pd
import numpy as np
from pandas import DataFrame

from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import (GridSearchCV, LeaveOneGroupOut)
from sklearn.metrics import (precision_score, recall_score, f1_score, accuracy_score)

# df_merge = pd.read_csv("features_labels_webgazer.csv")
df_merge = pd.read_csv("features_labels_tobii_sd.csv")

# Remove 3 participants since they only have 1 mind-wandering report in their test
df_merge = df_merge[df_merge.id != "Anon06"]
df_merge = df_merge[df_merge.id != "Anon07"]
df_merge = df_merge[df_merge.id != "Anon13"]

list_id_report = list(df_merge['id'].unique())

# Define different features running in the experiments
feature_index_local = [10,11,12,13,14,15,16,17,
                       27,28,29,30,31,32,33,34,35,36,37,38,39,40,41]
feature_index_local_dict = {"feature_name": "Local Features", 
                            "feature_index": feature_index_local}
feature_index_global = [18,19,20,21,22,23,24,25,26,
                        42,43,44,45,46,47,48,49,50,
                        51,52,53,54,55,56,57,58,59,
                        60,61,62,63,64,65,66,67]
feature_index_global_dict = {"feature_name": "Global Features", 
                             "feature_index": feature_index_global}
feature_index_all = range(10, 68)
feature_index_all_dict = {"feature_name": "All Features", 
                          "feature_index": feature_index_all}
featuren_list = [feature_index_all_dict, 
                 feature_index_global_dict, 
                 feature_index_local_dict]

# Define pipelines
sm = SMOTE(random_state=48)

pipe_svm_dict = {"pipe_name": "Linear SVM without SMOTE methods",
                 "pipe": Pipeline([('classify', LinearSVC())]),
                 "p_grid": {"classify__loss": ['hinge', 'squared_hinge'],
                            "classify__tol": [0.1, 0.01, 0.001, 0.0001],
                            "classify__C": [100, 10, 1, 0.1],
                            "classify__class_weight": [None, 'balanced']}
                }
pipe_svm_sm_dict = {"pipe_name": "Linear SVM with SMOTE methods",
                    "pipe": Pipeline([('balanced', sm), 
                                      ('classify', LinearSVC())]),
                    "p_grid": {"classify__loss": ['hinge', 'squared_hinge'],
                               "classify__tol": [0.1, 0.01, 0.001, 0.0001],
                               "classify__C": [100, 10, 1, 0.1],
                               "classify__class_weight": [None, 'balanced']}}
pipe_lr_dict = {"pipe_name": "Logistic Regression without SMOTE methods", 
                "pipe": Pipeline([('classify', LogisticRegression())]),
                "p_grid": {"classify__penalty": ['l1', 'l2'],
                           "classify__tol": [0.1, 0.01, 0.001, 0.0001],
                           "classify__C": [100, 10, 1, 0.1],
                           "classify__class_weight": [None, 'balanced']}
               }
pipe_lr_sm_dict = {"pipe_name": "Logistic Regression with SMOTE methods", 
                   "pipe": Pipeline([('balanced', sm), 
                                     ('classify', LogisticRegression())]),
                   "p_grid": {"classify__penalty": ['l1', 'l2'],
                              "classify__tol": [0.1, 0.01, 0.001, 0.0001],
                              "classify__C": [100, 10, 1, 0.1],
                              "classify__class_weight": [None, 'balanced']}
                  }
pipe_nb_dict = {"pipe_name": "Naive Bayes without SMOTE methods",
                "pipe": Pipeline([('classify', GaussianNB())]),
                "p_grid": {}
               }
pipe_nb_sm_dict = {"pipe_name": "Naive Bayes with SMOTE methods",
                   "pipe": Pipeline([('balanced', sm), 
                                     ('classify', GaussianNB())]),
                   "p_grid": {}
                  }

pipe_list = [pipe_svm_dict, 
             pipe_svm_sm_dict, 
             pipe_lr_dict, 
             pipe_lr_sm_dict, 
             pipe_nb_dict, 
             pipe_nb_sm_dict]

for feature_index_dict in featuren_list:
    featurn_name = feature_index_dict["feature_name"]
    feature_index = feature_index_dict["feature_index"]
    
    print "-----------------------------------"
    print featurn_name
    print "-----------------------------------"
    
    for pipe_dict in pipe_list:
        print pipe_dict["pipe_name"]

        y_test_total = []
        y_pred_total = []
        f1_list = []
        precision_list = []
        recall_list = []

        ## Leave-one-participant-out machine learning methods (try Logistic Regression first)
        for i in range(0, len(list_id_report)):

            id_str = list_id_report[i]
            print id_str

            # outer cv, split data for the training and test
            data_train = df_merge.ix[df_merge['id'] != id_str]
            data_test = df_merge.ix[df_merge['id'] == id_str]

            # inner cv, data preparation
            X_train = data_train.ix[:, feature_index].fillna(value=0)
            y_train = list(data_train.ix[:, 4])
            group = list(data_train.ix[:, 3])

            # inner cv, we define the pipeline 
            pipe = pipe_dict["pipe"]
            p_grid = pipe_dict["p_grid"]
            inner_cv = LeaveOneGroupOut()

            # inner cv, model selection by gridsearch
            clf = GridSearchCV(estimator=pipe, param_grid=p_grid, cv=inner_cv, scoring='f1')
            clf.fit(X_train, y_train, group)
            print "inner cv parameters: " + str(clf.best_params_)
            print "inner cv f1 score: " + str(clf.best_score_)

            # outer cv, data preparation
            X_test = data_test.ix[:, feature_index].fillna(value=0)
            y_test = data_test.ix[:, 4]

            y_test_total.extend(y_test)

            y_pred = clf.predict(X_test)
            y_pred_total.extend(y_pred)

            print("outter cv results")
            print("\tPrecision: %1.3f" % precision_score(y_test, y_pred))
            print("\tRecall: %1.3f" % recall_score(y_test, y_pred))
            print("\tF1: %1.3f" % f1_score(y_test, y_pred))
            print("\tAccuracy: %1.3f\n" % accuracy_score(y_test, y_pred))
            
            f1_list.append(f1_score(y_test, y_pred))
            precision_list.append(precision_score(y_test, y_pred))
            recall_list.append(recall_score(y_test, y_pred))

        print "Nested cv results"
#         print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_total))
#         print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_total))
#         print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_total))
#         print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_total))
        print("\tF1: %1.3f" % np.mean(f1_list))
        print("\tPrecision: %1.3f" % np.mean(precision_list))
        print("\tRecall: %1.3f" % np.mean(recall_list))

-----------------------------------
All Features
-----------------------------------
Linear SVM without SMOTE methods
Anon01
inner cv parameters: {'classify__C': 0.1, 'classify__tol': 0.001, 'classify__loss': 'squared_hinge', 'classify__class_weight': 'balanced'}
inner cv f1 score: 0.354245730787
outter cv results
	Precision: 0.267
	Recall: 0.800
	F1: 0.400
	Accuracy: 0.250

Anon02
inner cv parameters: {'classify__C': 1, 'classify__tol': 0.001, 'classify__loss': 'squared_hinge', 'classify__class_weight': None}
inner cv f1 score: 0.356826068557
outter cv results
	Precision: 0.000
	Recall: 0.000
	F1: 0.000
	Accuracy: 0.529

Anon03
inner cv parameters: {'classify__C': 100, 'classify__tol': 0.01, 'classify__loss': 'hinge', 'classify__class_weight': None}
inner cv f1 score: 0.439715148607
outter cv results
	Precision: 0.400
	Recall: 1.000
	F1: 0.571
	Accuracy: 0.400

Anon04
inner cv parameters: {'classify__C': 0.1, 'classify__tol': 0.01, 'classify__loss': 'hinge', 'classify__class_weight': 

inner cv parameters: {'classify__C': 0.1, 'classify__tol': 0.1, 'classify__class_weight': None, 'classify__penalty': 'l1'}
inner cv f1 score: 0.385242417157
outter cv results
	Precision: 0.333
	Recall: 0.667
	F1: 0.444
	Accuracy: 0.333

Anon04
inner cv parameters: {'classify__C': 100, 'classify__tol': 0.0001, 'classify__class_weight': None, 'classify__penalty': 'l2'}
inner cv f1 score: 0.385552484097
outter cv results
	Precision: 0.200
	Recall: 0.500
	F1: 0.286
	Accuracy: 0.333

Anon05
inner cv parameters: {'classify__C': 10, 'classify__tol': 0.0001, 'classify__class_weight': None, 'classify__penalty': 'l2'}
inner cv f1 score: 0.398488720437
outter cv results
	Precision: 0.000
	Recall: 0.000
	F1: 0.000
	Accuracy: 0.667

Anon08
inner cv parameters: {'classify__C': 0.1, 'classify__tol': 0.1, 'classify__class_weight': None, 'classify__penalty': 'l1'}
inner cv f1 score: 0.404770013281
outter cv results
	Precision: 0.091
	Recall: 0.500
	F1: 0.154
	Accuracy: 0.267

Anon09
inner cv parameters

inner cv parameters: {'classify__C': 100, 'classify__tol': 0.1, 'classify__loss': 'squared_hinge', 'classify__class_weight': 'balanced'}
inner cv f1 score: 0.45189596854
outter cv results
	Precision: 0.400
	Recall: 1.000
	F1: 0.571
	Accuracy: 0.400

Anon04
inner cv parameters: {'classify__C': 0.1, 'classify__tol': 0.01, 'classify__loss': 'squared_hinge', 'classify__class_weight': None}
inner cv f1 score: 0.479356935214
outter cv results
	Precision: 0.200
	Recall: 0.500
	F1: 0.286
	Accuracy: 0.333

Anon05
inner cv parameters: {'classify__C': 1, 'classify__tol': 0.01, 'classify__loss': 'squared_hinge', 'classify__class_weight': None}
inner cv f1 score: 0.429481952675
outter cv results
	Precision: 0.267
	Recall: 1.000
	F1: 0.421
	Accuracy: 0.267

Anon08
inner cv parameters: {'classify__C': 100, 'classify__tol': 0.0001, 'classify__loss': 'hinge', 'classify__class_weight': None}
inner cv f1 score: 0.548346300955
outter cv results
	Precision: 0.133
	Recall: 1.000
	F1: 0.235
	Accuracy: 0.133


inner cv parameters: {}
inner cv f1 score: 0.319616553659
outter cv results
	Precision: 0.167
	Recall: 0.250
	F1: 0.200
	Accuracy: 0.467

Nested cv results
	F1: 0.296
	Precision: 0.376
	Recall: 0.349
Naive Bayes with SMOTE methods
Anon01
inner cv parameters: {}
inner cv f1 score: 0.374253270539
outter cv results
	Precision: 0.000
	Recall: 0.000
	F1: 0.000
	Accuracy: 0.688

Anon02
inner cv parameters: {}
inner cv f1 score: 0.391872100968
outter cv results
	Precision: 0.500
	Recall: 0.143
	F1: 0.222
	Accuracy: 0.588

Anon03
inner cv parameters: {}
inner cv f1 score: 0.370455298811
outter cv results
	Precision: 0.333
	Recall: 0.667
	F1: 0.444
	Accuracy: 0.333

Anon04
inner cv parameters: {}
inner cv f1 score: 0.332640218367
outter cv results
	Precision: 0.273
	Recall: 0.750
	F1: 0.400
	Accuracy: 0.400

Anon05
inner cv parameters: {}
inner cv f1 score: 0.416506759592
outter cv results
	Precision: 0.125
	Recall: 0.250
	F1: 0.167
	Accuracy: 0.333

Anon08
inner cv parameters: {}
inner cv f1 s

inner cv parameters: {'classify__C': 1, 'classify__tol': 0.0001, 'classify__class_weight': 'balanced', 'classify__penalty': 'l2'}
inner cv f1 score: 0.443620497845
outter cv results
	Precision: 0.143
	Recall: 0.500
	F1: 0.222
	Accuracy: 0.533

Anon09
inner cv parameters: {'classify__C': 0.1, 'classify__tol': 0.0001, 'classify__class_weight': 'balanced', 'classify__penalty': 'l2'}
inner cv f1 score: 0.45969823827
outter cv results
	Precision: 1.000
	Recall: 0.667
	F1: 0.800
	Accuracy: 0.812

Anon10
inner cv parameters: {'classify__C': 1, 'classify__tol': 0.0001, 'classify__class_weight': 'balanced', 'classify__penalty': 'l2'}
inner cv f1 score: 0.404841936775
outter cv results
	Precision: 0.000
	Recall: 0.000
	F1: 0.000
	Accuracy: 0.312

Anon11
inner cv parameters: {'classify__C': 100, 'classify__tol': 0.01, 'classify__class_weight': 'balanced', 'classify__penalty': 'l2'}
inner cv f1 score: 0.4589569161
outter cv results
	Precision: 0.333
	Recall: 0.111
	F1: 0.167
	Accuracy: 0.375

Anon

## 1.1 Whether or not predictions can be made equally well for all participants?

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE

from sklearn.metrics import (precision_score, recall_score, f1_score, accuracy_score)
## Read data from features_labels_tobii.csv

df_merge = pd.read_csv("features_labels_webgazer.csv")
list_id_report = list(df_merge['id'].unique())

y_test_total = []
y_pred_bayes_total = []

list_pre = []
list_rec = []
list_f1 = []

## Leave-one-out machine learning methods (try Logistic Regression first)
for i in range(0, len(list_id_report)):
    id_str = list_id_report[i]
    
    print id_str
    print "Run: " + str(i)
    data_train = df_merge.ix[df_merge['id'] != id_str]
    # print data_train.shape
    data_test = df_merge.ix[df_merge['id'] == id_str]
    # print data_test.shape
    
    X_train = data_train.ix[:, 10:].fillna(value=0)
    y_train = data_train.ix[:, 4]
    
#     sm = SMOTE(random_state=48)
#     X_train, y_train = sm.fit_sample(X_train, y_train)
    
    X_test = data_test.ix[:, 10:].fillna(value=0)
    y_test = data_test.ix[:, 4]
    
    y_test_total.extend(y_test)
    
    ## Logistic Regression
    
    clf_GaussianNB = GaussianNB()
    clf_GaussianNB = clf_GaussianNB.fit(X_train, y_train)
    y_pred_bayes = clf_GaussianNB.predict(X_test)
    y_pred_bayes_total.extend(y_pred_bayes)
    
    print("\tPrecision: %1.3f" % precision_score(y_test, y_pred_bayes))
    print("\tRecall: %1.3f" % recall_score(y_test, y_pred_bayes))
    print("\tF1: %1.3f" % f1_score(y_test, y_pred_bayes))
    print("\tAccuracy: %1.3f\n" % accuracy_score(y_test, y_pred_bayes))
    
    list_pre.append(precision_score(y_test, y_pred_bayes))
    list_rec.append(recall_score(y_test, y_pred_bayes))
    list_f1.append(f1_score(y_test, y_pred_bayes))

print "Final Results: L2 Logistic Regression"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_bayes_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_bayes_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_bayes_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_bayes_total))
print("Precision")
print np.max(list_pre)
print np.min(list_pre)
print np.mean(list_pre)
print np.median(list_pre)
print np.std(list_pre)
print("Recall")
print np.max(list_rec)
print np.min(list_rec)
print np.mean(list_rec)
print np.median(list_rec)
print np.std(list_rec)
print("F1")
print np.max(list_f1)
print np.min(list_f1)
print np.mean(list_f1)
print np.median(list_f1)
print np.std(list_f1)

Anon01
Run: 0
	Precision: 0.500
	Recall: 0.400
	F1: 0.444
	Accuracy: 0.688

Anon02
Run: 1
	Precision: 0.667
	Recall: 0.571
	F1: 0.615
	Accuracy: 0.706

Anon03
Run: 2
	Precision: 0.400
	Recall: 0.667
	F1: 0.500
	Accuracy: 0.467

Anon04
Run: 3
	Precision: 0.333
	Recall: 0.750
	F1: 0.462
	Accuracy: 0.533

Anon05
Run: 4
	Precision: 0.364
	Recall: 1.000
	F1: 0.533
	Accuracy: 0.533

Anon06
Run: 5
	Precision: 0.000
	Recall: 0.000
	F1: 0.000
	Accuracy: 0.800

Anon07
Run: 6
	Precision: 0.125
	Recall: 1.000
	F1: 0.222
	Accuracy: 0.500

Anon08
Run: 7
	Precision: 0.154
	Recall: 1.000
	F1: 0.267
	Accuracy: 0.267

Anon09
Run: 8
	Precision: 0.545
	Recall: 0.667
	F1: 0.600
	Accuracy: 0.500

Anon10
Run: 9
	Precision: 0.357
	Recall: 1.000
	F1: 0.526
	Accuracy: 0.438

Anon11
Run: 10
	Precision: 0.600
	Recall: 1.000
	F1: 0.750
	Accuracy: 0.625

Anon12
Run: 11
	Precision: 0.308
	Recall: 1.000
	F1: 0.471
	Accuracy: 0.400

Anon13
Run: 12
	Precision: 0.083
	Recall: 1.000
	F1: 0.154
	Accuracy: 0.267

Final Res

## 1.2 whether or not predictions can be made well across the entire length of the lecture videos?

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE

from sklearn.metrics import (precision_score, recall_score, f1_score, accuracy_score)
## Read data from features_labels_tobii.csv

df_merge = pd.read_csv("features_labels_webgazer.csv")
list_id_report = list(df_merge['id'].unique())

y_test_total = []
y_pred_bayes_total = []

list_pre = []
list_rec = []
list_f1 = []

## Leave-one-out machine learning methods (try Logistic Regression first)
for i in range(0, len(list_id_report)):
    id_str = list_id_report[i]    
    
    # data_train = df_merge.ix[df_merge['id'] != id_str]
    # data_train = data_train.ix[data_train['video_id'] == "Solar"]
    # data_train = data_train.ix[data_train['video_id'] != "Solar"]
    
    # data_train = df_merge
    # data_train = data_train.ix[data_train['video_id'] != "Solar"]
    
    data_train = df_merge
    data_train = data_train.ix[((data_train['video_id'] == "Solar")&
                                (data_test['endtime_video']/data_test['video_length'] >= 0.5))|
                               (data_train['video_id'] != "Solar")
                              ]
    
    # print data_train.shape
    data_test = df_merge.ix[df_merge['id'] == id_str]
    data_test = data_test.ix[(data_test['video_id'] == "Solar")&(data_test['endtime_video']/data_test['video_length'] < 0.5)]
    # print data_test.shape
    X_train = data_train.ix[:, 10:].fillna(value=0)
    y_train = data_train.ix[:, 4]
    
#     sm = SMOTE(random_state=48)
#     X_train, y_train = sm.fit_sample(X_train, y_train)
    
    X_test = data_test.ix[:, 10:].fillna(value=0)
    y_test = data_test.ix[:, 4]
    
    y_test_total.extend(y_test)
    
    ## Logistic Regression
    
    clf_GaussianNB = GaussianNB()
    clf_GaussianNB = clf_GaussianNB.fit(X_train, y_train)
    y_pred_bayes = clf_GaussianNB.predict(X_test)
    y_pred_bayes_total.extend(y_pred_bayes)
    
    list_pre.append(precision_score(y_test, y_pred_bayes))
    list_rec.append(recall_score(y_test, y_pred_bayes))
    list_f1.append(f1_score(y_test, y_pred_bayes))

print "Final Results: GaussianNB"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_bayes_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_bayes_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_bayes_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_bayes_total))

Final Results: GaussianNB
	Precision: 0.458
	Recall: 0.846
	F1: 0.595
	Accuracy: 0.706



## 1.3 whether or not a model trained on one video translates to good predictions in other videos?

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE

from sklearn.metrics import (precision_score, recall_score, f1_score, accuracy_score)
## Read data from features_labels_tobii.csv

# df_merge = pd.read_csv("features_labels_tobii.csv")
df_merge = pd.read_csv("features_labels_webgazer.csv")
list_id_report = list(df_merge['id'].unique())

y_test_total = []
y_pred_bayes_total = []

list_pre = []
list_rec = []
list_f1 = []

## Leave-one-out machine learning methods (try Logistic Regression first)
for i in range(0, len(list_id_report)):
    id_str = list_id_report[i]    
    
    data_train = df_merge.ix[df_merge['id'] != id_str]
    # data_train = data_train.ix[data_train['video_id'] == "Solar"]
    data_train = data_train.ix[data_train['video_id'] != "Solar"]
    
    # data_train = df_merge
    # data_train = data_train.ix[data_train['video_id'] != "Solar"]
    
    #  data_train = df_merge
    #  data_train = data_train.ix[data_train['video_id'] == "Solar"]
    
    # print data_train.shape
    data_test = df_merge.ix[df_merge['id'] == id_str]
    # data_test = data_test.ix[data_test['video_id'] == "Solar"]
    data_test = data_test.ix[data_test['video_id'] != "Solar"]
    
    # print data_test.shape
    X_train = data_train.ix[:, 10:].fillna(value=0)
    y_train = data_train.ix[:, 4]
    
#     sm = SMOTE(random_state=48)
#     X_train, y_train = sm.fit_sample(X_train, y_train)
    
    X_test = data_test.ix[:, 10:].fillna(value=0)
    y_test = data_test.ix[:, 4]
    
    y_test_total.extend(y_test)
    
    ## Logistic Regression
    
    clf_GaussianNB = GaussianNB()
    clf_GaussianNB = clf_GaussianNB.fit(X_train, y_train)
    y_pred_bayes = clf_GaussianNB.predict(X_test)
    y_pred_bayes_total.extend(y_pred_bayes)
    
    list_pre.append(precision_score(y_test, y_pred_bayes))
    list_rec.append(recall_score(y_test, y_pred_bayes))
    list_f1.append(f1_score(y_test, y_pred_bayes))

print "Final Results: GaussianNB"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_bayes_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_bayes_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_bayes_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_bayes_total))

Final Results: GaussianNB
	Precision: 0.407
	Recall: 0.815
	F1: 0.543
	Accuracy: 0.593

