In this document, we first extract learners' mind-wandering from the json data. Then we map the data on the timeline. We process the mind-wandering data generated by reporting and questions seperately.

## 1. Data Reading
In this step, we read gaze data of Tobii from a tsv file and mind-wandering reports from a json file.

In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame

folderpath_tobiidata = "../Data_Publish/Data_Tobii"
folderpath_webgazerdata = "../Data_Publish/Data_Event"

### 1.1 Reading Mind-Wandering Data
We only read some important data columns from the json file.

1. Rating
2. Bell Rings

In [2]:
import json
import os

list_filepath_report = []
list_id_report = []
for file in os.listdir(folderpath_webgazerdata):
    if file.endswith(".json"):
        list_filepath_report.append(os.path.join(folderpath_webgazerdata, file))
        list_id_report.append(file[0:-5])
        
print list_filepath_report
print list_id_report

['../Data_Publish/Data_Event/Anon01.json', '../Data_Publish/Data_Event/Anon02.json', '../Data_Publish/Data_Event/Anon03.json', '../Data_Publish/Data_Event/Anon04.json', '../Data_Publish/Data_Event/Anon05.json', '../Data_Publish/Data_Event/Anon06.json', '../Data_Publish/Data_Event/Anon07.json', '../Data_Publish/Data_Event/Anon08.json', '../Data_Publish/Data_Event/Anon09.json', '../Data_Publish/Data_Event/Anon10.json', '../Data_Publish/Data_Event/Anon11.json', '../Data_Publish/Data_Event/Anon12.json', '../Data_Publish/Data_Event/Anon13.json']
['Anon01', 'Anon02', 'Anon03', 'Anon04', 'Anon05', 'Anon06', 'Anon07', 'Anon08', 'Anon09', 'Anon10', 'Anon11', 'Anon12', 'Anon13']


#### Data Format 
id, starttime_iso, endtime_iso, starttime_video, endtime_video, video_length, label

In [3]:
import datetime

df_reports = pd.DataFrame()

for i in range(0, len(list_id_report)):
    # id
    id_str = list_id_report[i]
    print id_str
    # starttime_iso, endtime_iso, starttime_video, endtime_video, video_length, label
    with open(list_filepath_report[i]) as file_json_data:
        json_data = json.load(file_json_data)
        # print(json_data['activity'])
        
        # TODO: full screen playing info
        fullscreen_list = []
        fullscreen_temp = {'enter': '', 'exit': ''}
        for videostatus in json_data['user']['videostatus']:
            if videostatus['status'] == 'Fullscreen_enter':
                fullscreen_temp = {'enter': '', 'exit': ''}
                fullscreen_temp['enter'] = videostatus['time']
            elif videostatus['status'] == 'Fullscreen_exit':
                if (fullscreen_temp['enter'] != ''):
                    fullscreen_temp['exit'] = videostatus['time']
                    fullscreen_list.append(fullscreen_temp)
                    fullscreen_temp = {'enter': '', 'exit': ''}
            elif videostatus['status'] == 'ENDED':
                if (fullscreen_temp['enter'] != ''):
                    fullscreen_temp['exit'] = videostatus['time']
                    fullscreen_list.append(fullscreen_temp)
                    fullscreen_temp = {'enter': '', 'exit': ''}           
        
        print fullscreen_list
        
        pre_video_id = ""
        video_order = 1
        
        for bell in json_data['user']['ratingbells']:
            ## End time is the time when bell rings
            endtime_iso = bell['time']
            # print endtime_iso
            endtime_iso_datetime = datetime.datetime.strptime(endtime_iso, "%Y-%m-%dT%H:%M:%S.%fZ")
            ## Start time is 30 sec before the end time
            starttime_iso_datetime = endtime_iso_datetime - datetime.timedelta(seconds=30)
            (dt, micro) = starttime_iso_datetime.strftime('%Y-%m-%dT%H:%M:%S.%f').split('.')
            starttime_iso = "%s.%03dZ" % (dt, int(micro) / 1000)
            # print starttime_iso
            
            endtime_video = bell['videoTime']
            # print endtime_video
            ## There is no stop in last 30 sec before the bell rings. 
            ## Since each time the video playing starts, they will ring the bell after 30 sec.
            starttime_video = str(float(endtime_video) - 30)
            # print starttime_video
            
            video_length = bell['videoDuration']
            # print video_length

            video_id = ""           
            if float(video_length) < 420:
                video_id = "Nuclear"  
            else:
                video_id = "Solar"
            
            if pre_video_id == "":
                video_order = 1
                pre_video_id = video_id
            elif video_id != pre_video_id:
                video_order = video_order + 1
                pre_video_id = video_id
            
            label = 0
            for rating in json_data['user']['ratings']:
                ratingtime_iso = rating['time']
                # print ratingtime_iso
                ratingtime_iso_datetime = datetime.datetime.strptime(ratingtime_iso, "%Y-%m-%dT%H:%M:%S.%fZ")
                if ratingtime_iso_datetime > endtime_iso_datetime and ratingtime_iso_datetime < endtime_iso_datetime + datetime.timedelta(seconds=10):
                    label = 1
                    break
            # print label          
            
            fullscreen_flag = 0
            for fullscreen_play in fullscreen_list:
                if (starttime_iso > fullscreen_play['enter']) and (endtime_iso < fullscreen_play['exit']):
                    fullscreen_flag = 1
                    break
            
            ## Add data into dataframe
            df_reports = df_reports.append({'id': id_str,
                                            'video_id': video_id,
                                            'video_order': video_order,
                                            'starttime_iso': starttime_iso, 
                                            'endtime_iso': endtime_iso,
                                            'starttime_video': starttime_video,
                                            'endtime_video': endtime_video,
                                            'video_length': video_length,
                                            'label': label,
                                            'fullscreen': fullscreen_flag
                                           }, 
                                           ignore_index=True)
print df_reports.head(10)
print df_reports.shape
print df_reports[df_reports['label'] == 1].shape
print df_reports[df_reports['label'] == 0].shape
print df_reports.video_order.values
print df_reports.video_id.values

Anon01
[]
Anon02
[{'exit': u'2017-04-12T11:28:33.178Z', 'enter': u'2017-04-12T11:20:41.610Z'}, {'exit': u'2017-04-12T11:40:20.780Z', 'enter': u'2017-04-12T11:33:36.025Z'}]
Anon03
[]
Anon04
[{'exit': u'2017-04-06T11:53:39.676Z', 'enter': u'2017-04-06T11:45:46.491Z'}, {'exit': u'2017-04-06T12:05:42.023Z', 'enter': u'2017-04-06T11:58:57.059Z'}]
Anon05
[]
Anon06
[]
Anon07
[{'exit': u'2017-04-13T12:45:32.365Z', 'enter': u'2017-04-13T12:38:52.824Z'}, {'exit': u'2017-04-13T12:59:46.088Z', 'enter': u'2017-04-13T12:51:58.970Z'}]
Anon08
[]
Anon09
[]
Anon10
[]
Anon11
[]
Anon12
[{'exit': u'2017-04-12T15:37:00.416Z', 'enter': u'2017-04-12T15:30:23.180Z'}, {'exit': u'2017-04-12T15:48:18.222Z', 'enter': u'2017-04-12T15:40:29.354Z'}]
Anon13
[{'exit': u'2017-04-12T14:32:31.031Z', 'enter': u'2017-04-12T14:24:43.184Z'}, {'exit': u'2017-04-12T14:43:27.772Z', 'enter': u'2017-04-12T14:36:46.908Z'}]
                endtime_iso       endtime_video  fullscreen      id  label  \
0  2017-04-12T09:28:47.772Z     

### 1.1 Reading Tobii Gaze Data
We only read some important data columns from the csv file.

In [4]:
# Data convert funtion

import pytz, datetime

def localtime_to_utc(localdate, localtimestamp):
    # local data format 3/15/2017 needs to be changed
    if (len(localdate.split("/")[0]) == 1):
        localdate = "0" + localdate
    timestring = localdate + " " + localtimestamp
    # print timestring
    local = pytz.timezone ('Europe/Amsterdam')
    naive = datetime.datetime.strptime(timestring, "%m/%d/%Y %H:%M:%S.%f")
    local_dt = local.localize(naive, is_dst=None)
    utc_dt = local_dt.astimezone(pytz.utc)
    # print utc_dt
    (dt, micro) = utc_dt.strftime('%Y-%m-%dT%H:%M:%S.%f').split('.')
    dt = "%s.%03dZ" % (dt, int(micro) / 1000)
    # print dt
    return dt

# # test
# localdate = '3/16/2017'
# localtimestamp = '10:41:51.388'
# utc_dt = localtime_to_utc(localdate,localtimestamp)
# print utc_dt

## Feature Selection

In [21]:
# Data format: id, starttime_iso, endtime_iso, feature 1, feature 2.......
import math
from scipy.stats import kurtosis
from scipy.stats import skew

df_features = pd.DataFrame()

for i in range(0, len(list_id_report)):
    # id
    id_str = list_id_report[i]
    df_reports_withid = df_reports.loc[df_reports['id'] == id_str]
    
    ## Read the tsv file based on id_str
    path_gazedata_Tobii = os.path.join(folderpath_tobiidata, id_str + ".csv")
    df_GazeData_Tobii = DataFrame.from_csv(path_gazedata_Tobii, sep=",")
    # print df_GazeData_Tobii.head(5)
    df_GazeData_Tobii = df_GazeData_Tobii.reset_index()
    
#     # print df_GazeData_Tobii.head(5)
#     ## Time convert
#     df_GazeData_Tobii['Timestamp_utc'] = df_GazeData_Tobii.apply(lambda row: localtime_to_utc(row['RecordingDate'], row['LocalTimeStamp']), axis=1)
#     df_GazeData_Tobii = df_GazeData_Tobii.drop('RecordingDate', 1)
#     df_GazeData_Tobii = df_GazeData_Tobii.drop('LocalTimeStamp', 1)
#     # print df_GazeData_Tobii.head(5)
    
#     ## Remove unnessesary data   
#     df_GazeData_Tobii = df_GazeData_Tobii[['Timestamp_utc',
#                                            'FixationIndex',
#                                            'GazeEventDuration',
#                                            'FixationPointX (MCSpx)',
#                                            'FixationPointY (MCSpx)',
#                                            'AbsoluteSaccadicDirection',
#                                            'GazePointX (ADCSpx)',
#                                            'GazePointY (ADCSpx)']]

#     print df_GazeData_Tobii.head(10)
    
    for index, row in df_reports_withid.iterrows():
        starttime_iso = row['starttime_iso']
#         print starttime_iso
        endtime_iso = row['endtime_iso']
#         print endtime_iso
        
        ## Select Data from df_GazeData_Tobii based on starttime_iso and endtime_iso
        df_GazeData_Tobii_selected = df_GazeData_Tobii.loc[((df_GazeData_Tobii['Timestamp_utc'] >= starttime_iso) &
                                                           (df_GazeData_Tobii['Timestamp_utc'] <= endtime_iso))
                                                          ]
        
        # print df_GazeData_Tobii_selected.head(20)
        # print df_GazeData_Tobii_selected.shape
        # print df_GazeData_Tobii_selected.columns
        
        ## Global Features: Feature Selection based on selected data
        temp_fixationindex = 0
        temp_timestamp = ""
        temp_FixationPointX = 0
        temp_FixationPointY = 0
        list_fixationduration = []
        list_saccadeduration = []
        list_saccadedistance = []
        list_saccadeangel = []
        
        ## Local Features: Feature Selection based on selected data
        
        # TODO: get the info about the video and fullscreen playing
        video_length = row['video_length']
        fullscreen_flag = row['fullscreen']
        
        face_topleft_x = 0
        face_topleft_y = 0
        face_bottomright_x = 0
        face_bottomright_y = 0
                
        slide_topleft_x = 0
        slide_topleft_y = 0
        slide_bottomright_x = 0
        slide_bottomright_y = 0
                
        subtitle_topleft_x = 0
        subtitle_topleft_y = 0
        subtitle_bottomright_x = 0
        subtitle_bottomright_y = 0
        
        # Select coordinate boundaries for the 
        if video_length <= 410: # Nucl should be replaced by video_id
            if fullscreen_flag:
                
                face_topleft_x = 1191
                face_topleft_y = 239
                face_bottomright_x = 1191+331
                face_bottomright_y = 239+280
                
                slide_topleft_x = 137
                slide_topleft_y = 132
                slide_bottomright_x = 137+929
                slide_bottomright_y = 132+557
                
                subtitle_topleft_x = 402
                subtitle_topleft_y = 892
                subtitle_bottomright_x = 402+1134
                subtitle_bottomright_y = 892+134
            
            else:
                
                face_topleft_x = 1089
                face_topleft_y = 297
                face_bottomright_x = 1089+139
                face_bottomright_y = 297+141
                
                slide_topleft_x = 558
                slide_topleft_y = 246
                slide_bottomright_x = 558+456
                slide_bottomright_y = 246+273
                
                subtitle_topleft_x = 721
                subtitle_topleft_y = 617
                subtitle_bottomright_x = 721+478
                subtitle_bottomright_y = 617+67
        
        elif video_length > 450: # Solar
            
            if fullscreen_flag:
                
                face_topleft_x = 501
                face_topleft_y = 189
                face_bottomright_x = 501+267
                face_bottomright_y = 189+260
                
                slide_topleft_x = 861
                slide_topleft_y = 313
                slide_bottomright_x = 861+811
                slide_bottomright_y = 313+483
                
                subtitle_topleft_x = 458
                subtitle_topleft_y = 900
                subtitle_bottomright_x = 458+1010
                subtitle_bottomright_y = 900+128
            
            else:
                
                face_topleft_x = 721
                face_topleft_y = 263
                face_bottomright_x = 721+146
                face_bottomright_y = 263+139
                
                slide_topleft_x = 913
                slide_topleft_y = 330
                slide_bottomright_x = 913+401
                slide_bottomright_y = 330+241
                
                subtitle_topleft_x = 709
                subtitle_topleft_y = 622
                subtitle_bottomright_x = 709+499
                subtitle_bottomright_y = 622+60
        
        ## Define a basic funtion for calculating whether fixations in aoi or not.
        def isinaoi(fixation_x, fixation_y):            
            if ((fixation_x >= face_topleft_x and fixation_x <= face_bottomright_x) and 
               (fixation_y >= face_topleft_y and fixation_y <= face_bottomright_y)):
                return "face"
            elif ((fixation_x >= subtitle_topleft_x and fixation_x <= subtitle_bottomright_x) and 
               (fixation_y >= subtitle_topleft_y and fixation_y <= subtitle_bottomright_y)):
                return "subtitle"
            elif ((fixation_x >= slide_topleft_x and fixation_x <= slide_bottomright_x) and 
               (fixation_y >= slide_topleft_y and fixation_y <= slide_bottomright_y)): 
                return "slide"
            else:
                return "out"
        
        ## Define local features
        
        # num of saccade jump from one area to another
        num_saccade_aoi_face_out2in = 0
        num_saccade_aoi_face_aoi2in = 0
        num_saccade_aoi_face_in2out = 0
        num_saccade_aoi_face_in2aoi = 0
        num_saccade_aoi_face_within = 0
        
        num_saccade_aoi_slide_out2in = 0
        num_saccade_aoi_slide_aoi2in = 0
        num_saccade_aoi_slide_in2out = 0
        num_saccade_aoi_slide_in2aoi = 0
        num_saccade_aoi_slide_within = 0
        
        num_saccade_aoi_subtitle_out2in = 0
        num_saccade_aoi_subtitle_aoi2in = 0
        num_saccade_aoi_subtitle_in2out = 0
        num_saccade_aoi_subtitle_in2aoi = 0
        num_saccade_aoi_subtitle_within = 0
        
        temp_aoi = "out"
        
        # numbers and durations of fixations in AOIs.
        list_duration_fixation_aoi_face = []
        list_duration_fixation_aoi_subtitle = []
        list_duration_fixation_aoi_slide = []
        # fixations out of AOIs
        list_duration_fixation_aoi_out = []
        
        for index, row in df_GazeData_Tobii_selected.iterrows():
            if np.isnan(row['FixationIndex']):
                continue
            
            if temp_fixationindex == 0:
                temp_fixationindex = row['FixationIndex']
                temp_timestamp = row['Timestamp_utc']
                temp_FixationPointX = row['FixationPointX (MCSpx)']
                temp_FixationPointY = row['FixationPointY (MCSpx)']
                
                list_fixationduration.append(row['GazeEventDuration'])
                list_saccadeangel.append(row['AbsoluteSaccadicDirection'])
                
                ## calculate local features
                current_aoi = isinaoi(row['FixationPointX (MCSpx)'], row['FixationPointY (MCSpx)'])
                if current_aoi == "face":
                    list_duration_fixation_aoi_face.append(row['GazeEventDuration'])
                elif current_aoi == "subtitle":
                    list_duration_fixation_aoi_subtitle.append(row['GazeEventDuration'])
                elif current_aoi == "slide":
                    list_duration_fixation_aoi_slide.append(row['GazeEventDuration'])
                else:
                    list_duration_fixation_aoi_out.append(row['GazeEventDuration'])                
                temp_aoi = current_aoi
            
            elif temp_fixationindex != row['FixationIndex']:
                
                # Global features
                temp_fixationindex = row['FixationIndex']
                list_fixationduration.append(row['GazeEventDuration'])
                list_saccadeangel.append(row['AbsoluteSaccadicDirection'])
                
                datetime_previous = datetime.datetime.strptime(temp_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")
                datetime_current = datetime.datetime.strptime(row['Timestamp_utc'], "%Y-%m-%dT%H:%M:%S.%fZ")
                saccadeduration = datetime_current - datetime_previous
                list_saccadeduration.append(float(saccadeduration.total_seconds() * 1000))
                
                FixationPointX_current = row['FixationPointX (MCSpx)']
                FixationPointY_current = row['FixationPointY (MCSpx)']
                saccadedistance = math.sqrt(math.pow((FixationPointX_current - temp_FixationPointX), 2) + 
                                            math.pow((FixationPointY_current - temp_FixationPointY), 2))
                list_saccadedistance.append(saccadedistance)
                
                temp_timestamp = row['Timestamp_utc']
                temp_FixationPointX = row['FixationPointX (MCSpx)']
                temp_FixationPointY = row['FixationPointY (MCSpx)']
                
                # Local features
                current_aoi = isinaoi(row['FixationPointX (MCSpx)'], row['FixationPointY (MCSpx)'])
                
                if current_aoi == "face":
                    
                    list_duration_fixation_aoi_face.append(row['GazeEventDuration'])
                    
                    if temp_aoi == "face":
                        num_saccade_aoi_face_within = num_saccade_aoi_face_within + 1
                    elif temp_aoi == "out":
                        num_saccade_aoi_face_out2in = num_saccade_aoi_face_out2in + 1
                    else:
                        num_saccade_aoi_face_aoi2in = num_saccade_aoi_face_aoi2in + 1
                        if temp_aoi == "slide":
                            num_saccade_aoi_slide_in2aoi = num_saccade_aoi_slide_in2aoi + 1
                        else:
                            num_saccade_aoi_subtitle_in2aoi = num_saccade_aoi_subtitle_in2aoi + 1
                        
                elif current_aoi == "subtitle":
                    
                    list_duration_fixation_aoi_subtitle.append(row['GazeEventDuration'])
                    
                    if temp_aoi == "subtitle":
                        num_saccade_aoi_subtitle_within = num_saccade_aoi_subtitle_within + 1
                    elif temp_aoi == "out":
                        num_saccade_aoi_subtitle_out2in = num_saccade_aoi_subtitle_out2in + 1
                    else:
                        num_saccade_aoi_subtitle_aoi2in = num_saccade_aoi_subtitle_aoi2in + 1
                        if temp_aoi == "face":
                            num_saccade_aoi_face_in2aoi = num_saccade_aoi_face_in2aoi + 1
                        else:
                            num_saccade_aoi_slide_in2aoi = num_saccade_aoi_slide_in2aoi + 1
                    
                elif current_aoi == "slide":
                    list_duration_fixation_aoi_slide.append(row['GazeEventDuration'])
                    
                    if temp_aoi == "slide":
                        num_saccade_aoi_slide_within = num_saccade_aoi_slide_within + 1
                    elif temp_aoi == "out":
                        num_saccade_aoi_slide_out2in = num_saccade_aoi_slide_out2in + 1
                    else:
                        num_saccade_aoi_subtitle_aoi2in = num_saccade_aoi_subtitle_aoi2in + 1
                        if temp_aoi == "face":
                            num_saccade_aoi_face_in2aoi = num_saccade_aoi_face_in2aoi + 1
                        else:
                            num_saccade_aoi_subtitle_in2aoi = num_saccade_aoi_subtitle_in2aoi + 1
                    
                else:
                    list_duration_fixation_aoi_out.append(row['GazeEventDuration'])
                    if temp_aoi == "slide":
                        num_saccade_aoi_slide_in2out = num_saccade_aoi_slide_in2out + 1
                    elif temp_aoi == "face":
                        num_saccade_aoi_face_in2out = num_saccade_aoi_face_in2out + 1
                    elif temp_aoi == "subtitle":
                        num_saccade_aoi_subtitle_in2out = num_saccade_aoi_subtitle_in2out + 1
                 
                temp_aoi = current_aoi
        
        num_saccade_horizon = sum(1 for i in list_saccadeangel if ((i <= 30 and i >= -30) or (i >= 150 and i <= 210) or (i >= 330)))
#         print num_saccade_horizon
#         print len(list_fixationduration)
#         print len(list_saccadeduration)
#         print len(list_saccadedistance)
#         print len(list_saccadeangel)
#         print len(list_duration_fixation_aoi_face)
#         print len(list_duration_fixation_aoi_subtitle)
#         print len(list_duration_fixation_aoi_slide)
#         print len(list_duration_fixation_aoi_out)
        
        duration_fixation_aoi_face = 0
        duration_fixation_aoi_face_max = 0
        duration_fixation_aoi_subtitle = 0
        duration_fixation_aoi_subtitle_max = 0
        duration_fixation_aoi_slide = 0
        duration_fixation_aoi_slide_max = 0
        duration_fixation_aoi_out = 0
        duration_fixation_aoi_out_max = 0 
        
        if len(list_duration_fixation_aoi_face) != 0:
            duration_fixation_aoi_face = sum(list_duration_fixation_aoi_face)/sum(list_fixationduration)
            duration_fixation_aoi_face_max = np.max(list_duration_fixation_aoi_face)
        if len(list_duration_fixation_aoi_subtitle) != 0:
            duration_fixation_aoi_subtitle = sum(list_duration_fixation_aoi_subtitle)/sum(list_fixationduration)
            duration_fixation_aoi_subtitle_max = np.max(list_duration_fixation_aoi_subtitle)
        if len(list_duration_fixation_aoi_slide) != 0:
            duration_fixation_aoi_slide = sum(list_duration_fixation_aoi_slide)/sum(list_fixationduration)
            duration_fixation_aoi_slide_max = np.max(list_duration_fixation_aoi_slide)
        if len(list_duration_fixation_aoi_out) != 0:
            duration_fixation_aoi_out = sum(list_duration_fixation_aoi_out)/sum(list_fixationduration)
            duration_fixation_aoi_out_max = np.max(list_duration_fixation_aoi_out)
        
        ## Add features into df_features
        df_features = df_features.append({
                'id': id_str, 
                'starttime_iso': starttime_iso, 
                'endtime_iso': endtime_iso,
                'fixationduration_min': np.min(list_fixationduration),
                'fixationduration_max': np.max(list_fixationduration),
                'fixationduration_mean': np.mean(list_fixationduration),
                'fixationduration_median': np.median(list_fixationduration),
                'fixationduration_stddev': np.std(list_fixationduration),
                'fixationduration_range': np.max(list_fixationduration) - np.min(list_fixationduration),
                'fixationduration_kurtosis': kurtosis(list_fixationduration),
                'fixationduration_skew': skew(list_fixationduration),
                'saccadeduration_min': np.min(list_saccadeduration),
                'saccadeduration_max': np.max(list_saccadeduration),
                'saccadeduration_mean': np.mean(list_saccadeduration),
                'saccadeduration_median': np.median(list_saccadeduration),
                'saccadeduration_stddev': np.std(list_saccadeduration),
                'saccadeduration_range': np.max(list_saccadeduration) - np.min(list_saccadeduration),
                'saccadeduration_kurtosis': kurtosis(list_saccadeduration),
                'saccadeduration_skew': skew(list_saccadeduration),
                'saccadedistance_min': np.min(list_saccadedistance),
                'saccadedistance_max': np.max(list_saccadedistance),
                'saccadedistance_mean': np.mean(list_saccadedistance),
                'saccadedistance_median': np.median(list_saccadedistance),
                'saccadedistance_stddev': np.std(list_saccadedistance),
                'saccadedistance_range': np.max(list_saccadedistance) - np.min(list_saccadedistance),
                'saccadedistance_kurtosis': kurtosis(list_saccadedistance),
                'saccadedistance_skew': skew(list_saccadedistance),
                'saccadeangel_min': np.min(list_saccadeangel),
                'saccadeangel_max': np.max(list_saccadeangel),
                'saccadeangel_mean': np.mean(list_saccadeangel),
                'saccadeangel_median': np.median(list_saccadeangel),
                'saccadeangel_stddev': np.std(list_saccadeangel),
                'saccadeangel_range': np.max(list_saccadeangel) - np.min(list_saccadeangel),
                'saccadeangel_kurtosis': kurtosis(list_saccadeangel),
                'saccadeangel_skew': skew(list_saccadeangel),
                'saccade_num': len(list_saccadeduration),
                'saccade_horizonratio': num_saccade_horizon/len(list_saccadeangel),
                'fixation_saccade_ratio': sum(list_fixationduration)/sum(list_saccadeduration),
                ## LOCAL FEATURES
                'num_saccade_aoi_face_out2in': num_saccade_aoi_face_out2in/len(list_saccadeduration),
                'num_saccade_aoi_face_aoi2in': num_saccade_aoi_face_aoi2in/len(list_saccadeduration),
                'num_saccade_aoi_face_in2out': num_saccade_aoi_face_in2out/len(list_saccadeduration),
                'num_saccade_aoi_face_in2aoi': num_saccade_aoi_face_in2aoi/len(list_saccadeduration),
                'num_saccade_aoi_face_within': num_saccade_aoi_face_within/len(list_saccadeduration),
                'num_saccade_aoi_slide_out2in': num_saccade_aoi_slide_out2in/len(list_saccadeduration),
                'num_saccade_aoi_slide_aoi2in': num_saccade_aoi_slide_aoi2in/len(list_saccadeduration),
                'num_saccade_aoi_slide_in2out': num_saccade_aoi_slide_in2out/len(list_saccadeduration),
                'num_saccade_aoi_slide_in2aoi': num_saccade_aoi_slide_in2aoi/len(list_saccadeduration),
                'num_saccade_aoi_slide_within': num_saccade_aoi_slide_within/len(list_saccadeduration),
                'num_saccade_aoi_subtitle_out2in': num_saccade_aoi_subtitle_out2in/len(list_saccadeduration),
                'num_saccade_aoi_subtitle_aoi2in': num_saccade_aoi_subtitle_aoi2in/len(list_saccadeduration),
                'num_saccade_aoi_subtitle_in2out': num_saccade_aoi_subtitle_in2out/len(list_saccadeduration),
                'num_saccade_aoi_subtitle_in2aoi': num_saccade_aoi_subtitle_in2aoi/len(list_saccadeduration),
                'num_saccade_aoi_subtitle_within': num_saccade_aoi_subtitle_within/len(list_saccadeduration),
                'duration_fixation_aoi_face': duration_fixation_aoi_face,
                'duration_fixation_aoi_face_max': duration_fixation_aoi_face_max,
                'duration_fixation_aoi_subtitle': duration_fixation_aoi_subtitle,
                'duration_fixation_aoi_subtitle_max': duration_fixation_aoi_subtitle_max,
                'duration_fixation_aoi_slide': duration_fixation_aoi_slide,
                'duration_fixation_aoi_slide_max': duration_fixation_aoi_slide_max,
                'duration_fixation_aoi_out': duration_fixation_aoi_out,
                'duration_fixation_aoi_out_max': duration_fixation_aoi_out_max
            }, ignore_index=True)
        
        # print df_features.head(1)
print df_features.shape

(200, 61)


In [22]:
## Merge features with labels
df_merge = pd.merge(df_reports, df_features)
df_merge.to_csv("features_labels_tobii.csv", index=False)

In [23]:
print df_merge.columns
print df_merge.columns[10:18]
print df_merge.columns[18:27]
print df_merge.columns[27:42]
print df_merge.columns[42:]
print len(df_merge.columns)

Index([u'endtime_iso', u'endtime_video', u'fullscreen', u'id', u'label',
       u'starttime_iso', u'starttime_video', u'video_id', u'video_length',
       u'video_order', u'duration_fixation_aoi_face',
       u'duration_fixation_aoi_face_max', u'duration_fixation_aoi_out',
       u'duration_fixation_aoi_out_max', u'duration_fixation_aoi_slide',
       u'duration_fixation_aoi_slide_max', u'duration_fixation_aoi_subtitle',
       u'duration_fixation_aoi_subtitle_max', u'fixation_saccade_ratio',
       u'fixationduration_kurtosis', u'fixationduration_max',
       u'fixationduration_mean', u'fixationduration_median',
       u'fixationduration_min', u'fixationduration_range',
       u'fixationduration_skew', u'fixationduration_stddev',
       u'num_saccade_aoi_face_aoi2in', u'num_saccade_aoi_face_in2aoi',
       u'num_saccade_aoi_face_in2out', u'num_saccade_aoi_face_out2in',
       u'num_saccade_aoi_face_within', u'num_saccade_aoi_slide_aoi2in',
       u'num_saccade_aoi_slide_in2aoi', u'num

## Prediction With Global Features and Local Features

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE

from sklearn.metrics import (precision_score, recall_score, f1_score, accuracy_score)
## Read data from features_labels_tobii.csv

# df_merge = pd.read_csv("features_labels_tobii.csv")
df_merge = pd.read_csv("features_labels_tobii.csv")
list_id_report = list(df_merge['id'].unique())

y_test_total = []
y_pred_l1_total = []
y_pred_l2_total = []
y_pred_svc_total = []
y_pred_tree_total = []
y_pred_bayes_total = []
y_pred_rf_total = []

## Leave-one-out machine learning methods (try Logistic Regression first)
for i in range(0, len(list_id_report)):
    id_str = list_id_report[i]
    
    # print id_str
    # print "Run: " + str(i)
    data_train = df_merge.ix[df_merge['id'] != id_str]
    # print data_train.shape
    data_test = df_merge.ix[df_merge['id'] == id_str]
    # print data_test.shape
    
    X_train = data_train.ix[:, 10:].fillna(value=0)
    y_train = data_train.ix[:, 4]
    
#     sm = SMOTE(random_state=48)
#     X_train, y_train = sm.fit_sample(X_train, y_train)
    
    X_test = data_test.ix[:, 10:].fillna(value=0)
    y_test = data_test.ix[:, 4]
    
    y_test_total.extend(y_test)
    
    ## Logistic Regression
    clf_l1_LR = LogisticRegression(penalty='l1', tol=0.01)
    clf_l1_LR = clf_l1_LR.fit(X_train, y_train)
    y_pred_l1 = clf_l1_LR.predict(X_test)
    y_pred_l1_total.extend(y_pred_l1)
    
    clf_l2_LR = LogisticRegression(penalty='l2', tol=0.01)
    clf_l2_LR = clf_l2_LR.fit(X_train, y_train)
    y_pred_l2 = clf_l2_LR.predict(X_test)
    y_pred_l2_total.extend(y_pred_l2)
        
    ## SVM with unbalanced class weight
    clf_SVC = SVC(tol=0.01)
    clf_SVC = clf_SVC.fit(X_train, y_train)
    y_pred_svc = clf_SVC.predict(X_test)
    y_pred_svc_total.extend(y_pred_svc)
    
    ## Decision tree
    clf_tree = DecisionTreeClassifier()
    clf_tree = clf_tree.fit(X_train, y_train)
    y_pred_tree = clf_tree.predict(X_test)
    y_pred_tree_total.extend(y_pred_tree)
    
    
    ## GaussianNB
    clf_GaussianNB = GaussianNB()
    clf_GaussianNB = clf_GaussianNB.fit(X_train, y_train)
    y_pred_bayes = clf_GaussianNB.predict(X_test)
    y_pred_bayes_total.extend(y_pred_bayes)
    
#     ## Random Forest
#     clf_rf = RandomForestClassifier()
#     clf_rf = clf_rf.fit(X_train, y_train)
#     y_pred_rf = clf_rf.predict(X_test)
#     y_pred_rf_total.extend(y_pred_rf)
    

print "L1 Logistic Regression"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_l1_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_l1_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_l1_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_l1_total))

print "L2 Logistic Regression"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_l2_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_l2_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_l2_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_l2_total))

print "SVC"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_svc_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_svc_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_svc_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_svc_total))

print "Decision Tree"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_tree_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_tree_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_tree_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_tree_total))

print "Gaussian Naive Bayes"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_bayes_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_bayes_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_bayes_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_bayes_total))


L1 Logistic Regression
	Precision: 0.370
	Recall: 0.172
	F1: 0.235
	Accuracy: 0.675

L2 Logistic Regression
	Precision: 0.300
	Recall: 0.155
	F1: 0.205
	Accuracy: 0.650

SVC
	Precision: 0.000
	Recall: 0.000
	F1: 0.000
	Accuracy: 0.710

Decision Tree
	Precision: 0.324
	Recall: 0.379
	F1: 0.349
	Accuracy: 0.590

Gaussian Naive Bayes
	Precision: 0.294
	Recall: 0.345
	F1: 0.317
	Accuracy: 0.570



## Prediction With Local Features

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE

from sklearn.metrics import (precision_score, recall_score, f1_score, accuracy_score)
## Read data from features_labels_tobii.csv

# df_merge = pd.read_csv("features_labels_tobii.csv")
df_merge = pd.read_csv("features_labels_tobii.csv")
list_id_report = list(df_merge['id'].unique())

y_test_total = []
y_pred_l1_total = []
y_pred_l2_total = []
y_pred_svc_total = []
y_pred_tree_total = []
y_pred_bayes_total = []
y_pred_rf_total = []

## Leave-one-out machine learning methods (try Logistic Regression first)
for i in range(0, len(list_id_report)):
    id_str = list_id_report[i]
    
    # print id_str
    # print "Run: " + str(i)
    data_train = df_merge.ix[df_merge['id'] != id_str]
    # print data_train.shape
    data_test = df_merge.ix[df_merge['id'] == id_str]
    # print data_test.shape
    feature_index_local = [10,11,12,13,14,15,16,17,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41]
    X_train = data_train.ix[:, feature_index_local].fillna(value=0)
    y_train = data_train.ix[:, 4]
    
#     sm = SMOTE(random_state=48)
#     X_train, y_train = sm.fit_sample(X_train, y_train)
    
    X_test = data_test.ix[:, feature_index_local].fillna(value=0)
    y_test = data_test.ix[:, 4]
    
    y_test_total.extend(y_test)
    
    ## Logistic Regression
    # for i, C in enumerate((100, 1, 0.01)):
    # turn down tolerance for short training time
    clf_l1_LR = LogisticRegression(penalty='l1', tol=0.01)
    clf_l1_LR = clf_l1_LR.fit(X_train, y_train)
    y_pred_l1 = clf_l1_LR.predict(X_test)
    y_pred_l1_total.extend(y_pred_l1)
    
    clf_l2_LR = LogisticRegression(penalty='l2', tol=0.01)
    clf_l2_LR = clf_l2_LR.fit(X_train, y_train)
    y_pred_l2 = clf_l2_LR.predict(X_test)
    y_pred_l2_total.extend(y_pred_l2)
        
    ## SVM with unbalanced class weight
    clf_SVC = SVC(class_weight = {0:1, 1:3})
    clf_SVC = clf_SVC.fit(X_train, y_train)
    y_pred_svc = clf_SVC.predict(X_test)
    y_pred_svc_total.extend(y_pred_svc)
    
    ## Decision tree
    clf_tree = DecisionTreeClassifier()
    clf_tree = clf_tree.fit(X_train, y_train)
    y_pred_tree = clf_tree.predict(X_test)
    y_pred_tree_total.extend(y_pred_tree)
    
    
    ## GaussianNB
    clf_GaussianNB = GaussianNB()
    clf_GaussianNB = clf_GaussianNB.fit(X_train, y_train)
    y_pred_bayes = clf_GaussianNB.predict(X_test)
    y_pred_bayes_total.extend(y_pred_bayes)
    

print "L1 Logistic Regression"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_l1_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_l1_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_l1_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_l1_total))

print "L2 Logistic Regression"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_l2_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_l2_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_l2_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_l2_total))

print "SVC"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_svc_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_svc_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_svc_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_svc_total))

print "Decision Tree"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_tree_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_tree_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_tree_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_tree_total))

print "Gaussian Naive Bayes"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_bayes_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_bayes_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_bayes_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_bayes_total))

L1 Logistic Regression
	Precision: 0.143
	Recall: 0.017
	F1: 0.031
	Accuracy: 0.685

L2 Logistic Regression
	Precision: 0.000
	Recall: 0.000
	F1: 0.000
	Accuracy: 0.710

SVC
	Precision: 0.000
	Recall: 0.000
	F1: 0.000
	Accuracy: 0.710

Decision Tree
	Precision: 0.355
	Recall: 0.379
	F1: 0.367
	Accuracy: 0.620

Gaussian Naive Bayes
	Precision: 0.448
	Recall: 0.224
	F1: 0.299
	Accuracy: 0.695



## Prediction With Global Features

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE

from sklearn.metrics import (precision_score, recall_score, f1_score, accuracy_score)
## Read data from features_labels_tobii.csv

# df_merge = pd.read_csv("features_labels_tobii.csv")
df_merge = pd.read_csv("features_labels_tobii.csv")
list_id_report = list(df_merge['id'].unique())

y_test_total = []
y_pred_l1_total = []
y_pred_l2_total = []
y_pred_svc_total = []
y_pred_tree_total = []
y_pred_bayes_total = []
y_pred_rf_total = []

## Leave-one-out machine learning methods (try Logistic Regression first)
for i in range(0, len(list_id_report)):
    id_str = list_id_report[i]
    
    # print id_str
    # print "Run: " + str(i)
    data_train = df_merge.ix[df_merge['id'] != id_str]
    # print data_train.shape
    data_test = df_merge.ix[df_merge['id'] == id_str]
    # print data_test.shape
    feature_index_global = [18,19,20,21,22,23,24,25,26,
                            42,43,44,45,46,47,48,49,50,
                            51,52,53,54,55,56,57,58,59,
                            60,61,62,63,64,65,66,67]
    X_train = data_train.ix[:, feature_index_global].fillna(value=0)
    y_train = data_train.ix[:, 4]
    
#     sm = SMOTE(random_state=48)
#     X_train, y_train = sm.fit_sample(X_train, y_train)
    
    X_test = data_test.ix[:, feature_index_global].fillna(value=0)
    y_test = data_test.ix[:, 4]
    
    y_test_total.extend(y_test)
    
    ## Logistic Regression
    clf_l1_LR = LogisticRegression(penalty='l1', tol=0.01)
    clf_l1_LR = clf_l1_LR.fit(X_train, y_train)
    y_pred_l1 = clf_l1_LR.predict(X_test)
    y_pred_l1_total.extend(y_pred_l1)
    
    clf_l2_LR = LogisticRegression(penalty='l2', tol=0.01)
    clf_l2_LR = clf_l2_LR.fit(X_train, y_train)
    y_pred_l2 = clf_l2_LR.predict(X_test)
    y_pred_l2_total.extend(y_pred_l2)
        
    ## SVM with unbalanced class weight
    clf_SVC = SVC(class_weight = {0:1, 1:3})
    clf_SVC = clf_SVC.fit(X_train, y_train)
    y_pred_svc = clf_SVC.predict(X_test)
    y_pred_svc_total.extend(y_pred_svc)
    
    ## Decision tree
    clf_tree = DecisionTreeClassifier()
    clf_tree = clf_tree.fit(X_train, y_train)
    y_pred_tree = clf_tree.predict(X_test)
    y_pred_tree_total.extend(y_pred_tree)
    
    
    ## GaussianNB
    clf_GaussianNB = GaussianNB()
    clf_GaussianNB = clf_GaussianNB.fit(X_train, y_train)
    y_pred_bayes = clf_GaussianNB.predict(X_test)
    y_pred_bayes_total.extend(y_pred_bayes)
    
#     ## Random Forest
#     clf_rf = RandomForestClassifier()
#     clf_rf = clf_rf.fit(X_train, y_train)
#     y_pred_rf = clf_rf.predict(X_test)
#     y_pred_rf_total.extend(y_pred_rf)
    

print "L1 Logistic Regression"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_l1_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_l1_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_l1_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_l1_total))

print "L2 Logistic Regression"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_l2_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_l2_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_l2_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_l2_total))

print "SVC"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_svc_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_svc_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_svc_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_svc_total))

print "Decision Tree"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_tree_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_tree_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_tree_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_tree_total))

print "Gaussian Naive Bayes"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_bayes_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_bayes_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_bayes_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_bayes_total))

L1 Logistic Regression
	Precision: 0.419
	Recall: 0.224
	F1: 0.292
	Accuracy: 0.685

L2 Logistic Regression
	Precision: 0.300
	Recall: 0.155
	F1: 0.205
	Accuracy: 0.650

SVC
	Precision: 0.000
	Recall: 0.000
	F1: 0.000
	Accuracy: 0.710

Decision Tree
	Precision: 0.218
	Recall: 0.207
	F1: 0.212
	Accuracy: 0.555

Gaussian Naive Bayes
	Precision: 0.309
	Recall: 0.362
	F1: 0.333
	Accuracy: 0.580



## 1.1 Whether or not predictions can be made equally well for all participants?

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE

from sklearn.metrics import (precision_score, recall_score, f1_score, accuracy_score)
## Read data from features_labels_tobii.csv

# df_merge = pd.read_csv("features_labels_tobii.csv")
df_merge = pd.read_csv("features_labels_tobii.csv")
list_id_report = list(df_merge['id'].unique())

y_test_total = []
y_pred_l2_total = []

list_pre = []
list_rec = []
list_f1 = []

## Leave-one-out machine learning methods (try Logistic Regression first)
for i in range(0, len(list_id_report)):
    id_str = list_id_report[i]
    
    print id_str
    print "Run: " + str(i)
    data_train = df_merge.ix[df_merge['id'] != id_str]
    # print data_train.shape
    data_test = df_merge.ix[df_merge['id'] == id_str]
    # print data_test.shape
    feature_index_global = [18,19,20,21,22,23,24,25,26,
                            42,43,44,45,46,47,48,49,50,
                            51,52,53,54,55,56,57,58,59,
                            60,61,62,63,64,65,66,67]
    X_train = data_train.ix[:, feature_index_global].fillna(value=0)
    y_train = data_train.ix[:, 4]
    
    sm = SMOTE(random_state=48)
    X_train, y_train = sm.fit_sample(X_train, y_train)
    
    X_test = data_test.ix[:, feature_index_global].fillna(value=0)
    y_test = data_test.ix[:, 4]
    
    y_test_total.extend(y_test)
    
    ## Logistic Regression
    
    clf_l2_LR = LogisticRegression(penalty='l2', tol=0.01)
    clf_l2_LR = clf_l2_LR.fit(X_train, y_train)
    y_pred_l2 = clf_l2_LR.predict(X_test)
    y_pred_l2_total.extend(y_pred_l2)
    
    print("\tPrecision: %1.3f" % precision_score(y_test, y_pred_l2))
    print("\tRecall: %1.3f" % recall_score(y_test, y_pred_l2))
    print("\tF1: %1.3f" % f1_score(y_test, y_pred_l2))
    print("\tAccuracy: %1.3f\n" % accuracy_score(y_test, y_pred_l2))
    
    list_pre.append(precision_score(y_test, y_pred_l2))
    list_rec.append(recall_score(y_test, y_pred_l2))
    list_f1.append(f1_score(y_test, y_pred_l2))

print "Final Results: L2 Logistic Regression"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_l2_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_l2_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_l2_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_l2_total))
print("Precision")
print np.max(list_pre)
print np.min(list_pre)
print np.mean(list_pre)
print np.median(list_pre)
print np.std(list_pre)
print("Recall")
print np.max(list_rec)
print np.min(list_rec)
print np.mean(list_rec)
print np.median(list_rec)
print np.std(list_rec)
print("F1")
print np.max(list_f1)
print np.min(list_f1)
print np.mean(list_f1)
print np.median(list_f1)
print np.std(list_f1)

Anon01
Run: 0
	Precision: 0.333
	Recall: 0.800
	F1: 0.471
	Accuracy: 0.438

Anon02
Run: 1
	Precision: 0.500
	Recall: 0.429
	F1: 0.462
	Accuracy: 0.588

Anon03
Run: 2
	Precision: 0.714
	Recall: 0.833
	F1: 0.769
	Accuracy: 0.800

Anon04
Run: 3
	Precision: 0.500
	Recall: 0.250
	F1: 0.333
	Accuracy: 0.733

Anon05
Run: 4
	Precision: 0.333
	Recall: 0.250
	F1: 0.286
	Accuracy: 0.667

Anon06
Run: 5
	Precision: 0.000
	Recall: 0.000
	F1: 0.000
	Accuracy: 0.733

Anon07
Run: 6
	Precision: 0.100
	Recall: 1.000
	F1: 0.182
	Accuracy: 0.357

Anon08
Run: 7
	Precision: 0.167
	Recall: 0.500
	F1: 0.250
	Accuracy: 0.600

Anon09
Run: 8
	Precision: 0.500
	Recall: 0.111
	F1: 0.182
	Accuracy: 0.438

Anon10
Run: 9
	Precision: 0.333
	Recall: 0.800
	F1: 0.471
	Accuracy: 0.438

Anon11
Run: 10
	Precision: 1.000
	Recall: 0.556
	F1: 0.714
	Accuracy: 0.750

Anon12
Run: 11
	Precision: 0.333
	Recall: 0.750
	F1: 0.462
	Accuracy: 0.533

Anon13
Run: 12
	Precision: 0.250
	Recall: 1.000
	F1: 0.400
	Accuracy: 0.800

Final Res

## 1.2 whether or not predictions can be made well across the entire length of the lecture videos?

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE

from sklearn.metrics import (precision_score, recall_score, f1_score, accuracy_score)
## Read data from features_labels_tobii.csv

# df_merge = pd.read_csv("features_labels_tobii.csv")
df_merge = pd.read_csv("features_labels_tobii.csv")
list_id_report = list(df_merge['id'].unique())

y_test_total = []
y_pred_l2_total = []

list_pre = []
list_rec = []
list_f1 = []

## Leave-one-out machine learning methods (try Logistic Regression first)
for i in range(0, len(list_id_report)):
    id_str = list_id_report[i]
    
    # data_train = df_merge.ix[df_merge['id'] != id_str]
    # data_train = data_train.ix[data_train['video_id'] == "Solar"]
    # data_train = data_train.ix[data_train['video_id'] != "Solar"]
    
    # data_train = df_merge
    # data_train = data_train.ix[data_train['video_id'] != "Solar"]
    
    data_train = df_merge
    data_train = data_train.ix[((data_train['video_id'] != "Solar")&
                                (data_test['endtime_video']/data_test['video_length'] < 0.5))|
                               (data_train['video_id'] == "Solar")
                              ]
    
    # print data_train.shape
    data_test = df_merge.ix[df_merge['id'] == id_str]
    data_test = data_test.ix[(data_test['video_id'] != "Solar")&(data_test['endtime_video']/data_test['video_length'] >= 0.5)]
    # print data_test.shape
    feature_index_global = [18,19,20,21,22,23,24,25,26,
                            42,43,44,45,46,47,48,49,50,
                            51,52,53,54,55,56,57,58,59,
                            60,61,62,63,64,65,66,67]
    X_train = data_train.ix[:, feature_index_global].fillna(value=0)
    y_train = data_train.ix[:, 4]
    
    sm = SMOTE(random_state=48)
    X_train, y_train = sm.fit_sample(X_train, y_train)
    
    X_test = data_test.ix[:, feature_index_global].fillna(value=0)
    y_test = data_test.ix[:, 4]
    
    y_test_total.extend(y_test)
    
    ## Logistic Regression
    
    clf_l2_LR = LogisticRegression(penalty='l2', tol=0.01)
    clf_l2_LR = clf_l2_LR.fit(X_train, y_train)
    y_pred_l2 = clf_l2_LR.predict(X_test)
    y_pred_l2_total.extend(y_pred_l2)
    
    list_pre.append(precision_score(y_test, y_pred_l2))
    list_rec.append(recall_score(y_test, y_pred_l2))
    list_f1.append(f1_score(y_test, y_pred_l2))

print "Final Results: L2 Logistic Regression"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_l2_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_l2_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_l2_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_l2_total))

Final Results: L2 Logistic Regression
	Precision: 0.526
	Recall: 0.588
	F1: 0.556
	Accuracy: 0.673



## 1.3 whether or not a model trained on one video translates to good predictions in other videos?

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE

from sklearn.metrics import (precision_score, recall_score, f1_score, accuracy_score)
## Read data from features_labels_tobii.csv

# df_merge = pd.read_csv("features_labels_tobii.csv")
df_merge = pd.read_csv("features_labels_tobii.csv")
list_id_report = list(df_merge['id'].unique())

y_test_total = []
y_pred_l2_total = []

list_pre = []
list_rec = []
list_f1 = []

## Leave-one-out machine learning methods (try Logistic Regression first)
for i in range(0, len(list_id_report)):
    id_str = list_id_report[i]
    
    # data_train = df_merge.ix[df_merge['id'] != id_str]
    # data_train = data_train.ix[data_train['video_id'] == "Solar"]
    # data_train = data_train.ix[data_train['video_id'] != "Solar"]
    
    # data_train = df_merge
    # data_train = data_train.ix[data_train['video_id'] != "Solar"]
    
    data_train = df_merge
    data_train = data_train.ix[data_train['video_id'] == "Solar"]
    
    # print data_train.shape
    data_test = df_merge.ix[df_merge['id'] == id_str]
    # data_test = data_test.ix[data_test['video_id'] == "Solar"]
    data_test = data_test.ix[data_test['video_id'] != "Solar"]
    # print data_test.shape
    feature_index_global = [18,19,20,21,22,23,24,25,26,
                            42,43,44,45,46,47,48,49,50,
                            51,52,53,54,55,56,57,58,59,
                            60,61,62,63,64,65,66,67]
    X_train = data_train.ix[:, feature_index_global].fillna(value=0)
    y_train = data_train.ix[:, 4]
    
    sm = SMOTE(random_state=48)
    X_train, y_train = sm.fit_sample(X_train, y_train)
    
    X_test = data_test.ix[:, feature_index_global].fillna(value=0)
    y_test = data_test.ix[:, 4]
    
    y_test_total.extend(y_test)
    
    ## Logistic Regression
    
    clf_l2_LR = LogisticRegression(penalty='l2', tol=0.01)
    clf_l2_LR = clf_l2_LR.fit(X_train, y_train)
    y_pred_l2 = clf_l2_LR.predict(X_test)
    y_pred_l2_total.extend(y_pred_l2)
    
    list_pre.append(precision_score(y_test, y_pred_l2))
    list_rec.append(recall_score(y_test, y_pred_l2))
    list_f1.append(f1_score(y_test, y_pred_l2))

print "Final Results: L2 Logistic Regression"
print("\tPrecision: %1.3f" % precision_score(y_test_total, y_pred_l2_total))
print("\tRecall: %1.3f" % recall_score(y_test_total, y_pred_l2_total))
print("\tF1: %1.3f" % f1_score(y_test_total, y_pred_l2_total))
print("\tAccuracy: %1.3f\n" % accuracy_score(y_test_total, y_pred_l2_total))

Final Results: L2 Logistic Regression
	Precision: 0.444
	Recall: 0.593
	F1: 0.508
	Accuracy: 0.659

