# XCALE Master thesis : Dataset Building

In this notebook, we build the DBN dataset from the interaction logs stored in the database.

<style>.table {margin-left:0px}</style>

## Import

In [None]:
# data handling
import numpy as np
import pandas as pd
from pylab import rcParams
rcParams['figure.figsize'] = 18, 10

In [None]:
# Database
#!sudo apt-get install python3-dev default-libmysqlclient-dev
!pip install mysql-connector-python
from mysql.connector import connect
# error
import traceback
from collections import defaultdict
from datetime import time, timedelta, datetime
# utilitaries
import functools

Collecting mysql-connector-python
[?25l  Downloading https://files.pythonhosted.org/packages/ef/5b/a7dc32e711e4a065896188afef6864489ccf4bdab0928581c4262e84110d/mysql_connector_python-8.0.25-cp37-cp37m-manylinux1_x86_64.whl (25.4MB)
[K     |████████████████████████████████| 25.4MB 124kB/s 
Installing collected packages: mysql-connector-python
Successfully installed mysql-connector-python-8.0.25


## Database

In [None]:
# Log
_db_config = {
    'user': 'cajuge',
    'password': 'Kz1773qMWIVhRZUZ',
    'host': 'franceioi.cinniket56wn.eu-central-1.rds.amazonaws.com',
    'database': 'srl',
    'port':'3306'
}

In [None]:
_tables = ["clavier", "modification",
          "navigation", "pas_a_pas",
          "souris","srl_final_prompt",
          "srl_initial_prompt","srl_prompt","validation", "connexion"]

In [None]:
def get_connection(config):
    """Open a database connection, passing `config` as keyword arguments to `connect`."""
    connection = connect(**config)
    return connection
def close_connection(connection):
    """Close `connection` by calling its `close()` method; returns None."""
    closer = connection.close
    closer()

In [None]:
def _fetch_tables(tables, build_query, params):
    """Run one SELECT per table on a single connection.

    Parameters
    ----------
    tables : iterable of str
        Table names. They are interpolated into the SQL text (identifiers
        cannot be bound as parameters), so they must come from trusted code.
    build_query : callable(table, filter_column) -> str
        Returns the SQL text for one table; values are bound through ``params``.
    params : tuple
        Values bound to the ``%s`` placeholders of every query.

    Returns
    -------
    dict
        ``{table: DataFrame}`` for every table whose query succeeded. A failing
        query prints its traceback and the table is left out of the result.
    """
    dataframe_tables = {}
    mysql_connection = get_connection(_db_config)
    try:
        for table in tables:
            with mysql_connection.cursor() as cursor:
                try:
                    # `connexion` is filtered on its own `id`; every other
                    # table is filtered on `id_connexion`.
                    filter_column = "id" if table == "connexion" else "id_connexion"
                    cursor.execute(build_query(table, filter_column), params)
                    rows = cursor.fetchall()
                    column_names = [column[0] for column in cursor.description if column[0]]
                    # Rows go to pandas as-is: wrapping each row in np.array
                    # would coerce a mixed int/str row to strings.
                    dataframe_tables[table] = pd.DataFrame(rows, columns=column_names)
                except Exception:
                    traceback.print_exc()
    finally:
        # Always release the connection, including when a cursor cannot be opened.
        close_connection(mysql_connection)
    return dataframe_tables


def getTablesFromDB(min_id_connection, _tables):
    """Load, for every table, the rows whose connection id is >= `min_id_connection`.

    Rows are ordered by connection id, then by `timestamp`.
    Returns a dict mapping each successfully loaded table name to a DataFrame.
    """
    def build_query(table, filter_column):
        return "SELECT * FROM {} WHERE {} >= %s ORDER BY {}, timestamp".format(table, filter_column, filter_column)

    return _fetch_tables(_tables, build_query, (min_id_connection,))


def getTablesFromDB_ids(ids, _tables):
    """Load, for every table, the rows belonging to the connection ids in `ids`.

    Rows are ordered by connection id, then by `timestamp`.
    Returns a dict mapping each successfully loaded table name to a DataFrame;
    an empty `ids` yields an empty dict without touching the database
    (``IN ()`` is not valid SQL).
    """
    ids = tuple(ids)
    if not ids:
        return {}
    format_strings = ','.join(['%s'] * len(ids))

    def build_query(table, filter_column):
        return "SELECT * FROM {} WHERE {} IN ({}) ORDER BY {}, timestamp".format(table, filter_column, format_strings, filter_column)

    return _fetch_tables(_tables, build_query, ids)

In [None]:
# First pass: load every table for connections whose id is at least 191.
dataframe_tables = getTablesFromDB(min_id_connection=191, _tables=_tables)

In [None]:
# Keep the connections that (a) have recorded activity at least one hour after
# the connection timestamp and (b) whose first non-experimentation validation
# targets one of the studied sujets.
sujet_range = list(range(20, 29))
connections = dataframe_tables["connexion"]
event_tables = [name for name in _tables if name != "connexion"]
one_hour = timedelta(hours=1)

nb_connection_1h = []
for row_label, id_connexion in enumerate(connections.id):
  # Latest event of this connection in each activity table that has any.
  last_events = []
  for name in event_tables:
    frame = dataframe_tables[name]
    stamps = frame[frame.id_connexion == id_connexion].timestamp
    if len(stamps) != 0:
      last_events.append(max(stamps))
  if len(last_events) == 0:
    continue
  if not (max(last_events) - connections.timestamp[row_label] >= one_hour):
    continue
  validated_sujets = list(
      dataframe_tables["validation"]
      .query("id_connexion == {} and experimentation == 0".format(id_connexion))
      .id_sujet
  )
  if len(validated_sujets) != 0 and validated_sujets[0] in sujet_range:
    nb_connection_1h.append(id_connexion)

print("{} registered connections which last at least 1 hour".format(len(nb_connection_1h)))

91 registered connections which last at least 1 hour


In [None]:
# Second pass: reload every table restricted to the retained connections.
dataframe_tables = getTablesFromDB_ids(ids=nb_connection_1h, _tables=_tables)

## Building V.1

<img src='https://drive.google.com/uc?id=1tjCCTLjmZA4g2kHQitec5FeciDanQzT7' />

### Beginning To Middle Prompt or 30 Minutes

In [None]:
# One (initially empty) feature row per retained connection, plus the list
# that will receive the matching column names.
data = [list() for _ in nb_connection_1h]
columns = list()

In [None]:
# Shallow copy of the table dict: the DataFrames themselves are shared with dataframe_tables.
df = dict(dataframe_tables)

#### Prompts

In [None]:
def _append_first_prompt_value(prompt_table, key_map, column, nb_connection_1h, data, replace):
  """Append to each row of `data` the first `column` value of the matching connection.

  `prompt_table` is filtered to the connections in `nb_connection_1h`. When
  `replace` is true, the values of `column` are recoded through `key_map`.
  For the i-th connection id, the first matching value is appended to
  `data[i]`, or `np.nan` when the connection has no row.
  `data` is mutated in place and also returned.
  """
  answers = prompt_table[prompt_table["id_connexion"].isin(nb_connection_1h)]
  if replace:
    answers = answers.replace({column: key_map})
  for i, id_connexion in enumerate(nb_connection_1h):
    own_answers = answers[answers["id_connexion"] == id_connexion]
    if own_answers.shape[0] == 0:
      data[i].append(np.nan)
    else:
      data[i].append(list(own_answers[column])[0])
  return data


def remap_prompt_initial(df, key_map, column, nb_connection_1h, data, replace=True):
  """Append the (optionally recoded) `column` answer of `df["srl_initial_prompt"]` to each row of `data`.

  See `_append_first_prompt_value` for the exact contract; `data` is returned.
  """
  return _append_first_prompt_value(df["srl_initial_prompt"], key_map, column, nb_connection_1h, data, replace)


def remap_prompt(df, key_map, column, nb_connection_1h, data, replace=True):
  """Append the (optionally recoded) `column` answer of `df["srl_prompt"]` to each row of `data`.

  See `_append_first_prompt_value` for the exact contract; `data` is returned.
  """
  return _append_first_prompt_value(df["srl_prompt"], key_map, column, nb_connection_1h, data, replace)

##### quickpi

In [None]:
# Ordinal code for the "quickpi" answer: never (0), once (1), many times (2).
key_map = {label: code for code, label in enumerate(['Jamais', 'Une fois', 'De nombreuses fois'])}
data = remap_prompt_initial(df, key_map, "quickpi", nb_connection_1h, data)
columns += ["Already_used_QuickPi"]

##### experience

In [None]:
# Ordinal code for the "experience" answer: never (0), once (1), many times (2).
key_map = {label: code for code, label in enumerate(['Jamais', 'Une fois', 'De nombreuses fois'])}
data = remap_prompt_initial(df, key_map, "experience", nb_connection_1h, data)
columns += ["Already_progammed"]

##### Home or School

In [None]:
# Recode the "lieu" answer (in class / at home) into a short label.
key_map = dict([
    ('Je suis en classe', 'school'),
    ('Je suis à la maison', 'Home'),
])
data = remap_prompt_initial(df, key_map, "lieu", nb_connection_1h, data)
columns += ["Home_or_School"]

##### External help frequency

In [None]:
# Ordinal code for the "aide" answer, from fully autonomous (0) to very
# frequently helped (4). 'Fréquemment' (frequently) must rank below
# 'Très fréquemment' (very frequently); the two codes used to be swapped.
key_map = {
    'Je suis en autonomie totale': 0,
    'Très rarement': 1,
    'De temps en temps': 2,
    'Fréquemment': 3,
    'Très fréquemment': 4,
}
data = remap_prompt_initial(df, key_map, "aide", nb_connection_1h, data)
columns.append("Extern_Help_frequence")

##### Reason

In [None]:
# The "raison" answer is kept verbatim: no recoding (replace=False), so the map stays empty.
key_map = dict()
data = remap_prompt_initial(df, key_map, "raison", nb_connection_1h, data, replace=False)
columns += ["Reason"]

##### motivation

In [None]:
# Five-level scale from "not at all" (0) to "yes, a lot" (4).
# NOTE(review): replace=False means key_map is NOT applied to "motivation";
# confirm the stored values are already numeric.
key_map = {label: code for code, label in enumerate(
    ['Pas du tout', 'Pas trop', 'Moyennement', 'Plutôt oui', 'Oui beaucoup'])}
data = remap_prompt(df, key_map, "motivation", nb_connection_1h, data, replace=False)
columns += ["Motivation"]

##### Goal Reach

In [None]:
# Five-level scale from "not at all" (0) to "yes, a lot" (4).
# NOTE(review): replace=False means key_map is NOT applied to "objectif";
# confirm the stored values are already numeric.
key_map = {label: code for code, label in enumerate(
    ['Pas du tout', 'Pas trop', 'Moyennement', 'Plutôt oui', 'Oui beaucoup'])}
data = remap_prompt(df, key_map, "objectif", nb_connection_1h, data, replace=False)
columns += ["Goal_Reach_middle"]

##### Time Management

In [None]:
# Five-level scale from "very badly" (0) to "very well" (4).
# NOTE(review): replace=False means key_map is NOT applied to "temps";
# confirm the stored values are already numeric.
key_map = {label: code for code, label in enumerate(
    ['Très mal', 'Mal', 'Ni bien, ni mal', 'Bien', 'Très bien'])}
data = remap_prompt(df, key_map, "temps", nb_connection_1h, data, replace=False)
columns += ["Time_Management_middle"]

##### Goal Type

In [None]:
# Map each goal statement to a goal category; the two learning-oriented
# statements share the "Learning" category.
key_map = dict([
    ("Je n'ai pas d'objectif", "None"),
    ("Avoir le meilleur score", "Competitive"),
    ("Avoir une bonne note pour mon cours", "Imposed"),
    ("Je veux m'amuser", "Personal"),
    ("Progresser en programmation", "Learning"),
    ("Apprendre à programmer un objet électronique", "Learning"),
])
# data_1: goal from the initial prompt; data_2: "new_objectif" from srl_prompt.
# Both are collected in scratch rows and merged into `data` by a later cell.
data_1 = remap_prompt_initial(df, key_map, "objectif", nb_connection_1h, [[] for _ in nb_connection_1h])
data_2 = remap_prompt(df, key_map, "new_objectif", nb_connection_1h, [[] for _ in nb_connection_1h])
columns += ["Goal_Type"]

In [None]:
def find_objectif(data_1, data_2, data):
  """Append one goal per row to `data` and return `data`.

  For row i the goal is `data_1[i][0]`, unless that value is the string
  "None", in which case `data_2[i][0]` is used instead.
  """
  for position, initial in enumerate(data_1):
    chosen = data_2[position][0] if initial[0] == "None" else initial[0]
    data[position].append(chosen)
  return data

In [None]:
# Merge the initial goal and the updated goal into the feature rows.
data = find_objectif(data_1=data_1, data_2=data_2, data=data)

#### temp

In [None]:
# Preview of the features gathered so far, indexed by connection id.
new_df = pd.DataFrame(data=data, index=nb_connection_1h, columns=columns)
new_df

Unnamed: 0,Already_used_QuickPi,Already_progammed,Home_or_School,Extern_Help_frequence,Reason,Motivation,Goal_Reach_middle,Time_Management_middle,Goal_Type
191,,,,,,,,,
225,,,,,,,,,
234,,,,,,,,,
240,,,,,,,,,
242,,,,,,,,,
...,...,...,...,...,...,...,...,...,...
3218,,,,,,,,,
3269,,,,,,,,,
3357,,,,,,,,,
3987,,,,,,,,,


#### Dynamic Traces

In [None]:
def get_important_timestamps(ids_connexion, df_prompt_middle, df_prompt_final, df_connexion):
  """Return the (middle, final) cut-off datetimes of every connection.

  For each id in `ids_connexion`:
    * no middle prompt: middle = connection timestamp + 30 minutes and
      final = connection timestamp + 1 hour;
    * middle prompt present: middle = timestamp of its first row, and
      final = timestamp of the first final-prompt row, or middle + 30 minutes
      when the connection has no final prompt.

  Returns two lists of `datetime.datetime`, aligned with `ids_connexion`.
  """
  timestamps_middle = []
  timestamps_final = []
  for id_connexion in ids_connexion:
    middle_rows = df_prompt_middle.query('id_connexion == {}'.format(id_connexion))
    if middle_rows.shape[0] == 0:
      # The user did not answer the middle prompt: fall back to fixed offsets
      # from the start of the connection.
      start = list(df_connexion.query('id == {}'.format(id_connexion)).timestamp)[0]
      timestamps_middle.append(start + timedelta(minutes=30))
      timestamps_final.append(start + timedelta(hours=1))
      continue

    middle = list(middle_rows.timestamp)[0]
    timestamps_middle.append(middle)
    final_rows = df_prompt_final.query('id_connexion == {}'.format(id_connexion))
    if final_rows.shape[0] == 0:
      # The user did not answer the final prompt.
      timestamps_final.append(middle + timedelta(minutes=30))
    else:
      # This used to call the list instead of appending to it (TypeError).
      timestamps_final.append(list(final_rows.timestamp)[0])
  # pd.Timestamp(...) also accepts plain datetime objects, which have no to_pydatetime().
  return ([pd.Timestamp(stamp).to_pydatetime() for stamp in timestamps_middle],
          [pd.Timestamp(stamp).to_pydatetime() for stamp in timestamps_final])

In [None]:
# Cut-off datetimes of the two observation windows, one pair per retained connection.
in_cohort = "id_connexion in {}".format(nb_connection_1h)
middle_timestamp, end_timestamp = get_important_timestamps(
    nb_connection_1h,
    df["srl_prompt"].query(in_cohort),
    df["srl_final_prompt"].query(in_cohort),
    df["connexion"].query("id in {}".format(nb_connection_1h)),
)

##### Help Module Explored

In [None]:
# Flag (0/1): did the connection open the 'Aide' module at or before its middle cut-off?
help_visits = df["navigation"].query("id_connexion in {} and module == 'Aide'".format(nb_connection_1h))
for position, connexion_id in enumerate(nb_connection_1h):
  visit_times = list(help_visits.query("id_connexion == {}".format(connexion_id)).timestamp)
  opened_help = any(stamp <= middle_timestamp[position] for stamp in visit_times)
  data[position].append(1 if opened_help else 0)
columns += ["Help_Module_explored_middle"]

##### Sujet Explored

In [None]:
# Number of distinct sujets (within sujet_range) visited at or before the middle cut-off.
sujet_visits = df["navigation"].query("id_connexion in {} and id_sujet in {}".format(nb_connection_1h, sujet_range))
for position, connexion_id in enumerate(nb_connection_1h):
  own_visits = sujet_visits.query("id_connexion == {}".format(connexion_id))
  explored_count = 0
  for sujet in sujet_range:
    visit_times = list(own_visits.query("id_sujet == {}".format(sujet)).timestamp)
    if any(stamp <= middle_timestamp[position] for stamp in visit_times):
      explored_count += 1
  data[position].append(explored_count)
columns += ["Sujet_Explored_middle"]

##### Mouse hover zone

In [None]:
# Flag (0/1) per zone: did the mouse hover the zone at or before the middle cut-off?
# The loop variable used to be overwritten with 0 before the query, so the
# filter was always `zone == 0` and every flag came out as 0.
# NOTE(review): assumes souris.zone stores the zone names listed below - confirm.
zones = ["editor", "grid", "task", "controls"]
mouse_events = df["souris"].query("id_connexion in {}".format(nb_connection_1h))
for position, connexion_id in enumerate(nb_connection_1h):
  own_events = mouse_events.query("id_connexion == {}".format(connexion_id))
  for zone in zones:
    hover_times = own_events[own_events["zone"] == zone].timestamp
    hovered = any(stamp <= middle_timestamp[position] for stamp in hover_times)
    data[position].append(1 if hovered else 0)
for zone in zones:
  columns.append("Mouse_Hover_{}_middle".format(zone))

#### Data linked to sujet/version

In [None]:
def _count_events(frame, upper, lower=None):
  """Count rows of `frame` with lower < timestamp <= upper (no lower bound when `lower` is None)."""
  return sum(1 for stamp in frame.timestamp
             if stamp <= upper and (lower is None or stamp > lower))


def _time_until_leaving(nav, entry_modules, stay_modules, sujet, version, window_end):
  """Total time attributed to (sujet, version) in the navigation rows `nav`.

  Every row whose module is in `entry_modules` for this sujet/version adds the
  delay until the first later row that is NOT (module in `stay_modules`, same
  sujet, same version), or until `window_end` when no such row exists.
  """
  modules = list(nav.module)
  sujets = list(nav.id_sujet)
  versions = list(nav.version)
  stamps = list(pd.to_datetime(nav.timestamp))
  total = timedelta(seconds=0)
  for j in range(len(stamps)):
    if not (modules[j] in entry_modules and sujets[j] == sujet and versions[j] == version):
      continue
    leave = window_end
    for k in range(j + 1, len(stamps)):
      if not (modules[k] in stay_modules and sujets[k] == sujet and versions[k] == version):
        leave = stamps[k]
        break
    total += leave - stamps[j]
  return total


# Per (sujet, version) features over the window ending at the middle cut-off.
# Append order per version: validated flag, validation / experimentation /
# modification / step-by-step / help counts, time in help, time on the
# exercise; then one completeness ratio per sujet.
for i, id_connexion in enumerate(nb_connection_1h):
  limit = middle_timestamp[i]
  # Slices of this connection, computed once instead of once per (sujet, version).
  own_filter = "id_connexion == {}".format(id_connexion)
  validation = df["validation"].query(own_filter)
  modification = df["modification"].query(own_filter)
  pas_a_pas = df["pas_a_pas"].query(own_filter)
  navigation = df["navigation"].query(own_filter)
  # Navigation rows at or before the cut-off, used by both time features.
  nav_stamps = list(pd.to_datetime(navigation.timestamp))
  nav_window = navigation.iloc[[j for j, stamp in enumerate(nav_stamps) if stamp <= limit], :]

  for sujet in sujet_range:
    completness = 0
    for version in [2,3,4]:
      task = "id_sujet == {} and version == {}".format(sujet, version)
      attempts = validation.query(task)
      submissions = attempts.query("experimentation == 0")
      # Validated when at least one real submission scored above 0 in the window.
      validated = 1 if _count_events(submissions.query("score > 0"), limit) > 0 else 0
      completness = max(validated*(version-1), completness)
      data[i].extend([
          validated,
          _count_events(submissions, limit),
          _count_events(attempts.query("experimentation == 1"), limit),
          _count_events(modification.query(task), limit),
          _count_events(pas_a_pas.query(task), limit),
          _count_events(navigation.query(task + " and module == 'Aide'"), limit),
          _time_until_leaving(nav_window, ("Aide",), ("Aide",), sujet, version, limit),
          _time_until_leaving(nav_window, ("Exercice",), ("Exercice", "Aide"), sujet, version, limit),
      ])
    data[i].append(completness / 3)

# Column names, in the same order as the values appended above.
metric_templates = [
    "Sujet_{}_Version_{}_middle",
    "nb_validation_{}_{}_middle",
    "nb_experimentation_{}_{}_middle",
    "nb_modification_{}_{}_middle",
    "nb_pasapas_{}_{}_middle",
    "nb_help_{}_{}_middle",
    "time_spent_help_{}_{}_middle",
    "time_spent_{}_{}_middle",
]
for sujet in sujet_range:
  for version in [2,3,4]:
    columns.extend(template.format(sujet, version) for template in metric_templates)
  columns.append("completness_{}_middle".format(sujet))

In [None]:
# Preview of the features gathered so far, indexed by connection id.
new_df = pd.DataFrame(data=data, index=nb_connection_1h, columns=columns)
new_df

Unnamed: 0,Already_used_QuickPi,Already_progammed,Home_or_School,Extern_Help_frequence,Reason,Motivation,Goal_Reach_middle,Time_Management_middle,Goal_Type,Help_Module_explored_middle,Sujet_Explored_middle,Mouse_Hover_editor_middle,Mouse_Hover_grid_middle,Mouse_Hover_task_middle,Mouse_Hover_controls_middle,Sujet_20_Version_2_middle,nb_validation_20_2_middle,nb_experimentation_20_2_middle,nb_modification_20_2_middle,nb_pasapas_20_2_middle,nb_help_20_2_middle,time_spent_help_20_2_middle,time_spent_20_2_middle,Sujet_20_Version_3_middle,nb_validation_20_3_middle,nb_experimentation_20_3_middle,nb_modification_20_3_middle,nb_pasapas_20_3_middle,nb_help_20_3_middle,time_spent_help_20_3_middle,time_spent_20_3_middle,Sujet_20_Version_4_middle,nb_validation_20_4_middle,nb_experimentation_20_4_middle,nb_modification_20_4_middle,nb_pasapas_20_4_middle,nb_help_20_4_middle,time_spent_help_20_4_middle,time_spent_20_4_middle,completness_20_middle,...,nb_experimentation_27_3_middle,nb_modification_27_3_middle,nb_pasapas_27_3_middle,nb_help_27_3_middle,time_spent_help_27_3_middle,time_spent_27_3_middle,Sujet_27_Version_4_middle,nb_validation_27_4_middle,nb_experimentation_27_4_middle,nb_modification_27_4_middle,nb_pasapas_27_4_middle,nb_help_27_4_middle,time_spent_help_27_4_middle,time_spent_27_4_middle,completness_27_middle,Sujet_28_Version_2_middle,nb_validation_28_2_middle,nb_experimentation_28_2_middle,nb_modification_28_2_middle,nb_pasapas_28_2_middle,nb_help_28_2_middle,time_spent_help_28_2_middle,time_spent_28_2_middle,Sujet_28_Version_3_middle,nb_validation_28_3_middle,nb_experimentation_28_3_middle,nb_modification_28_3_middle,nb_pasapas_28_3_middle,nb_help_28_3_middle,time_spent_help_28_3_middle,time_spent_28_3_middle,Sujet_28_Version_4_middle,nb_validation_28_4_middle,nb_experimentation_28_4_middle,nb_modification_28_4_middle,nb_pasapas_28_4_middle,nb_help_28_4_middle,time_spent_help_28_4_middle,time_spent_28_4_middle,completness_28_middle
191,,,,,,,,,,0,4,0,0,0,0,0,0,0,4,0,0,0 days 00:00:00,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,4,0,0,0 days,0 days 00:00:02,0.000000,...,0,0,0,0,0 days,0 days,0,0,0,0,0,0,0 days,0 days 00:00:00,0.000000,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.000000
225,,,,,,,,,,1,5,0,0,0,0,0,0,6,27,7,3,0 days 00:01:00,0 days 00:07:05,0,0,4,21,4,0,0 days,0 days 00:02:20,0,0,6,67,6,0,0 days,0 days 00:05:13,0.000000,...,0,0,0,0,0 days,0 days,0,0,0,0,0,0,0 days,0 days 00:00:00,0.000000,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.000000
234,,,,,,,,,,1,8,0,0,0,0,0,0,6,26,6,3,0 days 00:00:45,0 days 00:03:04,1,2,1,20,3,0,0 days,0 days 00:01:55,1,1,0,53,1,0,0 days,0 days 00:02:17,1.000000,...,0,0,0,0,0 days,0 days,0,0,0,0,0,0,0 days,0 days 00:00:02,0.333333,1,1,1,17,2,0,0 days,0 days 00:01:09,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:02,0.333333
240,,,,,,,,,,1,3,0,0,0,0,1,1,4,17,5,1,0 days 00:00:10,0 days 00:01:38,1,1,3,17,4,0,0 days,0 days 00:01:39,1,7,1,104,11,0,0 days,0 days 00:05:59,1.000000,...,0,0,0,0,0 days,0 days,0,0,0,0,0,0,0 days,0 days 00:00:00,0.000000,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.000000
242,,,,,,,,,,0,2,0,0,0,0,0,0,0,0,0,0,0 days 00:00:00,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.000000,...,0,0,0,0,0 days,0 days,0,0,0,0,0,0,0 days,0 days 00:00:00,0.000000,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3218,,,,,,,,,,0,8,0,0,0,0,1,1,1,6,2,0,0 days 00:00:00,0 days 00:00:18,1,1,0,12,1,0,0 days,0 days 00:00:40,1,1,0,60,1,0,0 days,0 days 00:03:07,1.000000,...,0,0,0,0,0 days,0 days,0,0,0,0,0,0,0 days,0 days 00:00:03,0.333333,1,1,0,12,1,0,0 days,0 days 00:01:29,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:05,0.333333
3269,,,,,,,,,,0,7,0,0,0,0,1,1,4,22,5,0,0 days 00:00:00,0 days 00:02:01,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:04,0.333333,...,0,0,0,0,0 days,0 days,0,0,0,0,0,0,0 days,0 days 00:00:02,0.333333,1,8,0,144,8,0,0 days,0 days 00:10:32,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:26,0.333333
3357,,,,,,,,,,0,4,0,0,0,0,0,0,0,0,0,0,0 days 00:00:00,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.000000,...,0,0,0,0,0 days,0 days,0,0,0,0,0,0,0 days,0 days 00:00:00,0.000000,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.000000
3987,,,,,,,,,,0,7,0,0,0,0,1,1,2,10,3,0,0 days 00:00:00,0 days 00:00:26,0,0,0,3,0,0,0 days,0 days 00:00:23,0,0,0,0,0,0,0 days,0 days 00:00:02,0.333333,...,0,0,0,0,0 days,0 days,0,0,0,0,0,0,0 days,0 days 00:00:02,0.333333,1,1,0,16,1,0,0 days,0 days 00:00:48,1,21,16,249,37,0,0 days,0 days 00:14:06,0,0,0,0,0,0,0 days,0 days 00:00:03,0.666667


### From middle prompt to final prompt or 1 hour

#### Prompts

In [None]:
def remap_prompt_final(df, key_map, column, nb_connection_1h, data, replace=True):
  """Append the `column` answer of `df["srl_final_prompt"]` to each row of `data`.

  The table is restricted to the connections in `nb_connection_1h`; when
  `replace` is true, the values of `column` are recoded through `key_map`.
  For the k-th connection the first matching value is appended to `data[k]`,
  or `np.nan` when the connection has no row. `data` is mutated and returned.
  """
  answers = df["srl_final_prompt"].query("id_connexion in {}".format(nb_connection_1h))
  if replace:
    answers = answers.replace({column:key_map})
  for position, connexion_id in enumerate(nb_connection_1h):
    own_answers = answers.query("id_connexion == {}".format(connexion_id))
    value = np.nan if own_answers.shape[0] == 0 else list(own_answers[column])[0]
    data[position].append(value)
  return data

##### Strategie

In [None]:
# Five-level scale from "very bad" (0) to "very good" (4).
# NOTE(review): replace=False means key_map is NOT applied to "strategie";
# confirm the stored values are already numeric.
key_map = {label: code for code, label in enumerate(
    ['Très mauvaise', 'Mauvaise', 'Ni bonne, ni mauvaise', 'Bonne', 'Très bonne'])}
data = remap_prompt_final(df, key_map, "strategie", nb_connection_1h, data, replace=False)
columns += ["Strategie"]

##### Goal Reach

In [None]:
# Five-level scale from "not at all" (0) to "yes, a lot" (4).
# NOTE(review): replace=False means key_map is NOT applied to "objectif";
# confirm the stored values are already numeric.
key_map = {label: code for code, label in enumerate(
    ['Pas du tout', 'Pas trop', 'Moyennement', 'Plutôt oui', 'Oui beaucoup'])}
data = remap_prompt_final(df, key_map, "objectif", nb_connection_1h, data, replace=False)
columns += ["Goal_Reach_final"]

##### Time Management

In [None]:
# Five-level scale from "very badly" (0) to "very well" (4).
# NOTE(review): replace=False means key_map is NOT applied to "temps";
# confirm the stored values are already numeric.
key_map = {label: code for code, label in enumerate(
    ['Très mal', 'Mal', 'Ni bien, ni mal', 'Bien', 'Très bien'])}
data = remap_prompt_final(df, key_map, "temps", nb_connection_1h, data, replace=False)
columns += ["Time_Management_final"]

#### other data

##### Help module explored

In [None]:
# Flag (0/1): did the connection open the 'Aide' module after the middle
# cut-off and at or before the final cut-off?
help_visits = df["navigation"].query("id_connexion in {} and module == 'Aide'".format(nb_connection_1h))
for position, connexion_id in enumerate(nb_connection_1h):
  visit_times = list(help_visits.query("id_connexion == {}".format(connexion_id)).timestamp)
  opened_help = any(
      stamp > middle_timestamp[position] and stamp <= end_timestamp[position]
      for stamp in visit_times
  )
  data[position].append(1 if opened_help else 0)
columns += ["Help_Module_explored_final"]

##### Sujet explored

In [None]:
# Number of distinct sujets (within sujet_range) visited after the middle
# cut-off and at or before the final cut-off.
sujet_visits = df["navigation"].query("id_connexion in {} and id_sujet in {}".format(nb_connection_1h, sujet_range))
for position, connexion_id in enumerate(nb_connection_1h):
  own_visits = sujet_visits.query("id_connexion == {}".format(connexion_id))
  explored_count = 0
  for sujet in sujet_range:
    visit_times = list(own_visits.query("id_sujet == {}".format(sujet)).timestamp)
    if any(stamp > middle_timestamp[position] and stamp <= end_timestamp[position] for stamp in visit_times):
      explored_count += 1
  data[position].append(explored_count)
columns += ["Sujet_Explored_final"]

##### Mouse hover zone

In [None]:
# Flag (0/1) per zone: did the mouse hover the zone after the middle cut-off
# and at or before the final cut-off?
# The loop variable used to be overwritten with 0 before the query, so the
# filter was always `zone == 0` and every flag came out as 0.
# NOTE(review): assumes souris.zone stores the zone names listed below - confirm.
zones = ["editor", "grid", "task", "controls"]
mouse_events = df["souris"].query("id_connexion in {}".format(nb_connection_1h))
for position, connexion_id in enumerate(nb_connection_1h):
  own_events = mouse_events.query("id_connexion == {}".format(connexion_id))
  for zone in zones:
    hover_times = own_events[own_events["zone"] == zone].timestamp
    hovered = any(
        stamp > middle_timestamp[position] and stamp <= end_timestamp[position]
        for stamp in hover_times
    )
    data[position].append(1 if hovered else 0)
for zone in zones:
  columns.append("Mouse_Hover_{}_final".format(zone))

#### dynamics data

In [None]:
def _count_events(frame, upper, lower=None):
  """Count rows of `frame` with lower < timestamp <= upper (no lower bound when `lower` is None)."""
  return sum(1 for stamp in frame.timestamp
             if stamp <= upper and (lower is None or stamp > lower))


def _time_until_leaving(nav, entry_modules, stay_modules, sujet, version, window_end):
  """Total time attributed to (sujet, version) in the navigation rows `nav`.

  Every row whose module is in `entry_modules` for this sujet/version adds the
  delay until the first later row that is NOT (module in `stay_modules`, same
  sujet, same version), or until `window_end` when no such row exists.
  """
  modules = list(nav.module)
  sujets = list(nav.id_sujet)
  versions = list(nav.version)
  stamps = list(pd.to_datetime(nav.timestamp))
  total = timedelta(seconds=0)
  for j in range(len(stamps)):
    if not (modules[j] in entry_modules and sujets[j] == sujet and versions[j] == version):
      continue
    leave = window_end
    for k in range(j + 1, len(stamps)):
      if not (modules[k] in stay_modules and sujets[k] == sujet and versions[k] == version):
        leave = stamps[k]
        break
    total += leave - stamps[j]
  return total


# Per (sujet, version) features over the window (middle cut-off, final cut-off].
# Append order per version: validated flag, validation / experimentation /
# modification / step-by-step / help counts, time in help, time on the
# exercise; then one completeness ratio per sujet.
for i, id_connexion in enumerate(nb_connection_1h):
  window_start, window_end = middle_timestamp[i], end_timestamp[i]
  # Slices of this connection, computed once instead of once per (sujet, version).
  own_filter = "id_connexion == {}".format(id_connexion)
  validation = df["validation"].query(own_filter)
  modification = df["modification"].query(own_filter)
  pas_a_pas = df["pas_a_pas"].query(own_filter)
  navigation = df["navigation"].query(own_filter)
  # Navigation rows inside the window, used by both time features.
  nav_window = navigation.iloc[
      [j for j, stamp in enumerate(list(navigation.timestamp))
       if stamp > window_start and stamp <= window_end], :]

  for sujet in sujet_range:
    completness = 0
    for version in [2,3,4]:
      task = "id_sujet == {} and version == {}".format(sujet, version)
      attempts = validation.query(task)
      submissions = attempts.query("experimentation == 0")
      # Validated when at least one real submission scored above 0 in the window.
      validated = 1 if _count_events(submissions.query("score > 0"), window_end, window_start) > 0 else 0
      completness = max(validated*(version-1), completness)
      data[i].extend([
          validated,
          _count_events(submissions, window_end, window_start),
          _count_events(attempts.query("experimentation == 1"), window_end, window_start),
          _count_events(modification.query(task), window_end, window_start),
          _count_events(pas_a_pas.query(task), window_end, window_start),
          _count_events(navigation.query(task + " and module == 'Aide'"), window_end, window_start),
          _time_until_leaving(nav_window, ("Aide",), ("Aide",), sujet, version, window_end),
          _time_until_leaving(nav_window, ("Exercice",), ("Exercice", "Aide"), sujet, version, window_end),
      ])
    data[i].append(completness / 3)

# Column names, in the same order as the values appended above.
metric_templates = [
    "Sujet_{}_Version_{}_final",
    "nb_validation_{}_{}_final",
    "nb_experimentation_{}_{}_final",
    "nb_modification_{}_{}_final",
    "nb_pasapas_{}_{}_final",
    "nb_help_{}_{}_final",
    "time_spent_help_{}_{}_final",
    "time_spent_{}_{}_final",
]
for sujet in sujet_range:
  for version in [2,3,4]:
    columns.extend(template.format(sujet, version) for template in metric_templates)
  columns.append("completness_{}_final".format(sujet))

In [None]:
# Preview of the complete feature table, indexed by connection id.
new_df = pd.DataFrame(data=data, index=nb_connection_1h, columns=columns)
new_df

Unnamed: 0,Already_used_QuickPi,Already_progammed,Home_or_School,Extern_Help_frequence,Reason,Motivation,Goal_Reach_middle,Time_Management_middle,Goal_Type,Help_Module_explored_middle,Sujet_Explored_middle,Mouse_Hover_editor_middle,Mouse_Hover_grid_middle,Mouse_Hover_task_middle,Mouse_Hover_controls_middle,Sujet_20_Version_2_middle,nb_validation_20_2_middle,nb_experimentation_20_2_middle,nb_modification_20_2_middle,nb_pasapas_20_2_middle,nb_help_20_2_middle,time_spent_help_20_2_middle,time_spent_20_2_middle,Sujet_20_Version_3_middle,nb_validation_20_3_middle,nb_experimentation_20_3_middle,nb_modification_20_3_middle,nb_pasapas_20_3_middle,nb_help_20_3_middle,time_spent_help_20_3_middle,time_spent_20_3_middle,Sujet_20_Version_4_middle,nb_validation_20_4_middle,nb_experimentation_20_4_middle,nb_modification_20_4_middle,nb_pasapas_20_4_middle,nb_help_20_4_middle,time_spent_help_20_4_middle,time_spent_20_4_middle,completness_20_middle,...,nb_experimentation_27_3_final,nb_modification_27_3_final,nb_pasapas_27_3_final,nb_help_27_3_final,time_spent_help_27_3_final,time_spent_27_3_final,Sujet_27_Version_4_final,nb_validation_27_4_final,nb_experimentation_27_4_final,nb_modification_27_4_final,nb_pasapas_27_4_final,nb_help_27_4_final,time_spent_help_27_4_final,time_spent_27_4_final,completness_27_final,Sujet_28_Version_2_final,nb_validation_28_2_final,nb_experimentation_28_2_final,nb_modification_28_2_final,nb_pasapas_28_2_final,nb_help_28_2_final,time_spent_help_28_2_final,time_spent_28_2_final,Sujet_28_Version_3_final,nb_validation_28_3_final,nb_experimentation_28_3_final,nb_modification_28_3_final,nb_pasapas_28_3_final,nb_help_28_3_final,time_spent_help_28_3_final,time_spent_28_3_final,Sujet_28_Version_4_final,nb_validation_28_4_final,nb_experimentation_28_4_final,nb_modification_28_4_final,nb_pasapas_28_4_final,nb_help_28_4_final,time_spent_help_28_4_final,time_spent_28_4_final,completness_28_final
191,,,,,,,,,,0,4,0,0,0,0,0,0,0,4,0,0,0 days 00:00:00,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,4,0,0,0 days,0 days 00:00:02,0.000000,...,0,0,0,0,0 days,0 days 00:00:00,0,0,0,4,0,0,0 days,0 days 00:00:05,0.000000,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.0
225,,,,,,,,,,1,5,0,0,0,0,0,0,6,27,7,3,0 days 00:01:00,0 days 00:07:05,0,0,4,21,4,0,0 days,0 days 00:02:20,0,0,6,67,6,0,0 days,0 days 00:05:13,0.000000,...,0,3,0,0,0 days,0 days 00:00:01,0,0,0,3,0,0,0 days,0 days 00:00:01,0.000000,0,0,1,44,1,0,0 days,0 days 00:01:52,0,0,0,7,0,0,0 days,0 days 00:00:04,0,0,0,3,0,0,0 days,0 days 00:00:02,0.0
234,,,,,,,,,,1,8,0,0,0,0,0,0,6,26,6,3,0 days 00:00:45,0 days 00:03:04,1,2,1,20,3,0,0 days,0 days 00:01:55,1,1,0,53,1,0,0 days,0 days 00:02:17,1.000000,...,0,63,3,0,0 days,0 days 00:03:39,0,0,0,7,0,0,0 days,0 days 00:18:05,0.666667,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.0
240,,,,,,,,,,1,3,0,0,0,0,1,1,4,17,5,1,0 days 00:00:10,0 days 00:01:38,1,1,3,17,4,0,0 days,0 days 00:01:39,1,7,1,104,11,0,0 days,0 days 00:05:59,1.000000,...,0,3,0,0,0 days,0 days 00:00:01,0,0,0,3,0,0,0 days,0 days 00:00:06,0.000000,0,4,0,48,7,0,0 days,0 days 00:04:08,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.0
242,,,,,,,,,,0,2,0,0,0,0,0,0,0,0,0,0,0 days 00:00:00,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.000000,...,0,0,0,0,0 days,0 days 00:00:00,0,0,0,8,0,0,0 days,0 days 00:01:26,0.000000,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3218,,,,,,,,,,0,8,0,0,0,0,1,1,1,6,2,0,0 days 00:00:00,0 days 00:00:18,1,1,0,12,1,0,0 days,0 days 00:00:40,1,1,0,60,1,0,0 days,0 days 00:03:07,1.000000,...,1,57,2,0,0 days,0 days 00:03:30,1,3,0,59,3,0,0 days,0 days 00:06:48,1.000000,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.0
3269,,,,,,,,,,0,7,0,0,0,0,1,1,4,22,5,0,0 days 00:00:00,0 days 00:02:01,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:04,0.333333,...,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.000000,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.0
3357,,,,,,,,,,0,4,0,0,0,0,0,0,0,0,0,0,0 days 00:00:00,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.000000,...,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.000000,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.0
3987,,,,,,,,,,0,7,0,0,0,0,1,1,2,10,3,0,0 days 00:00:00,0 days 00:00:26,0,0,0,3,0,0,0 days,0 days 00:00:23,0,0,0,0,0,0,0 days,0 days 00:00:02,0.333333,...,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.000000,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0,0,0,0,0,0,0 days,0 days 00:00:00,0.0


In [None]:
# Persist the assembled table as CSV; the relative path is resolved
# against the notebook's current working directory.
new_df.to_csv(path_or_buf="srl-dataset.csv")