In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Data Content Description
- **timestamp** is the moment the question was given, represented as a Unix timestamp in milliseconds.
- **solving_id** identifies each learning session of the student; each session corresponds to one bundle. It is a
single integer, starting from 1.
- **question_id** is the ID of the question that was given to the student, in the form q{integer}.
- **user_answer** is the answer that the student submitted, recorded as a character between a and d inclusively.
- **elapsed_time** is the time the student spent on each question, in milliseconds.

#### Reading in Data

In [2]:
# Read the KT1 sample file u1.csv (presumably one student's interaction log —
# confirm against the dataset docs). The path is relative to the working directory.
sample_path = "Data/Sample_KT1/u1.csv"
sample_data = pd.read_csv(sample_path)
print(f"Sample of the data set:\n{sample_data}")

Sample of the data set:
          timestamp  solving_id question_id user_answer  elapsed_time
0     1565096190868           1       q5012           b         38000
1     1565096221062           2       q4706           c         24000
2     1565096293432           3       q4366           b         68000
3     1565096339668           4       q4829           a         42000
4     1565096401774           5       q6528           b         59000
...             ...         ...         ...         ...           ...
1077  1569646668189         670       q3413           b          8000
1078  1569646668212         670       q3411           c          8000
1079  1569647442981         671       q2602           a          7000
1080  1569647442986         671       q2603           a          7000
1081  1569647443041         671       q2601           c          7000

[1082 rows x 5 columns]


In [3]:
# Read the question metadata table (the output below shows it carries the
# correct answer for each question). The path is relative to the working directory.
questions_path = "Data/contents/questions.csv"
question_data = pd.read_csv(questions_path)
print(f"The answers for the questions associated with the data set:\n{question_data}")

The answers for the questions associated with the data set:
      question_id bundle_id explanation_id correct_answer  part  \
0              q1        b1             e1              b     1   
1              q2        b2             e2              a     1   
2              q3        b3             e3              b     1   
3              q4        b4             e4              b     1   
4              q5        b5             e5              c     1   
...           ...       ...            ...            ...   ...   
13164      q18139    b12202         e12202              b     2   
13165      q18140    b12203         e12203              a     2   
13166      q18141    b12204         e12204              a     2   
13167      q18142    b12205         e12205              a     2   
13168      q18143    b12206         e12206              c     2   

                   tags   deployed_at  
0           1;2;179;181  1.558090e+12  
1              15;2;182  1.558090e+12  
2          14;2

### Data Featuring
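- Strip the leading letter prefix from the question, bundle, and explanation ID columns (e.g. `q5012` → `5012`).
- One-hot encode the `user_answer` column in `sample_data` and the `correct_answer` column in `question_data`; the first category is dropped (`drop_first=True`) and serves as the reference level.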

In [4]:
def clean_data(df, cols):
    """Strip the leading letter prefix from ID columns (e.g. ``"q5012"`` -> ``"5012"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ID columns. It is modified in place.
    cols : iterable of str
        Names of the string columns to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with every column in ``cols`` rewritten.

    Raises
    ------
    AttributeError
        If a listed column does not hold strings (raised by the ``.str`` accessor).

    Notes
    -----
    Only a leading run of ASCII letters is removed, so calling this again on an
    already-cleaned column is a no-op. A plain ``.str[1:]`` slice would drop the
    first character unconditionally and eat a digit every time the calling cell
    is re-run. Values stay strings; no numeric conversion is done here.
    """
    for col in cols:
        # Anchored pattern: touches only the alphabetic prefix, never the digits.
        df[col] = df[col].str.replace(r"^[A-Za-z]+", "", regex=True)

    return df

In [5]:
# Strip the letter prefixes from the ID columns, then one-hot encode the answer
# columns. drop_first=True leaves the first answer category out of the dummies.
sample_data = pd.get_dummies(
    clean_data(sample_data, ['question_id']),
    columns=['user_answer'],
    drop_first=True,
    dtype=int,
)
question_data = pd.get_dummies(
    clean_data(question_data, ['question_id', 'bundle_id', 'explanation_id']),
    columns=['correct_answer'],
    drop_first=True,
    dtype=int,
)

print(f"Sample data set after basic data featuring:\n{sample_data}\n\n")
print(f"Question data set after basic data featuring:\n{question_data}")

Sample data set after basic data featuring:
          timestamp  solving_id question_id  elapsed_time  user_answer_b  \
0     1565096190868           1        5012         38000              1   
1     1565096221062           2        4706         24000              0   
2     1565096293432           3        4366         68000              1   
3     1565096339668           4        4829         42000              0   
4     1565096401774           5        6528         59000              1   
...             ...         ...         ...           ...            ...   
1077  1569646668189         670        3413          8000              1   
1078  1569646668212         670        3411          8000              0   
1079  1569647442981         671        2602          7000              0   
1080  1569647442986         671        2603          7000              0   
1081  1569647443041         671        2601          7000              0   

      user_answer_c  user_answer_d  
0     