In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from datetime import datetime
from collections import Counter
from sklearn.preprocessing import LabelEncoder
import pickle

In [2]:
df = (pd.read_csv('./data/algebra_2005_2006_train.txt',sep='\t').dropna(subset=['Step Start Time', 'KC(Default)'])
                           .rename(columns={
                                    'Anon Student Id': 'user_id',
                                    'Correct First Attempt': 'correct'
                                   }))
df

Unnamed: 0,Row,user_id,Problem Hierarchy,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,Correct Transaction Time,Step End Time,Step Duration (sec),Correct Step Duration (sec),Error Step Duration (sec),correct,Incorrects,Hints,Corrects,KC(Default),Opportunity(Default)
0,1,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG4-FIXED,1,3(x+2) = 15,2005-09-09 12:24:35.0,2005-09-09 12:24:49.0,2005-09-09 12:25:15.0,2005-09-09 12:25:15.0,40.0,,40.0,0,2,3,1,[SkillRule: Eliminate Parens; {CLT nested; CLT...,1
1,2,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG4-FIXED,1,x+2 = 5,2005-09-09 12:25:15.0,2005-09-09 12:25:31.0,2005-09-09 12:25:31.0,2005-09-09 12:25:31.0,16.0,16.0,,1,0,0,1,"[SkillRule: Remove constant; {ax+b=c, positive...",1~~1
2,3,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,1,2-8y = -4,2005-09-09 12:25:36.0,2005-09-09 12:25:43.0,2005-09-09 12:26:12.0,2005-09-09 12:26:12.0,36.0,,36.0,0,2,3,1,"[SkillRule: Remove constant; {ax+b=c, positive...",2
3,4,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,1,-8y = -6,2005-09-09 12:26:12.0,2005-09-09 12:26:34.0,2005-09-09 12:26:34.0,2005-09-09 12:26:34.0,22.0,22.0,,1,0,0,1,"[SkillRule: Remove coefficient; {ax+b=c, divid...",1~~1
4,5,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,2,-7y-5 = -4,2005-09-09 12:26:38.0,2005-09-09 12:28:36.0,2005-09-09 12:28:36.0,2005-09-09 12:28:36.0,118.0,118.0,,1,0,0,1,"[SkillRule: Remove constant; {ax+b=c, positive...",3~~1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809688,1080611,IQQo3367x0,"Unit ES_02, Section ES_02-5",EG40,4,-7+2x = 4,2006-03-09 10:52:45.0,2006-03-09 10:57:52.0,2006-03-09 10:58:05.0,2006-03-09 10:58:05.0,320.0,,320.0,0,0,3,1,"[SkillRule: Remove constant; {ax+b=c, positive...",4~~2
809689,1080612,IQQo3367x0,"Unit ES_02, Section ES_02-5",EG40,4,-7+2x+7 = 4+7,2006-03-09 10:58:05.0,2006-03-09 10:58:13.0,2006-03-09 10:58:13.0,2006-03-09 10:58:13.0,8.0,8.0,,1,0,0,1,[SkillRule: Consolidate vars with coeff; CLT],5
809690,1080613,IQQo3367x0,"Unit ES_02, Section ES_02-5",EG40,4,2x = 4+7,2006-03-09 10:58:13.0,2006-03-09 10:58:18.0,2006-03-09 10:58:18.0,2006-03-09 10:58:18.0,5.0,5.0,,1,0,0,1,[SkillRule: Consolidate vars with coeff; CLT],6
809691,1080614,IQQo3367x0,"Unit ES_02, Section ES_02-5",EG40,4,2x = 11,2006-03-09 10:58:18.0,2006-03-09 10:58:22.0,2006-03-09 10:58:39.0,2006-03-09 10:58:39.0,21.0,,21.0,0,1,3,1,[SkillRule: Remove positive coefficient; {ax/b...,1~~4


In [5]:
df['item_id'] = df['Problem Name'] + '~' + df['Step Name']

In [6]:
values = sorted(df['Step Start Time'].unique())
print(values[:5], values[-5:])

df['timestamp'] = pd.to_datetime(df['Step Start Time']).map(
    lambda t: t.timestamp()).round().astype(np.int32)

# Check if skills are a function of the item_id (answer: NO)
steps_with_kc = df.groupby(['item_id', 'KC(Default)']).size().reset_index()
print(Counter(steps_with_kc['item_id']).most_common(5))
print(steps_with_kc.query('`item_id` == "LIT59~b+r*(x+y) = v-s"'))

all_skills = set()
for skill in df['KC(Default)']:
    for token in skill.split('~~'):
        all_skills.add(token)
encode = dict(zip(all_skills, range(1000000)))

skill_ids = []
for skills in df['KC(Default)']:
    skill_ids.append('~~'.join(str(encode[skill])
                               for skill in skills.split('~~')))
df['skill_ids'] = skill_ids

['2005-08-30 09:50:10.0', '2005-08-30 09:50:14.0', '2005-08-30 09:50:16.0', '2005-08-30 09:50:25.0', '2005-08-30 09:50:32.0'] ['2006-06-07 11:11:34.0', '2006-06-07 11:11:39.0', '2006-06-07 11:12:21.0', '2006-06-07 11:12:24.0', '2006-06-07 11:12:31.0']
[('EG57A~FinalAnswer', 24), ('EG57~FinalAnswer', 23), ('EG54~FinalAnswer', 18), ('EG40~FinalAnswer', 17), ('EG41~FinalAnswer', 14)]
                      item_id  \
163264  LIT59~b+r*(x+y) = v-s   
163265  LIT59~b+r*(x+y) = v-s   
163266  LIT59~b+r*(x+y) = v-s   
163267  LIT59~b+r*(x+y) = v-s   
163268  LIT59~b+r*(x+y) = v-s   

                                              KC(Default)   0  
163264  [SkillRule: Eliminate Parens; {CLT nested; CLT...  24  
163265  [SkillRule: Eliminate Parens; {CLT nested; CLT...   1  
163266  [SkillRule: Remove coefficient; {ax+b=c, divid...   5  
163267  [SkillRule: Remove constant; {ax+b=c, positive...  31  
163268  [SkillRule: Remove constant; {ax+b=c, positive...   2  


In [7]:
df

Unnamed: 0,Row,user_id,Problem Hierarchy,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,Correct Transaction Time,Step End Time,...,Error Step Duration (sec),correct,Incorrects,Hints,Corrects,KC(Default),Opportunity(Default),item_id,timestamp,skill_ids
0,1,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG4-FIXED,1,3(x+2) = 15,2005-09-09 12:24:35.0,2005-09-09 12:24:49.0,2005-09-09 12:25:15.0,2005-09-09 12:25:15.0,...,40.0,0,2,3,1,[SkillRule: Eliminate Parens; {CLT nested; CLT...,1,EG4-FIXED~3(x+2) = 15,1126268675,15
1,2,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG4-FIXED,1,x+2 = 5,2005-09-09 12:25:15.0,2005-09-09 12:25:31.0,2005-09-09 12:25:31.0,2005-09-09 12:25:31.0,...,,1,0,0,1,"[SkillRule: Remove constant; {ax+b=c, positive...",1~~1,EG4-FIXED~x+2 = 5,1126268715,1~~73
2,3,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,1,2-8y = -4,2005-09-09 12:25:36.0,2005-09-09 12:25:43.0,2005-09-09 12:26:12.0,2005-09-09 12:26:12.0,...,36.0,0,2,3,1,"[SkillRule: Remove constant; {ax+b=c, positive...",2,EG40~2-8y = -4,1126268736,1
3,4,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,1,-8y = -6,2005-09-09 12:26:12.0,2005-09-09 12:26:34.0,2005-09-09 12:26:34.0,2005-09-09 12:26:34.0,...,,1,0,0,1,"[SkillRule: Remove coefficient; {ax+b=c, divid...",1~~1,EG40~-8y = -6,1126268772,74~~59
4,5,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,2,-7y-5 = -4,2005-09-09 12:26:38.0,2005-09-09 12:28:36.0,2005-09-09 12:28:36.0,2005-09-09 12:28:36.0,...,,1,0,0,1,"[SkillRule: Remove constant; {ax+b=c, positive...",3~~1,EG40~-7y-5 = -4,1126268798,1~~104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809688,1080611,IQQo3367x0,"Unit ES_02, Section ES_02-5",EG40,4,-7+2x = 4,2006-03-09 10:52:45.0,2006-03-09 10:57:52.0,2006-03-09 10:58:05.0,2006-03-09 10:58:05.0,...,320.0,0,0,3,1,"[SkillRule: Remove constant; {ax+b=c, positive...",4~~2,EG40~-7+2x = 4,1141901565,1~~104
809689,1080612,IQQo3367x0,"Unit ES_02, Section ES_02-5",EG40,4,-7+2x+7 = 4+7,2006-03-09 10:58:05.0,2006-03-09 10:58:13.0,2006-03-09 10:58:13.0,2006-03-09 10:58:13.0,...,,1,0,0,1,[SkillRule: Consolidate vars with coeff; CLT],5,EG40~-7+2x+7 = 4+7,1141901885,34
809690,1080613,IQQo3367x0,"Unit ES_02, Section ES_02-5",EG40,4,2x = 4+7,2006-03-09 10:58:13.0,2006-03-09 10:58:18.0,2006-03-09 10:58:18.0,2006-03-09 10:58:18.0,...,,1,0,0,1,[SkillRule: Consolidate vars with coeff; CLT],6,EG40~2x = 4+7,1141901893,34
809691,1080614,IQQo3367x0,"Unit ES_02, Section ES_02-5",EG40,4,2x = 11,2006-03-09 10:58:18.0,2006-03-09 10:58:22.0,2006-03-09 10:58:39.0,2006-03-09 10:58:39.0,...,21.0,0,1,3,1,[SkillRule: Remove positive coefficient; {ax/b...,1~~4,EG40~2x = 11,1141901898,61~~74


In [8]:
exemple = df[['user_id', 'item_id', 'skill_ids', 'timestamp', 'correct']]
exemple

Unnamed: 0,user_id,item_id,skill_ids,timestamp,correct
0,0BrbPbwCMz,EG4-FIXED~3(x+2) = 15,15,1126268675,0
1,0BrbPbwCMz,EG4-FIXED~x+2 = 5,1~~73,1126268715,1
2,0BrbPbwCMz,EG40~2-8y = -4,1,1126268736,0
3,0BrbPbwCMz,EG40~-8y = -6,74~~59,1126268772,1
4,0BrbPbwCMz,EG40~-7y-5 = -4,1~~104,1126268798,1
...,...,...,...,...,...
809688,IQQo3367x0,EG40~-7+2x = 4,1~~104,1141901565,0
809689,IQQo3367x0,EG40~-7+2x+7 = 4+7,34,1141901885,1
809690,IQQo3367x0,EG40~2x = 4+7,34,1141901893,1
809691,IQQo3367x0,EG40~2x = 11,61~~74,1141901898,0


In [None]:
exemple