In [None]:
from EduData import get_data
import os

# Directory and CSV file of the ASSISTments 2009-2010 skill-builder data,
# expressed relative to this notebook. data_path keeps its trailing slash
# because later cells build file names by plain string concatenation.
data_path = '../../data/2009_skill_builder_data_corrected/'
file_name = '%s%s' % (data_path, 'skill_builder_data_corrected.csv')

# Ask EduData for the dataset only when the CSV is not already on disk.
csv_missing = not os.path.exists(file_name)
if csv_missing:
    get_data("assistment-2009-2010-skill", "../../data")

In [None]:
import pandas as pd
import tqdm

# Columns of the raw interaction log that the later cells read.
used_columns = [
    'user_id', 'problem_id', 'skill_id', 'attempt_count',
    'hint_count', 'correct', 'ms_first_response',
]

raw_log = pd.read_csv(file_name, usecols=used_columns)
# Rows without a skill id or problem id cannot be indexed later, so drop them.
data = raw_log.dropna(subset=['skill_id', 'problem_id'])
# First-response time converted from milliseconds to seconds.
data = data.assign(time_first_res=data['ms_first_response'] / 1000)

In [None]:
# Distinct skills, problems and students, in order of first appearance.
skills = data['skill_id'].unique().tolist()
problems = data['problem_id'].unique().tolist()
users = data['user_id'].unique()

# Contiguous indices starting at 1 (index 0 is never handed out).
skill2id = dict(zip(skills, range(1, len(skills) + 1)))
problem2id = dict(zip(problems, range(1, len(problems) + 1)))


print("number of users: %d" % len(users))
print("number of skills: %d" % len(skills))
print("number of problems: %d" % len(problems))

In [None]:
import numpy as np
# Map every problem index to a skill index. When a problem occurs with several
# skills in the table, the pairing that appears last is the one that is kept.
skill_problem_pairs = zip(data.skill_id.tolist(), data.problem_id.tolist())
problem2skill = {problem2id[pid]: skill2id[sid] for sid, pid in skill_problem_pairs}

# Persist the mapping as the text of a Python dict literal.
with open(data_path + 'problem2skill', 'w', encoding='utf-8') as mapping_file:
    mapping_file.write(str(problem2skill))

In [None]:
from sklearn.model_selection import train_test_split, KFold
from scipy.stats import norm
from scipy.stats import poisson

# Split at the student level: every student's interactions land in exactly one split.
train_student_ids, test_student_ids = train_test_split(users, test_size=0.2, random_state=42)

train_data = data[data['user_id'].isin(train_student_ids)]


# Per-problem mean and standard deviation of the *log* response time, fitted on
# training students only. The time factor below is evaluated at log(time), so
# the statistics have to be on the same log scale; fitting them on raw seconds
# (as this cell did before) compares values from two different scales.
# Non-finite logs (time <= 0 or missing) are excluded from the fit.
with np.errstate(divide='ignore', invalid='ignore'):
    train_log_time = np.log(train_data['time_first_res'])
train_log_time = train_log_time.where(np.isfinite(train_log_time))
question_time_stats = (
    train_log_time.groupby(train_data['problem_id'])
    .agg(['mean', 'std'])
    .reset_index()
)

# Merge the time statistics into the full table. pd.merge defaults to an inner
# join, so rows whose problem never occurs for a training student are dropped.
data = pd.merge(data, question_time_stats, on='problem_id')
# std is NaN when a problem has fewer than two usable training observations.
data['std'] = data['std'].fillna(0)
print("finish merging")

# Time factor: normal CDF of the log response time under the problem's fitted
# distribution, computed for the whole column in one vectorised call instead of
# building one frozen distribution per row. Problems without spread (std == 0)
# get a factor of 1, as before.
with np.errstate(divide='ignore', invalid='ignore'):
    row_log_time = np.log(data['time_first_res']).to_numpy()
time_std = data['std'].to_numpy()
no_spread = time_std == 0
time_cdf = norm.cdf(
    row_log_time,
    loc=data['mean'].to_numpy(),
    scale=np.where(no_spread, 1.0, time_std),  # placeholder scale; masked out below
)
data['time_factor'] = np.where(no_spread, 1.0, time_cdf)
# Rows whose factor is undefined (negative or missing response time) are removed.
data = data.dropna(subset = ['time_factor'])
print("Finish processing time features ")

In [None]:
# Average attempt count per problem, estimated from training students only.
question_attempt_stats = (
    train_data.groupby('problem_id')['attempt_count']
    .mean()
    .rename('mean_attempt')
    .reset_index()
)
# Attach each problem's average to every interaction row of that problem.
data = pd.merge(data, question_attempt_stats, on='problem_id', suffixes=('', '_attempt'))

# Attempt factor: one minus the Poisson(mean_attempt) CDF at attempt_count - 1,
# i.e. P(X >= attempt_count) for integer counts.
attempt_cdf = poisson.cdf(data['attempt_count'] - 1, data['mean_attempt'])
data['attempt_factor'] = 1 - attempt_cdf
print("Finish processing attempt features ")

In [None]:
# Per-problem mean number of hints, taken from training students only.
hint_means = train_data.groupby('problem_id')['hint_count'].agg('mean')
question_hint_stats = hint_means.reset_index().rename(columns={'hint_count': 'mean_hint'})
# Bring the per-problem mean onto every interaction row.
data = pd.merge(data, question_hint_stats, on='problem_id')

# Hint factor: one minus the Poisson(mean_hint) CDF evaluated at hint_count - 1.
hint_distribution = poisson(data['mean_hint'])
data['hint_factor'] = 1 - hint_distribution.cdf(data['hint_count'] - 1)

print("Finish processing hint features ")

In [None]:
def parse_all_seq(students):
    """Build one parsed sequence per student id, in the order given.

    Reads the module-level ``data`` frame and hands each student's rows to
    ``parse_student_seq``.

    Args:
        students: iterable of ``user_id`` values.

    Returns:
        list with one ``parse_student_seq`` result per entry of ``students``.
    """
    # Group the table once instead of scanning every row with a boolean mask
    # for each student; row order inside a group is the table's row order.
    grouped = data.groupby('user_id', sort=False)
    known_ids = grouped.indices
    # Students without any row still yield an (empty) sequence, as before.
    no_rows = data.iloc[0:0]

    all_sequences = []
    for student_id in tqdm.tqdm(students, 'parse student sequence:\t'):
        rows = grouped.get_group(student_id) if student_id in known_ids else no_rows
        all_sequences.append(parse_student_seq(rows))
    return all_sequences


def parse_student_seq(student):
    """Convert one student's interaction rows into six parallel lists.

    Args:
        student: frame holding the rows of a single student, with the columns
            ``skill_id``, ``correct``, ``problem_id``, ``time_factor``,
            ``attempt_factor`` and ``hint_factor``.

    Returns:
        tuple ``(skills, answers, problems, time_factor, attempt_factor,
        hint_factor)``; skills and problems are translated through the
        module-level ``skill2id`` / ``problem2id`` maps, the remaining lists
        are the column values as they appear in ``student``.
    """
    skill_ids = [skill2id[raw_skill] for raw_skill in student.skill_id.tolist()]
    answers = student.correct.tolist()
    problem_ids = [problem2id[raw_problem] for raw_problem in student.problem_id.tolist()]
    factor_columns = ('time_factor', 'attempt_factor', 'hint_factor')
    factors = tuple(getattr(student, column).tolist() for column in factor_columns)

    return (skill_ids, answers, problem_ids) + factors


# Every student contributes a 6-tuple of lists, and the list length differs
# from student to student, so the collection is ragged. dtype=object has to be
# explicit: NumPy >= 1.24 raises ValueError when asked to infer an array from
# ragged nested sequences (older releases only warned and fell back to object).
# NOTE: this rebinds train_data from the training DataFrame to the parsed
# sequences; re-run the split cell before re-running any feature cell.
train_data = np.array(parse_all_seq(train_student_ids), dtype=object)
test_data = np.array(parse_all_seq(test_student_ids), dtype=object)

In [None]:
def sequences2l(sequences, trg_path):
    """Write parsed student sequences to a text file, seven lines per student.

    The lines are, in order: the sequence length, then comma-separated skill
    ids, answers and problem ids, then the time, attempt and hint factors
    formatted with six decimals.

    Args:
        sequences: iterable of 6-item records
            ``(skills, answers, problems, time, attempt, hint)``.
        trg_path: path of the file to create or overwrite.
    """
    progress_label = 'write data into file: %s' % trg_path
    with open(trg_path, 'w', encoding='utf8') as out:
        for record in tqdm.tqdm(sequences, progress_label):
            skill_ids, answers, problem_ids, times, attempts, hints = record
            rows = [str(len(skill_ids))]
            # Id-like columns are written with their plain string form.
            rows.extend(
                ','.join(map(str, values))
                for values in (skill_ids, answers, problem_ids)
            )
            # Behaviour factors are written with fixed six-decimal precision.
            rows.extend(
                ','.join(format(value, '.6f') for value in factors)
                for factors in (times, attempts, hints)
            )
            out.write('\n'.join(rows) + '\n')

# Write each split next to the raw data, one text file per split.
for split_file, split_sequences in (('train.txt', train_data), ('test.txt', test_data)):
    sequences2l(split_sequences, data_path + split_file)