In [1]:
from EduData import get_data

get_data("assistment-2009-2010-skill", "../../data")

downloader, INFO ../../data/2009_skill_builder_data_corrected.zip already exists. Send resume request after 9084422 bytes
downloader, INFO http://base.ustc.edu.cn/data/ASSISTment/2009_skill_builder_data_corrected.zip is saved as ../../data/2009_skill_builder_data_corrected.zip


Downloading 100.00% : 9084928 | 9084422

'../../data'

In [2]:
import random
import pandas as pd
import tqdm
import numpy as np

data = pd.read_csv(
    '../../data/2009_skill_builder_data_corrected/skill_builder_data_corrected.csv',
    usecols=['order_id', 'user_id', 'sequence_id', 'problem_id', 'correct']
)

In [3]:
raw_question = data.problem_id.unique().tolist()

def question_id_transfer(question):
    id2question = [p for p in raw_question]
    question2id = {}
    for i, p in enumerate(raw_question):
        question2id[p] = i

    return id2question, question2id


id2question, question2id = question_id_transfer(raw_question)

print("number of questions: %d" % len(raw_question))

number of questions: 26688


In [4]:
def parse_all_seq(students):
    all_sequences = []
    for student_id in tqdm.tqdm(students, 'parse student sequence:\t'):
        student_sequence = parse_student_seq(data[data.user_id == student_id])
        all_sequences.extend(student_sequence)
    return all_sequences


def parse_student_seq(student):
    student = student.drop_duplicates(subset='order_id')
    sequence_ids = student.sequence_id.unique()
    sequences = []
    for seq_id in sequence_ids:
        seq = student[student.sequence_id == seq_id].sort_values('order_id')
        questions = [question2id[id] for id in seq.problem_id.tolist()]
        answers = seq.correct.tolist()
        sequences.append((questions, answers))
    return sequences

# [(qustion_sequence_0, anser_sequence_0), ..., (qustion_sequence_n, anser_sequence_n)]
sequences = parse_all_seq(data.user_id.unique())

parse student sequence:	: 100%|██████████| 4217/4217 [00:46<00:00, 89.76it/s] 


In [5]:
def train_test_split(data, train_size=.7, shuffle=True):
    if shuffle:
        random.shuffle(data)
    boundary = round(len(data) * train_size)
    return data[: boundary], data[boundary:]


train_sequences, test_sequences = train_test_split(sequences)

In [6]:
def sequences2tl(sequences, trgpath):
    with open(trgpath, 'a', encoding='utf8') as f:
        for seq in tqdm.tqdm(sequences, 'write into file: '):
            questions, answers = seq
            seq_len = len(questions)
            f.write(str(seq_len) + '\n')
            f.write(','.join([str(q) for q in questions]) + '\n')
            f.write(','.join([str(a) for a in answers]) + '\n')


# save triple line format for other tasks
sequences2tl(train_sequences, '../../data/2009_skill_builder_data_corrected/train.txt')
sequences2tl(test_sequences, '../../data/2009_skill_builder_data_corrected/test.txt')

write into file: 100%|██████████| 41912/41912 [00:00<00:00, 195340.14it/s]
write into file: 100%|██████████| 17962/17962 [00:00<00:00, 189350.67it/s]


In [7]:
MAX_STEP = 50
NUM_QUESTIONS = 26688


def encode_onehot(sequences, max_step, num_questions):
    question_sequences = np.array([])
    answer_sequences = np.array([])
    onehot_result = []

    for questions, answers in tqdm.tqdm(sequences, 'convert to onehot format: '):
        length = len(questions)
        # append questions' and answers' length to an integer multiple of max_step
        mod = 0 if length % max_step == 0 else (max_step - length % max_step)
        fill_content = np.zeros(mod) - 1
        questions = np.append(questions, fill_content)
        answers = np.append(answers, fill_content)
        # onehot
        q_seqs = questions.reshape([-1, max_step])
        a_seqs = answers.reshape([-1, max_step])
        for (i, q_seq) in enumerate(q_seqs):
            onehot = np.zeros(shape = [max_step, 2 * num_questions])
            for j in range(max_step):
                index = int(q_seq[j] if a_seqs[i][j] > 0 else q_seq[j] + num_questions)
                onehot[j][index] = 1
            onehot_result = np.append(onehot_result, onehot)
    
    return onehot_result.reshape(-1, max_step, 2 * num_questions)


# reduce the amount of data for example running faster
percentage = 0.005
train_data = encode_onehot(train_sequences[: int(len(train_sequences) * percentage)], MAX_STEP, NUM_QUESTIONS)
test_data = encode_onehot(test_sequences[: int(len(test_sequences) * percentage)], MAX_STEP, NUM_QUESTIONS)

convert to onehot format: 100%|██████████| 209/209 [05:05<00:00,  1.46s/it]
convert to onehot format: 100%|██████████| 89/89 [00:50<00:00,  1.78it/s]


In [8]:
# save onehot data
np.save('../../data/2009_skill_builder_data_corrected/train_data.npy', train_data)
np.save('../../data/2009_skill_builder_data_corrected/test_data.npy', test_data)