In [1]:
import csv
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
def stat_unique(data: pd.DataFrame, key):
    """Print a size statistic of ``data`` selected by ``key``.

    Args:
        data: frame to summarise.
        key: ``None`` prints the number of rows; a list or tuple of column
            labels prints the number of distinct combinations of those
            columns; any other value is treated as a single column label and
            the number of distinct values in that column is printed.
    """
    if key is None:
        print('Total length: {}'.format(len(data)))
    elif isinstance(key, (list, tuple)):
        # normalise to a list so a tuple is always read as "several columns"
        columns = list(key)
        n_unique = len(data.drop_duplicates(columns, keep='first'))
        # map(str, ...) keeps the join working for non-string column labels
        print('Number of unique [{}]: {}'.format(','.join(map(str, columns)), n_unique))
    else:
        # single column label (str or any other hashable label)
        print('Number of unique {}: {}'.format(key, len(data[key].unique())))

In [3]:
# Directory that holds the raw dump and receives the files written by this notebook.
data_path = '../../data/assistment/'
# Build the input path from data_path so the directory is defined in one place only.
# skill_id is read as str.
raw_data = pd.read_csv(data_path + 'assistment.csv', encoding='utf-8', dtype={'skill_id': str})
raw_data.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,order_id,assignment_id,user_id,assistment_id,problem_id,original,correct,attempt_count,ms_first_response,tutor_mode,...,hint_count,hint_total,overlap_time,template_id,answer_id,answer_text,first_action,bottom_hint,opportunity,opportunity_original
0,33022537,277618,64525,33139,51424,1,1,1,32454,tutor,...,0,3,32454,30799,,26,0,,1,1.0
1,33022709,277618,64525,33150,51435,1,1,1,4922,tutor,...,0,3,4922,30799,,55,0,,2,2.0
2,35450204,220674,70363,33159,51444,1,0,2,25390,tutor,...,0,3,42000,30799,,88,0,,1,1.0
3,35450295,220674,70363,33110,51395,1,1,1,4859,tutor,...,0,3,4859,30059,,41,0,,2,2.0
4,35450311,220674,70363,33196,51481,1,0,14,19813,tutor,...,3,4,124564,30060,,65,0,0.0,3,3.0


In [4]:
# Switch to the student / question / knowledge vocabulary used in the rest of
# the notebook, then keep only the five needed columns and drop incomplete rows.
column_aliases = {
    'user_id': 'student_id',
    'problem_id': 'question_id',
    'skill_id': 'knowledge_id',
    'skill_name': 'knowledge_name',
}
raw_data = raw_data.rename(columns=column_aliases)
kept_columns = ['student_id', 'question_id', 'knowledge_id', 'knowledge_name', 'correct']
all_data = raw_data.loc[:, kept_columns].dropna()

In [5]:
# Size statistics of the unfiltered table: rows, (student, question) pairs,
# students, questions and knowledge ids.
for stat_key in (None, ['student_id', 'question_id'], 'student_id', 'question_id', 'knowledge_id'):
    stat_unique(all_data, stat_key)

# Distinct question / knowledge counts of the unfiltered table.
unique_questions = all_data['question_id'].unique()
unique_knowledges = all_data['knowledge_id'].unique()
ques_num = len(unique_questions)
know_num = len(unique_knowledges)

Total length: 325636
Number of unique [student_id,question_id]: 270477
Number of unique student_id: 4151
Number of unique question_id: 16891
Number of unique knowledge_id: 111


## Filter data

In [6]:
# Start the filtering pipeline from the full cleaned table. This only binds a
# second name to the same frame; the filter cells below rebind `selected_data`
# to new frames.
selected_data = all_data

In [7]:
# filter questions: drop every question with fewer than 50 response rows
# NOTE(review): count() counts rows, not distinct students - confirm this is the intended criterion
n_students = selected_data.groupby('question_id')['student_id'].count()
rare_questions = n_students.loc[n_students < 50]
question_filter = rare_questions.index.tolist()
print(f'filter {len(question_filter)} questions')
is_rare_question = selected_data['question_id'].isin(question_filter)
selected_data = selected_data[~is_rare_question]

filter 15370 questions


In [8]:
# filter students: drop every student with fewer than 10 remaining response rows
n_questions = selected_data.groupby('student_id')['question_id'].count()
inactive_students = n_questions.loc[n_questions < 10]
student_filter = inactive_students.index.tolist()
print(f'filter {len(student_filter)} students')
is_inactive_student = selected_data['student_id'].isin(student_filter)
selected_data = selected_data[~is_inactive_student]

filter 1471 students


In [9]:
# get question to knowledge map
# `table` is de-duplicated on (question_id, knowledge_id), so one question can
# still occur in several rows. Knowledge ids are therefore accumulated per
# question instead of being overwritten by the last row seen.
q2k = {}
table = selected_data.loc[:, ['question_id', 'knowledge_id']].drop_duplicates()
for q, knowledge in zip(table['question_id'], table['knowledge_id']):
    # a knowledge_id may hold several ids joined by '_'
    q2k.setdefault(q, set()).update(map(int, str(knowledge).split('_')))

# get knowledge to question map (inverse of q2k)
k2q = {}
for q, ks in q2k.items():
    for k in ks:
        k2q.setdefault(k, set()).add(q)

In [10]:
# filter knowledges: keep a knowledge id only if at least 10 questions carry it
selected_knowledges = set()
for knowledge, questions in k2q.items():
    if len(questions) >= 10:
        selected_knowledges.add(knowledge)
print(f'filter {len(k2q) - len(selected_knowledges)} knowledges')

filter 8 knowledges


In [11]:
# update maps
# Keep a question when at least one of its knowledge ids was selected; its
# knowledge set is carried over unchanged.
kept_q2k = {}
for question, knowledges in q2k.items():
    if not knowledges.isdisjoint(selected_knowledges):
        kept_q2k[question] = knowledges
q2k = kept_q2k

# Rebuild the inverse map from the surviving questions.
k2q = {}
for question, knowledges in q2k.items():
    for knowledge in knowledges:
        k2q.setdefault(knowledge, set()).add(question)

In [12]:
# update data: keep only the rows whose question is still a key of q2k.
# A vectorised membership test replaces the row-wise apply, and .copy() gives
# the following renumbering cells an independent frame to assign into.
selected_data = selected_data[selected_data['question_id'].isin(list(q2k))].copy()

In [13]:
# renumber the students
# Consecutive ids are assigned in order of first appearance. Series.unique()
# returns values in that order, so no row-by-row Python loop is needed.
s2n = {sid: n for n, sid in enumerate(selected_data['student_id'].unique().tolist())}
selected_data.loc[:, 'student_id'] = selected_data.loc[:, 'student_id'].map(s2n)

In [14]:
# renumber the questions
# Consecutive ids are assigned in order of first appearance. Series.unique()
# returns values in that order, so no row-by-row Python loop is needed.
q2n = {qid: n for n, qid in enumerate(selected_data['question_id'].unique().tolist())}
selected_data.loc[:, 'question_id'] = selected_data.loc[:, 'question_id'].map(q2n)

In [15]:
# renumber the knowledges
# A knowledge_id may hold several ids joined by '_'. Each id gets the next free
# number the first time it is seen, scanning rows top to bottom and ids left to
# right. Iterating the column directly avoids building a Series per row.
k2n = {}
for knowledge in selected_data['knowledge_id']:
    for k in str(knowledge).split('_'):
        # len(k2n) is evaluated before insertion, i.e. it is the next free number
        k2n.setdefault(int(k), len(k2n))


def _renumber_knowledge(knowledge):
    """Rewrite one '_'-joined knowledge_id string with the renumbered ids."""
    return '_'.join(str(k2n[int(k)]) for k in str(knowledge).split('_'))


selected_data.loc[:, 'knowledge_id'] = selected_data.loc[:, 'knowledge_id'].apply(_renumber_knowledge)


In [16]:
# Size statistics of the filtered, renumbered table.
for stat_key in (None, ['student_id', 'question_id'], 'student_id', 'question_id', 'knowledge_id'):
    stat_unique(selected_data, stat_key)

# Ratio of the number of questions in q2k to the number of knowledge ids in k2q.
questions_per_knowledge = len(q2k) / len(k2q)
print('Average #questions per knowledge: {}'.format(questions_per_knowledge))

Total length: 110398
Number of unique [student_id,question_id]: 78747
Number of unique student_id: 1940
Number of unique question_id: 1485
Number of unique knowledge_id: 35
Average #questions per knowledge: 59.4


In [17]:
# save selected data: write the filtered, renumbered table without the index column
selected_data_file = data_path+'selected_data.csv'
selected_data.to_csv(selected_data_file, index=False)

In [18]:
# save concept map
# Maps each renumbered question id (as a string, because JSON keys are strings)
# to the list of its renumbered knowledge ids. A question can occur in several
# (question_id, knowledge_id) rows, so ids are accumulated (first-seen order,
# no repeats) instead of being overwritten by the last row.
q2k = {}
table = selected_data.loc[:, ['question_id', 'knowledge_id']].drop_duplicates()
for question, knowledge in zip(table['question_id'], table['knowledge_id']):
    concepts = q2k.setdefault(str(question), [])
    for k in map(int, str(knowledge).split('_')):
        if k not in concepts:
            concepts.append(k)
# Written next to the other generated files, under data_path.
with open(data_path + 'concept_map.json', 'w') as f:
    json.dump(q2k, f)

## Parse data

In [19]:
def parse_data(data):
    """Index response records by student and by question.

    Args:
        data: DataFrame with ``student_id``, ``question_id`` and ``correct``
            columns, one row per response record

    Returns:
        student based datasets: defaultdict {sid: {qid: score}}
        question based datasets: defaultdict {qid: {sid: score}}

        When a (sid, qid) pair occurs in several rows, the last row wins.
    """
    # The inner containers are plain dicts: looking up a missing question or
    # student raises KeyError instead of silently inserting an empty dict.
    stu_data = defaultdict(dict)
    ques_data = defaultdict(dict)
    # Iterate the three columns directly rather than building a Series per row.
    for sid, qid, correct in zip(data['student_id'], data['question_id'], data['correct']):
        stu_data[sid][qid] = correct
        ques_data[qid][sid] = correct
    return stu_data, ques_data

In [20]:
# Flatten the table into [student_id, question_id, correct] records, in row
# order. Zipping the three columns avoids building a Series per row.
data = [
    [sid, qid, correct]
    for sid, qid, correct in zip(selected_data['student_id'],
                                 selected_data['question_id'],
                                 selected_data['correct'])
]

In [21]:
# Build the per-student and per-question lookup tables from the filtered table.
stu_data, ques_data = parse_data(selected_data)

In [22]:
# Split configuration.
# test_size: a float is read by the split cell as a fraction of all students.
# least_test_length: minimum number of records a student needs to be eligible
# for the test split (None disables the check).
test_size, least_test_length = 0.2, 150

In [23]:
n_students = len(stu_data)
if isinstance(test_size, float):
    # a float is a fraction of the student population
    test_size = int(n_students * test_size)
train_size = n_students - test_size
assert(train_size > 0 and test_size > 0)

# Candidate ids are 0..n_students-1, which relies on the students having been
# renumbered consecutively from 0 earlier in the notebook.
students = list(range(n_students))
# A dedicated, seeded generator makes the split reproducible across kernel
# restarts and independent of any other use of the global `random` state.
split_rng = random.Random(0)
split_rng.shuffle(students)
if least_test_length is not None:
    # only students with enough records may be used for testing
    student_lens = defaultdict(int)
    for t in data:
        student_lens[t[0]] += 1
    students = [student for student in students
                if student_lens[student] >= least_test_length]
test_students = set(students[:test_size])
if len(test_students) < test_size:
    # fewer eligible students than requested: report it instead of shrinking silently
    print(f'warning: only {len(test_students)} of the requested {test_size} test students are eligible')

# every record of a student goes to exactly one side of the split
train_data = [record for record in data if record[0] not in test_students]
test_data = [record for record in data if record[0] in test_students]

In [24]:
def renumber_student_id(data):
    """Relabel student ids with consecutive integers starting at 0.

    The new id of a student is the rank of its old id among all distinct
    student ids in ``data``, sorted ascending.

    Args:
        data: list of triplets (sid, qid, score)

    Returns:
        renumbered datasets: list of triplets (sid, qid, score), in the
        same order as the input
    """
    new_id_of = {}
    for old_id in sorted({record[0] for record in data}):
        new_id_of[old_id] = len(new_id_of)

    renumbered = []
    for record in data:
        renumbered.append((new_id_of[record[0]], record[1], record[2]))
    return renumbered

In [25]:
# Renumber the student ids of each record list independently, so each one
# gets its own consecutive id range starting at 0.
train_data, test_data, all_data = [
    renumber_student_id(records) for records in (train_data, test_data, data)
]

In [26]:
# Report how many records ended up in each record list.
size_report = [
    f'train records length: {len(train_data)}',
    f'test records length: {len(test_data)}',
    f'all records length: {len(all_data)}',
]
print('\n'.join(size_report))

train records length: 60393
test records length: 50005
all records length: 110398


## Save data

In [27]:
def save_to_csv(data, path):
    """Write response triplets to a CSV file, sorted ascending.

    The file has the header ``student_id,question_id,correct`` and no index
    column.

    Args:
        data: list of triplets (sid, qid, correct)
        path: str representing saving path
    """
    ordered_records = sorted(data)
    header = ['student_id', 'question_id', 'correct']
    frame = pd.DataFrame.from_records(ordered_records, columns=header)
    frame.to_csv(path, index=False)

In [28]:
# Write the three record lists next to the raw data.
for records, file_name in ((train_data, 'train_triples.csv'),
                           (test_data, 'test_triples.csv'),
                           (all_data, 'triples.csv')):
    save_to_csv(records, data_path+file_name)

In [29]:
# Summary of the saved dataset.
# The question and concept counts come from the renumbering maps (q2n, k2n),
# which are built from the filtered table. ques_num and know_num are measured
# on the unfiltered table before any question or student is dropped, so they
# do not describe the saved data.
metadata = {"num_students": n_students,
            "num_questions": len(q2n),
            "num_concepts": len(k2n),
            "num_records": len(all_data),
            "num_train_students": n_students - len(test_students),
            "num_test_students": len(test_students)}

In [30]:
# Serialise the metadata dict as JSON next to the other generated files.
metadata_file = data_path+'metadata.json'
with open(metadata_file, 'w') as out:
    out.write(json.dumps(metadata))