In [1]:
import pandas as pd
import numpy as np

In [2]:
questions = pd.read_csv('../data/simple_qstats_v2.csv')

In [3]:
questions.head()

Unnamed: 0,part,tag_lecture,content_id,qstats_count,qstats_answered_correctly
0,1,5.0,100,15386,0.3536
1,1,5.0,12882,161,0.3602
2,1,5.0,7859,47486,0.3812
3,1,5.0,157,33138,0.566
4,1,5.0,10426,8606,0.5681


In [4]:
questions_per_parts = pd.DataFrame(questions.groupby('part')['content_id'].count()).reset_index().set_index('part')

In [5]:
questions_per_parts['ratio %'] = round(questions_per_parts['content_id'] / 13523 * 100, 2)

In [6]:
questions_per_parts

Unnamed: 0_level_0,content_id,ratio %
part,Unnamed: 1_level_1,Unnamed: 2_level_1
1,992,7.34
2,1647,12.18
3,1562,11.55
4,1439,10.64
5,5511,40.75
6,1212,8.96
7,1160,8.58


In [7]:
def difficulty(question):
    if question < 0.66:
        difficulty = 3
    elif question < 0.83:
        difficulty = 2
    else:
        difficulty = 1
    return difficulty

In [8]:
questions['difficulty'] = questions['qstats_answered_correctly'].apply(difficulty)

In [9]:
questions.head()

Unnamed: 0,part,tag_lecture,content_id,qstats_count,qstats_answered_correctly,difficulty
0,1,5.0,100,15386,0.3536,3
1,1,5.0,12882,161,0.3602,3
2,1,5.0,7859,47486,0.3812,3
3,1,5.0,157,33138,0.566,3
4,1,5.0,10426,8606,0.5681,3


In [10]:
questions_per_parts_and_difficulty = pd.DataFrame(questions.groupby(['part','difficulty'])['content_id'].count())

In [11]:
questions_per_parts_and_difficulty['ratio %'] = round(questions_per_parts_and_difficulty['content_id'] / 13523 * 100, 2)

In [12]:
def toeic_questions(df):
    random_n = np.random.randint(0, 13524, 1000)
    return df['content_id'].get(random_n)

In [13]:
toeic_quest = pd.DataFrame(toeic_questions(questions))

In [14]:
toeic_quest = toeic_quest.merge(questions, on="content_id", how="inner")

In [16]:
toeic_quest_nb = pd.DataFrame(toeic_quest.groupby(['part','difficulty'])['content_id'].count())

In [17]:
toeic_quest_nb

Unnamed: 0_level_0,Unnamed: 1_level_0,content_id
part,difficulty,Unnamed: 2_level_1
1,1,38
1,2,21
1,3,11
2,1,34
2,2,44
2,3,30
3,1,45
3,2,49
3,3,35
4,1,37


In [18]:
textbook = pd.DataFrame([question for question in questions.content_id if question not in list(toeic_quest['content_id'])])

In [19]:
toeic_quest['content_id'].nunique()

966

In [22]:
textbook.set_axis(['content_id'], axis='columns', inplace=True)

In [24]:
textbook = textbook.merge(questions, on='content_id', how='inner')

In [25]:
textbook_per_parts_and_difficulty = pd.DataFrame(textbook.groupby(['part','difficulty'])['content_id'].count())

In [26]:
textbook_per_parts_and_difficulty['ratio %'] = round(textbook_per_parts_and_difficulty['content_id'] / 12523 * 100, 2)

In [28]:
comparison = pd.DataFrame(questions_per_parts_and_difficulty.merge(textbook_per_parts_and_difficulty, on=['part','difficulty']))

In [29]:
comparison.columns = ['total','ratio_total','textbook','ratio_textbook']

In [30]:
comparison

Unnamed: 0_level_0,Unnamed: 1_level_0,total,ratio_total,textbook,ratio_textbook
part,difficulty,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,559,4.13,523,4.18
1,2,295,2.18,274,2.19
1,3,138,1.02,127,1.01
2,1,449,3.32,417,3.33
2,2,796,5.89,752,6.0
2,3,402,2.97,373,2.98
3,1,553,4.09,510,4.07
3,2,591,4.37,545,4.35
3,3,418,3.09,385,3.07
4,1,512,3.79,475,3.79


In [33]:
print('nb questions total :', comparison['total'].sum())
print('nb questions toeic :', toeic_quest['content_id'].nunique())
print('nb questions textbook :', comparison['textbook'].sum())

nb questions total : 13523
nb questions toeic : 966
nb questions textbook : 12557


# Random Textbook

### Selection of random questions equally distributed among each part

In [111]:
def random_sample_sequence(df, n_quest):
    
    questions = pd.DataFrame(df.groupby(['part','content_id','difficulty'])['tag_lecture'].count()).reset_index()
    
    part_1 = questions[questions['part']==1].reset_index()
    part_2 = questions[questions['part']==2].reset_index()
    part_3 = questions[questions['part']==3].reset_index()
    part_4 = questions[questions['part']==4].reset_index()
    part_5 = questions[questions['part']==5].reset_index()
    part_6 = questions[questions['part']==6].reset_index()
    part_7 = questions[questions['part']==7].reset_index()
    
    i_1 = np.random.randint(0, len(part_1), n_quest)
    i_2 = np.random.randint(0, len(part_2), n_quest)
    i_3 = np.random.randint(0, len(part_3), n_quest)
    i_4 = np.random.randint(0, len(part_4), n_quest)
    i_5 = np.random.randint(0, len(part_5), n_quest)
    i_6 = np.random.randint(0, len(part_6), n_quest)
    i_7 = np.random.randint(0, len(part_7), n_quest)
    
    sequence_part_1 = []
    for i in i_1:
        sequence_part_1.append(part_1.loc[i, 'content_id'])
    
    sequence_part_2 = []
    for i in i_2:
        sequence_part_2.append(part_2.loc[i, 'content_id'])
        
    sequence_part_3 = []
    for i in i_3:
        sequence_part_3.append(part_3.loc[i, 'content_id'])    
    
    sequence_part_4 = []
    for i in i_4:
        sequence_part_4.append(part_4.loc[i, 'content_id'])
        
    sequence_part_5 = []
    for i in i_5:
        sequence_part_5.append(part_5.loc[i, 'content_id']) 
        
    sequence_part_6 = []
    for i in i_6:
        sequence_part_6.append(part_6.loc[i, 'content_id'])
    
    sequence_part_7 = []
    for i in i_7:
        sequence_part_7.append(part_7.loc[i, 'content_id'])
    
    return [sequence_part_1,sequence_part_2,sequence_part_3,sequence_part_4,sequence_part_5,sequence_part_6,sequence_part_7]

In [112]:
def get_random_training_path(df, number_of_sequences, n_quest):
    X = []

    for i in range(number_of_sequences):
        xi = random_sample_sequence(df, n_quest)
        X.append(xi)
        
    X = np.array(X)
        
    return X

In [147]:
random_path = get_random_training_path(textbook, 10, 5)

In [148]:
random_path

array([[[  133, 10444, 12922, 10403, 10563],
        [12141,  1033,   713,   908,   868],
        [11853,  1642,  2022,  1904,  2320],
        [ 3190,  3250,  3234, 13145,  2770],
        [ 4847, 13258,  8653,  6675, 13501],
        [10885, 10835, 10304, 10266,  6608],
        [ 7239,  7408,  7212, 11687,  7369]],

       [[ 7884, 10413, 10656, 12870,  7947],
        [12197,   737, 12004,  1379, 12066],
        [ 2463,  1677,  1703,  2053, 12318],
        [ 2633,  3518,  2857,  2948,  3486],
        [ 4313,  5928,  9413,  4443,  8718],
        [10160,  6626,  6492, 10302,  6561],
        [ 8058,  7426,  7361,  7237,  7396]],

       [[   52,    21, 10538,  7858, 12661],
        [  939,   341, 12104,  1270,  1193],
        [ 2504, 12356,  2049,  1660, 13239],
        [13140,  2705,  3399,  3424,  3234],
        [ 9450, 13363,  4090,  9235,  3822],
        [10871, 10237, 10881, 10771, 10882],
        [ 7748,  7826,  7354,  7198,  7641]],

       [[   66,    75, 10645,  7868, 10637],
    

# Evolutive Textbook based on difficulty

### Selection of questions equally distributed among each part & with progressive difficulty

In [133]:
def sample_sequence(df, n_quest):
    
    diff_1 = int(n_quest * 0.4)
    diff_2 = int(n_quest * 0.4)
    diff_3 = int(n_quest * 0.2)
    
    questions = pd.DataFrame(df.groupby(['part','content_id','difficulty'])['tag_lecture'].count()).reset_index()
    
    part_1_diff_1 = questions[(questions['part']==1) & (questions['difficulty']==1)].reset_index()
    part_1_diff_2 = questions[(questions['part']==1) & (questions['difficulty']==2)].reset_index()
    part_1_diff_3 = questions[(questions['part']==1) & (questions['difficulty']==3)].reset_index()
    part_2_diff_1 = questions[(questions['part']==2) & (questions['difficulty']==1)].reset_index()
    part_2_diff_2 = questions[(questions['part']==2) & (questions['difficulty']==2)].reset_index()
    part_2_diff_3 = questions[(questions['part']==2) & (questions['difficulty']==3)].reset_index()
    part_3_diff_1 = questions[(questions['part']==3) & (questions['difficulty']==1)].reset_index()
    part_3_diff_2 = questions[(questions['part']==3) & (questions['difficulty']==2)].reset_index()
    part_3_diff_3 = questions[(questions['part']==3) & (questions['difficulty']==3)].reset_index()
    part_4_diff_1 = questions[(questions['part']==4) & (questions['difficulty']==1)].reset_index()
    part_4_diff_2 = questions[(questions['part']==4) & (questions['difficulty']==2)].reset_index()
    part_4_diff_3 = questions[(questions['part']==4) & (questions['difficulty']==3)].reset_index()
    part_5_diff_1 = questions[(questions['part']==5) & (questions['difficulty']==1)].reset_index()
    part_5_diff_2 = questions[(questions['part']==5) & (questions['difficulty']==2)].reset_index()
    part_5_diff_3 = questions[(questions['part']==5) & (questions['difficulty']==3)].reset_index()
    part_6_diff_1 = questions[(questions['part']==6) & (questions['difficulty']==1)].reset_index()
    part_6_diff_2 = questions[(questions['part']==6) & (questions['difficulty']==2)].reset_index()
    part_6_diff_3 = questions[(questions['part']==6) & (questions['difficulty']==3)].reset_index()
    part_7_diff_1 = questions[(questions['part']==7) & (questions['difficulty']==1)].reset_index()
    part_7_diff_2 = questions[(questions['part']==7) & (questions['difficulty']==2)].reset_index()
    part_7_diff_3 = questions[(questions['part']==7) & (questions['difficulty']==3)].reset_index()
    
    i_1_1 = np.random.randint(0, len(part_1_diff_1), diff_1)
    i_1_2 = np.random.randint(0, len(part_1_diff_2), diff_2)
    i_1_3 = np.random.randint(0, len(part_1_diff_3), diff_3)
    i_2_1 = np.random.randint(0, len(part_2_diff_1), diff_1)
    i_2_2 = np.random.randint(0, len(part_2_diff_2), diff_2)
    i_2_3 = np.random.randint(0, len(part_2_diff_3), diff_3)
    i_3_1 = np.random.randint(0, len(part_3_diff_1), diff_1)
    i_3_2 = np.random.randint(0, len(part_3_diff_2), diff_2)
    i_3_3 = np.random.randint(0, len(part_3_diff_3), diff_3)
    i_4_1 = np.random.randint(0, len(part_4_diff_1), diff_1)
    i_4_2 = np.random.randint(0, len(part_4_diff_2), diff_2)
    i_4_3 = np.random.randint(0, len(part_4_diff_3), diff_3)
    i_5_1 = np.random.randint(0, len(part_5_diff_1), diff_1)
    i_5_2 = np.random.randint(0, len(part_5_diff_2), diff_2)
    i_5_3 = np.random.randint(0, len(part_5_diff_3), diff_3)
    i_6_1 = np.random.randint(0, len(part_6_diff_1), diff_1)
    i_6_2 = np.random.randint(0, len(part_6_diff_2), diff_2)
    i_6_3 = np.random.randint(0, len(part_6_diff_3), diff_3)
    i_7_1 = np.random.randint(0, len(part_7_diff_1), diff_1)
    i_7_2 = np.random.randint(0, len(part_7_diff_2), diff_2)
    i_7_3 = np.random.randint(0, len(part_7_diff_3), diff_3)
    
    sequence_part_1 = []
    for i in i_1_1:
        sequence_part_1.append(part_1_diff_1.loc[i, 'content_id'])
    for i in i_1_2:
        sequence_part_1.append(part_1_diff_2.loc[i, 'content_id'])
    for i in i_1_3:
        sequence_part_1.append(part_1_diff_3.loc[i, 'content_id'])
    
    sequence_part_2 = []
    for i in i_2_1:
        sequence_part_2.append(part_2_diff_1.loc[i, 'content_id'])
    for i in i_2_2:
        sequence_part_2.append(part_2_diff_2.loc[i, 'content_id'])
    for i in i_2_3:
        sequence_part_2.append(part_2_diff_3.loc[i, 'content_id'])
        
    sequence_part_3 = []
    for i in i_3_1:
        sequence_part_3.append(part_3_diff_1.loc[i, 'content_id'])
    for i in i_3_2:
        sequence_part_3.append(part_3_diff_2.loc[i, 'content_id'])
    for i in i_3_3:
        sequence_part_3.append(part_3_diff_3.loc[i, 'content_id'])
    
    sequence_part_4 = []
    for i in i_4_1:
        sequence_part_4.append(part_4_diff_1.loc[i, 'content_id'])
    for i in i_4_2:
        sequence_part_4.append(part_4_diff_2.loc[i, 'content_id'])
    for i in i_4_3:
        sequence_part_4.append(part_4_diff_3.loc[i, 'content_id'])
        
    sequence_part_5 = []
    for i in i_5_1:
        sequence_part_5.append(part_5_diff_1.loc[i, 'content_id'])
    for i in i_5_2:
        sequence_part_5.append(part_5_diff_2.loc[i, 'content_id'])
    for i in i_5_3:
        sequence_part_5.append(part_5_diff_3.loc[i, 'content_id'])
        
    sequence_part_6 = []
    for i in i_6_1:
        sequence_part_6.append(part_6_diff_1.loc[i, 'content_id'])
    for i in i_6_2:
        sequence_part_6.append(part_6_diff_2.loc[i, 'content_id'])
    for i in i_6_3:
        sequence_part_6.append(part_6_diff_3.loc[i, 'content_id'])
    
    sequence_part_7 = []
    for i in i_7_1:
        sequence_part_7.append(part_7_diff_1.loc[i, 'content_id'])
    for i in i_7_2:
        sequence_part_7.append(part_7_diff_2.loc[i, 'content_id'])
    for i in i_7_3:
        sequence_part_7.append(part_7_diff_3.loc[i, 'content_id'])
    
    return [sequence_part_1,sequence_part_2,sequence_part_3,sequence_part_4,sequence_part_5,sequence_part_6,sequence_part_7]

In [134]:
def get_training_path(df, number_of_sequences, n_quest):
    X = []

    for i in range(number_of_sequences):
        xi = sample_sequence(df, n_quest)
        X.append(xi)
        
    X = np.array(X)
        
    return X

In [143]:
evolutive_path = get_training_path(textbook, 10, 5)

In [145]:
evolutive_path

array([[[ 7884,  7936,  7926,   195, 10413],
        [ 1057,  1177,   615,  1380, 12240],
        [12378,  2413, 13206,  1705,  1781],
        [13108,  2641, 13120,  3094,  2976],
        [ 9813,  9558,  8213,  6251,  9829],
        [10887, 10015, 10226,  6772, 10320],
        [ 7371,  7042,  7120,  6948,  7390]],

       [[10406, 12795, 12687,    61,    39],
        [12232,   879,   663,   919,   449],
        [ 1634, 12392,  1670,  1842,  2060],
        [ 3107,  2656,  3486,  3208, 12494],
        [ 5316,  8693,  5786,  9213,  4529],
        [10381,  6483, 10307,  6615,  6694],
        [ 7055,  7041,  7305,  7342,  7439]],

       [[ 7949,  7877,    23, 12692, 12810],
        [ 1213,  1396,   321,  1066,   331],
        [11734, 11854,  1664,  1737, 13232],
        [13061, 13163, 11598, 12516, 10692],
        [ 4493,  4656,  5902,  5630,  9535],
        [10876, 11080,  6711, 11150, 10885],
        [ 8122,  7372,  8101,  7046,  8016]],

       [[12715,  7916, 12902, 10461,   157],
    