In [58]:
import pandas as pd
import numpy as np

In [59]:
questions = pd.read_csv('../data/simple_qstats_v2.csv')

In [60]:
questions.head()

Unnamed: 0,part,tag_lecture,content_id,qstats_count,qstats_answered_correctly
0,1,5.0,100,15386,0.3536
1,1,5.0,12882,161,0.3602
2,1,5.0,7859,47486,0.3812
3,1,5.0,157,33138,0.566
4,1,5.0,10426,8606,0.5681


In [61]:
questions_per_parts = pd.DataFrame(questions.groupby('part')['content_id'].count()).reset_index().set_index('part')

In [62]:
questions_per_parts['ratio %'] = round(questions_per_parts['content_id'] / 13523 * 100, 2)

In [63]:
questions_per_parts

Unnamed: 0_level_0,content_id,ratio %
part,Unnamed: 1_level_1,Unnamed: 2_level_1
1,992,7.34
2,1647,12.18
3,1562,11.55
4,1439,10.64
5,5511,40.75
6,1212,8.96
7,1160,8.58


In [64]:
def difficulty(question):
    if question < 0.66:
        difficulty = 3
    elif question < 0.83:
        difficulty = 2
    else:
        difficulty = 1
    return difficulty

In [65]:
questions['difficulty'] = questions['qstats_answered_correctly'].apply(difficulty)

In [66]:
questions.head()

Unnamed: 0,part,tag_lecture,content_id,qstats_count,qstats_answered_correctly,difficulty
0,1,5.0,100,15386,0.3536,3
1,1,5.0,12882,161,0.3602,3
2,1,5.0,7859,47486,0.3812,3
3,1,5.0,157,33138,0.566,3
4,1,5.0,10426,8606,0.5681,3


In [67]:
questions_per_parts_and_difficulty = pd.DataFrame(questions.groupby(['part','difficulty'])['content_id'].count())

In [68]:
questions_per_parts_and_difficulty['ratio %'] = round(questions_per_parts_and_difficulty['content_id'] / 13523 * 100, 2)

In [69]:
def toeic_questions(df):
    random_n = np.random.randint(0, 13524, 1000)
    return df['content_id'].get(random_n)

In [70]:
toeic_quest = pd.DataFrame(toeic_questions(questions))

In [71]:
toeic_quest = toeic_quest.merge(questions, on="content_id", how="inner")

In [72]:
toeic_quest_nb = pd.DataFrame(toeic_quest.groupby(['part','difficulty'])['content_id'].count())

In [73]:
toeic_quest_nb

Unnamed: 0_level_0,Unnamed: 1_level_0,content_id
part,difficulty,Unnamed: 2_level_1
1,1,39
1,2,24
1,3,12
2,1,22
2,2,66
2,3,27
3,1,40
3,2,49
3,3,36
4,1,45


In [74]:
textbook = pd.DataFrame([question for question in questions.content_id if question not in list(toeic_quest['content_id'])])

In [75]:
toeic_quest['content_id'].nunique()

962

In [76]:
textbook.set_axis(['content_id'], axis='columns', inplace=True)

In [77]:
textbook = textbook.merge(questions, on='content_id', how='inner')

In [78]:
textbook_per_parts_and_difficulty = pd.DataFrame(textbook.groupby(['part','difficulty'])['content_id'].count())

In [79]:
textbook_per_parts_and_difficulty['ratio %'] = round(textbook_per_parts_and_difficulty['content_id'] / 12523 * 100, 2)

In [80]:
comparison = pd.DataFrame(questions_per_parts_and_difficulty.merge(textbook_per_parts_and_difficulty, on=['part','difficulty']))

In [81]:
comparison.columns = ['total','ratio_total','textbook','ratio_textbook']

In [82]:
comparison

Unnamed: 0_level_0,Unnamed: 1_level_0,total,ratio_total,textbook,ratio_textbook
part,difficulty,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,559,4.13,522,4.17
1,2,295,2.18,271,2.16
1,3,138,1.02,126,1.01
2,1,449,3.32,428,3.42
2,2,796,5.89,733,5.85
2,3,402,2.97,375,2.99
3,1,553,4.09,515,4.11
3,2,591,4.37,543,4.34
3,3,418,3.09,385,3.07
4,1,512,3.79,470,3.75


In [83]:
print('nb questions total :', comparison['total'].sum())
print('nb questions toeic :', toeic_quest['content_id'].nunique())
print('nb questions textbook :', comparison['textbook'].sum())

nb questions total : 13523
nb questions toeic : 962
nb questions textbook : 12561


# Random Textbook

### Selection of random questions equally distributed among each part

In [84]:
def random_sample_sequence(df, n_quest):
    
    questions = pd.DataFrame(df.groupby(['part','content_id','difficulty'])['tag_lecture'].count()).reset_index()
    
    part_1 = questions[questions['part']==1].reset_index()
    part_2 = questions[questions['part']==2].reset_index()
    part_3 = questions[questions['part']==3].reset_index()
    part_4 = questions[questions['part']==4].reset_index()
    part_5 = questions[questions['part']==5].reset_index()
    part_6 = questions[questions['part']==6].reset_index()
    part_7 = questions[questions['part']==7].reset_index()
    
    i_1 = np.random.randint(0, len(part_1), n_quest)
    i_2 = np.random.randint(0, len(part_2), n_quest)
    i_3 = np.random.randint(0, len(part_3), n_quest)
    i_4 = np.random.randint(0, len(part_4), n_quest)
    i_5 = np.random.randint(0, len(part_5), n_quest)
    i_6 = np.random.randint(0, len(part_6), n_quest)
    i_7 = np.random.randint(0, len(part_7), n_quest)
    
    sequence_part_1 = []
    for i in i_1:
        sequence_part_1.append(part_1.loc[i, 'content_id'])
    
    sequence_part_2 = []
    for i in i_2:
        sequence_part_2.append(part_2.loc[i, 'content_id'])
        
    sequence_part_3 = []
    for i in i_3:
        sequence_part_3.append(part_3.loc[i, 'content_id'])    
    
    sequence_part_4 = []
    for i in i_4:
        sequence_part_4.append(part_4.loc[i, 'content_id'])
        
    sequence_part_5 = []
    for i in i_5:
        sequence_part_5.append(part_5.loc[i, 'content_id']) 
        
    sequence_part_6 = []
    for i in i_6:
        sequence_part_6.append(part_6.loc[i, 'content_id'])
    
    sequence_part_7 = []
    for i in i_7:
        sequence_part_7.append(part_7.loc[i, 'content_id'])
    
    return [sequence_part_1,sequence_part_2,sequence_part_3,sequence_part_4,sequence_part_5,sequence_part_6,sequence_part_7]

In [85]:
def get_random_training_path(df, number_of_sequences, n_quest):
    X = []

    for i in range(number_of_sequences):
        xi = random_sample_sequence(df, n_quest)
        X.append(xi)
        
    X = np.array(X)
        
    return X

In [94]:
random_path = get_random_training_path(textbook, 1, 10)

In [95]:
random = pd.DataFrame(random_path[0])\
#.merge(pd.DataFrame(random_path[1]), left_index=True, right_index=True)\
#.merge(pd.DataFrame(random_path[2]), left_index=True, right_index=True)\
#.merge(pd.DataFrame(random_path[3]), left_index=True, right_index=True)\
#.merge(pd.DataFrame(random_path[4]), left_index=True, right_index=True)\
#.merge(pd.DataFrame(random_path[5]), left_index=True, right_index=True)\
#.merge(pd.DataFrame(random_path[6]), left_index=True, right_index=True)\
#.merge(pd.DataFrame(random_path[7]), left_index=True, right_index=True)\
#.merge(pd.DataFrame(random_path[8]), left_index=True, right_index=True)\
#.merge(pd.DataFrame(random_path[9]), left_index=True, right_index=True)

In [96]:
random = random.T.reset_index(drop=True)

In [97]:
random.columns = ['part_1','part_2','part_3','part_4','part_5','part_6','part_7']

In [98]:
random

Unnamed: 0,part_1,part_2,part_3,part_4,part_5,part_6,part_7
0,145,512,2513,11601,6118,6542,7057
1,10591,1069,2010,10729,6346,10204,6957
2,95,11922,2523,3270,8270,10173,7487
3,12898,1008,2015,3336,11330,6585,7689
4,10393,1372,2008,3092,5849,11115,8116
5,7864,12255,2371,2562,11297,6542,7439
6,12728,1126,2127,2991,4227,6710,7579
7,7893,1215,13222,3131,5724,10278,7506
8,0,514,2524,2605,8607,10959,7510
9,8,388,12347,3173,3761,11182,7470


# Evolutive Textbook based on difficulty

### Selection of questions equally distributed among each part & with progressive difficulty

In [99]:
def sample_sequence(df, n_quest):
    
    diff_1 = int(n_quest * 0.4)
    diff_2 = int(n_quest * 0.4)
    diff_3 = int(n_quest * 0.2)
    
    questions = pd.DataFrame(df.groupby(['part','content_id','difficulty'])['tag_lecture'].count()).reset_index()
    
    part_1_diff_1 = questions[(questions['part']==1) & (questions['difficulty']==1)].reset_index()
    part_1_diff_2 = questions[(questions['part']==1) & (questions['difficulty']==2)].reset_index()
    part_1_diff_3 = questions[(questions['part']==1) & (questions['difficulty']==3)].reset_index()
    part_2_diff_1 = questions[(questions['part']==2) & (questions['difficulty']==1)].reset_index()
    part_2_diff_2 = questions[(questions['part']==2) & (questions['difficulty']==2)].reset_index()
    part_2_diff_3 = questions[(questions['part']==2) & (questions['difficulty']==3)].reset_index()
    part_3_diff_1 = questions[(questions['part']==3) & (questions['difficulty']==1)].reset_index()
    part_3_diff_2 = questions[(questions['part']==3) & (questions['difficulty']==2)].reset_index()
    part_3_diff_3 = questions[(questions['part']==3) & (questions['difficulty']==3)].reset_index()
    part_4_diff_1 = questions[(questions['part']==4) & (questions['difficulty']==1)].reset_index()
    part_4_diff_2 = questions[(questions['part']==4) & (questions['difficulty']==2)].reset_index()
    part_4_diff_3 = questions[(questions['part']==4) & (questions['difficulty']==3)].reset_index()
    part_5_diff_1 = questions[(questions['part']==5) & (questions['difficulty']==1)].reset_index()
    part_5_diff_2 = questions[(questions['part']==5) & (questions['difficulty']==2)].reset_index()
    part_5_diff_3 = questions[(questions['part']==5) & (questions['difficulty']==3)].reset_index()
    part_6_diff_1 = questions[(questions['part']==6) & (questions['difficulty']==1)].reset_index()
    part_6_diff_2 = questions[(questions['part']==6) & (questions['difficulty']==2)].reset_index()
    part_6_diff_3 = questions[(questions['part']==6) & (questions['difficulty']==3)].reset_index()
    part_7_diff_1 = questions[(questions['part']==7) & (questions['difficulty']==1)].reset_index()
    part_7_diff_2 = questions[(questions['part']==7) & (questions['difficulty']==2)].reset_index()
    part_7_diff_3 = questions[(questions['part']==7) & (questions['difficulty']==3)].reset_index()
    
    i_1_1 = np.random.randint(0, len(part_1_diff_1), diff_1)
    i_1_2 = np.random.randint(0, len(part_1_diff_2), diff_2)
    i_1_3 = np.random.randint(0, len(part_1_diff_3), diff_3)
    i_2_1 = np.random.randint(0, len(part_2_diff_1), diff_1)
    i_2_2 = np.random.randint(0, len(part_2_diff_2), diff_2)
    i_2_3 = np.random.randint(0, len(part_2_diff_3), diff_3)
    i_3_1 = np.random.randint(0, len(part_3_diff_1), diff_1)
    i_3_2 = np.random.randint(0, len(part_3_diff_2), diff_2)
    i_3_3 = np.random.randint(0, len(part_3_diff_3), diff_3)
    i_4_1 = np.random.randint(0, len(part_4_diff_1), diff_1)
    i_4_2 = np.random.randint(0, len(part_4_diff_2), diff_2)
    i_4_3 = np.random.randint(0, len(part_4_diff_3), diff_3)
    i_5_1 = np.random.randint(0, len(part_5_diff_1), diff_1)
    i_5_2 = np.random.randint(0, len(part_5_diff_2), diff_2)
    i_5_3 = np.random.randint(0, len(part_5_diff_3), diff_3)
    i_6_1 = np.random.randint(0, len(part_6_diff_1), diff_1)
    i_6_2 = np.random.randint(0, len(part_6_diff_2), diff_2)
    i_6_3 = np.random.randint(0, len(part_6_diff_3), diff_3)
    i_7_1 = np.random.randint(0, len(part_7_diff_1), diff_1)
    i_7_2 = np.random.randint(0, len(part_7_diff_2), diff_2)
    i_7_3 = np.random.randint(0, len(part_7_diff_3), diff_3)
    
    sequence_part_1 = []
    for i in i_1_1:
        sequence_part_1.append(part_1_diff_1.loc[i, 'content_id'])
    for i in i_1_2:
        sequence_part_1.append(part_1_diff_2.loc[i, 'content_id'])
    for i in i_1_3:
        sequence_part_1.append(part_1_diff_3.loc[i, 'content_id'])
    
    sequence_part_2 = []
    for i in i_2_1:
        sequence_part_2.append(part_2_diff_1.loc[i, 'content_id'])
    for i in i_2_2:
        sequence_part_2.append(part_2_diff_2.loc[i, 'content_id'])
    for i in i_2_3:
        sequence_part_2.append(part_2_diff_3.loc[i, 'content_id'])
        
    sequence_part_3 = []
    for i in i_3_1:
        sequence_part_3.append(part_3_diff_1.loc[i, 'content_id'])
    for i in i_3_2:
        sequence_part_3.append(part_3_diff_2.loc[i, 'content_id'])
    for i in i_3_3:
        sequence_part_3.append(part_3_diff_3.loc[i, 'content_id'])
    
    sequence_part_4 = []
    for i in i_4_1:
        sequence_part_4.append(part_4_diff_1.loc[i, 'content_id'])
    for i in i_4_2:
        sequence_part_4.append(part_4_diff_2.loc[i, 'content_id'])
    for i in i_4_3:
        sequence_part_4.append(part_4_diff_3.loc[i, 'content_id'])
        
    sequence_part_5 = []
    for i in i_5_1:
        sequence_part_5.append(part_5_diff_1.loc[i, 'content_id'])
    for i in i_5_2:
        sequence_part_5.append(part_5_diff_2.loc[i, 'content_id'])
    for i in i_5_3:
        sequence_part_5.append(part_5_diff_3.loc[i, 'content_id'])
        
    sequence_part_6 = []
    for i in i_6_1:
        sequence_part_6.append(part_6_diff_1.loc[i, 'content_id'])
    for i in i_6_2:
        sequence_part_6.append(part_6_diff_2.loc[i, 'content_id'])
    for i in i_6_3:
        sequence_part_6.append(part_6_diff_3.loc[i, 'content_id'])
    
    sequence_part_7 = []
    for i in i_7_1:
        sequence_part_7.append(part_7_diff_1.loc[i, 'content_id'])
    for i in i_7_2:
        sequence_part_7.append(part_7_diff_2.loc[i, 'content_id'])
    for i in i_7_3:
        sequence_part_7.append(part_7_diff_3.loc[i, 'content_id'])
    
    return [sequence_part_1,sequence_part_2,sequence_part_3,sequence_part_4,sequence_part_5,sequence_part_6,sequence_part_7]

In [100]:
def get_training_path(df, number_of_sequences, n_quest):
    X = []

    for i in range(number_of_sequences):
        xi = sample_sequence(df, n_quest)
        X.append(xi)
        
    X = np.array(X)
        
    return X

In [101]:
evolutive_path = get_training_path(textbook, 1, 10)

In [102]:
evolutive = pd.DataFrame(evolutive_path[0])\
#.merge(pd.DataFrame(evolutive_path[1]), left_index=True, right_index=True)\
#.merge(pd.DataFrame(evolutive_path[2]), left_index=True, right_index=True)\
#.merge(pd.DataFrame(evolutive_path[3]), left_index=True, right_index=True)\
#.merge(pd.DataFrame(evolutive_path[4]), left_index=True, right_index=True)\
#.merge(pd.DataFrame(evolutive_path[5]), left_index=True, right_index=True)\
#.merge(pd.DataFrame(evolutive_path[6]), left_index=True, right_index=True)\
#.merge(pd.DataFrame(evolutive_path[7]), left_index=True, right_index=True)\
#.merge(pd.DataFrame(evolutive_path[8]), left_index=True, right_index=True)\
#.merge(pd.DataFrame(evolutive_path[9]), left_index=True, right_index=True)

In [103]:
evolutive = evolutive.T.reset_index(drop=True)

In [104]:
evolutive.columns = ['part_1','part_2','part_3','part_4','part_5','part_6','part_7']

In [105]:
evolutive

Unnamed: 0,part_1,part_2,part_3,part_4,part_5,part_6,part_7
0,10479,12077,2549,12435,5814,10298,7199
1,10445,836,12283,2723,6398,10887,7256
2,10571,793,1689,2888,6098,6502,7318
3,12868,12208,12981,13122,5411,11026,7114
4,54,1366,2389,3540,4364,6772,7525
5,10572,1261,1487,3111,5559,10875,7463
6,7976,949,11747,2565,9210,10231,7442
7,145,429,2316,3155,4482,6749,7345
8,12648,526,2259,2632,11720,10924,7029
9,12873,651,1929,3148,4456,6500,7768
