In [2]:
import pandas as pd
import numpy as np

In [3]:
questions = pd.read_csv('../data/simple_qstats_v2.csv')

In [4]:
questions.head()

Unnamed: 0,part,tag_lecture,content_id,qstats_count,qstats_answered_correctly
0,1,5.0,100,15386,0.3536
1,1,5.0,12882,161,0.3602
2,1,5.0,7859,47486,0.3812
3,1,5.0,157,33138,0.566
4,1,5.0,10426,8606,0.5681


In [5]:
questions_per_parts = pd.DataFrame(questions.groupby('part')['content_id'].count()).reset_index().set_index('part')

In [6]:
questions_per_parts['ratio %'] = round(questions_per_parts['content_id'] / 13523 * 100, 2)

In [7]:
questions_per_parts

Unnamed: 0_level_0,content_id,ratio %
part,Unnamed: 1_level_1,Unnamed: 2_level_1
1,992,7.34
2,1647,12.18
3,1562,11.55
4,1439,10.64
5,5511,40.75
6,1212,8.96
7,1160,8.58


In [8]:
def difficulty(question):
    if question < 0.66:
        difficulty = 3
    elif question < 0.83:
        difficulty = 2
    else:
        difficulty = 1
    return difficulty

In [9]:
questions['difficulty'] = questions['qstats_answered_correctly'].apply(difficulty)

In [10]:
questions.head()

Unnamed: 0,part,tag_lecture,content_id,qstats_count,qstats_answered_correctly,difficulty
0,1,5.0,100,15386,0.3536,3
1,1,5.0,12882,161,0.3602,3
2,1,5.0,7859,47486,0.3812,3
3,1,5.0,157,33138,0.566,3
4,1,5.0,10426,8606,0.5681,3


In [11]:
questions_per_parts_and_difficulty = pd.DataFrame(questions.groupby(['part','difficulty'])['content_id'].count())

In [12]:
questions_per_parts_and_difficulty['ratio %'] = round(questions_per_parts_and_difficulty['content_id'] / 13523 * 100, 2)

In [13]:
def toeic_questions(df):
    random_n = np.random.randint(0, 13524, 1000)
    return df['content_id'].get(random_n)

In [14]:
toeic_quest = pd.DataFrame(toeic_questions(questions))

In [15]:
toeic_quest = toeic_quest.merge(questions, on="content_id", how="inner")

In [16]:
toeic_quest_nb = pd.DataFrame(toeic_quest.groupby(['part','difficulty'])['content_id'].count())

In [17]:
toeic_quest_nb

Unnamed: 0_level_0,Unnamed: 1_level_0,content_id
part,difficulty,Unnamed: 2_level_1
1,1,41
1,2,13
1,3,9
2,1,32
2,2,55
2,3,33
3,1,32
3,2,46
3,3,24
4,1,36


In [18]:
textbook = pd.DataFrame([question for question in questions.content_id if question not in list(toeic_quest['content_id'])])

In [19]:
toeic_quest['content_id'].nunique()

963

In [20]:
textbook.set_axis(['content_id'], axis='columns', inplace=True)

In [21]:
textbook = textbook.merge(questions, on='content_id', how='inner')

In [22]:
textbook_per_parts_and_difficulty = pd.DataFrame(textbook.groupby(['part','difficulty'])['content_id'].count())

In [23]:
textbook_per_parts_and_difficulty['ratio %'] = round(textbook_per_parts_and_difficulty['content_id'] / 12523 * 100, 2)

In [24]:
comparison = pd.DataFrame(questions_per_parts_and_difficulty.merge(textbook_per_parts_and_difficulty, on=['part','difficulty']))

In [25]:
comparison.columns = ['total','ratio_total','textbook','ratio_textbook']

In [26]:
comparison

Unnamed: 0_level_0,Unnamed: 1_level_0,total,ratio_total,textbook,ratio_textbook
part,difficulty,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,559,4.13,522,4.17
1,2,295,2.18,282,2.25
1,3,138,1.02,129,1.03
2,1,449,3.32,417,3.33
2,2,796,5.89,742,5.93
2,3,402,2.97,370,2.95
3,1,553,4.09,522,4.17
3,2,591,4.37,548,4.38
3,3,418,3.09,394,3.15
4,1,512,3.79,476,3.8


In [27]:
print('nb questions total :', comparison['total'].sum())
print('nb questions toeic :', toeic_quest['content_id'].nunique())
print('nb questions textbook :', comparison['textbook'].sum())

nb questions total : 13523
nb questions toeic : 963
nb questions textbook : 12560


# Random Textbook

### Selection of random questions equally distributed among each part

In [28]:
def random_sample_sequence(df, n_quest):
    
    questions = pd.DataFrame(df.groupby(['part','content_id','difficulty'])['tag_lecture'].count()).reset_index()
    
    part_1 = questions[questions['part']==1].reset_index()
    part_2 = questions[questions['part']==2].reset_index()
    part_3 = questions[questions['part']==3].reset_index()
    part_4 = questions[questions['part']==4].reset_index()
    part_5 = questions[questions['part']==5].reset_index()
    part_6 = questions[questions['part']==6].reset_index()
    part_7 = questions[questions['part']==7].reset_index()
    
    i_1 = np.random.randint(0, len(part_1), n_quest)
    i_2 = np.random.randint(0, len(part_2), n_quest)
    i_3 = np.random.randint(0, len(part_3), n_quest)
    i_4 = np.random.randint(0, len(part_4), n_quest)
    i_5 = np.random.randint(0, len(part_5), n_quest)
    i_6 = np.random.randint(0, len(part_6), n_quest)
    i_7 = np.random.randint(0, len(part_7), n_quest)
    
    sequence_part = []
    for i in i_1:
        sequence_part.append(part_1.loc[i, 'content_id'])
    
    for i in i_2:
        sequence_part.append(part_2.loc[i, 'content_id'])
        
    for i in i_3:
        sequence_part.append(part_3.loc[i, 'content_id'])    

    for i in i_4:
        sequence_part.append(part_4.loc[i, 'content_id'])
        
    for i in i_5:
        sequence_part.append(part_5.loc[i, 'content_id']) 
        
    for i in i_6:
        sequence_part.append(part_6.loc[i, 'content_id'])
    
    for i in i_7:
        sequence_part.append(part_7.loc[i, 'content_id'])
    
    return np.array(sequence_part)

In [29]:
def get_random_training_path(df, number_of_sequences, n_quest):
    X = []

    for i in range(number_of_sequences):
        xi = random_sample_sequence(df, n_quest)
        X.append(xi)
        
    X = np.array(X)
        
    return X

In [30]:
random_path = get_random_training_path(textbook, 24, 30)

In [31]:
random_path.shape

(24, 210)

In [32]:
sequence_random = pd.DataFrame(random_path.reshape(5040,))

In [33]:
sequence_random.columns = ['content_id']

In [34]:
sequence_random_merged = sequence_random.merge(questions, on="content_id", how='left')

In [35]:
sequence_random_merged

Unnamed: 0,content_id,part,tag_lecture,qstats_count,qstats_answered_correctly,difficulty
0,10616,1,93.0,14479,0.3321,3
1,10507,1,94.0,6277,0.8025,2
2,7855,1,9.0,7241,0.8187,2
3,12692,1,61.0,171,0.8246,2
4,12876,1,9.0,158,0.8544,1
5,161,1,40.0,12178,0.9144,1
6,12804,1,130.0,169,0.9645,1
7,12627,1,104.0,171,0.8830,1
8,66,1,104.0,6433,0.9375,1
9,7883,1,178.0,11122,0.9301,1


# Evolutive Textbook based on difficulty

### Selection of questions equally distributed among each part & with progressive difficulty

In [36]:
def sample_sequence(df, n_quest):
  
    questions = pd.DataFrame(df.groupby(['part','content_id','difficulty'])['tag_lecture'].count()).reset_index()
    
    part_1_diff_1 = questions[(questions['part']==1) & (questions['difficulty']==1)].reset_index()
    part_1_diff_2 = questions[(questions['part']==1) & (questions['difficulty']==2)].reset_index()
    part_1_diff_3 = questions[(questions['part']==1) & (questions['difficulty']==3)].reset_index()
    part_2_diff_1 = questions[(questions['part']==2) & (questions['difficulty']==1)].reset_index()
    part_2_diff_2 = questions[(questions['part']==2) & (questions['difficulty']==2)].reset_index()
    part_2_diff_3 = questions[(questions['part']==2) & (questions['difficulty']==3)].reset_index()
    part_3_diff_1 = questions[(questions['part']==3) & (questions['difficulty']==1)].reset_index()
    part_3_diff_2 = questions[(questions['part']==3) & (questions['difficulty']==2)].reset_index()
    part_3_diff_3 = questions[(questions['part']==3) & (questions['difficulty']==3)].reset_index()
    part_4_diff_1 = questions[(questions['part']==4) & (questions['difficulty']==1)].reset_index()
    part_4_diff_2 = questions[(questions['part']==4) & (questions['difficulty']==2)].reset_index()
    part_4_diff_3 = questions[(questions['part']==4) & (questions['difficulty']==3)].reset_index()
    part_5_diff_1 = questions[(questions['part']==5) & (questions['difficulty']==1)].reset_index()
    part_5_diff_2 = questions[(questions['part']==5) & (questions['difficulty']==2)].reset_index()
    part_5_diff_3 = questions[(questions['part']==5) & (questions['difficulty']==3)].reset_index()
    part_6_diff_1 = questions[(questions['part']==6) & (questions['difficulty']==1)].reset_index()
    part_6_diff_2 = questions[(questions['part']==6) & (questions['difficulty']==2)].reset_index()
    part_6_diff_3 = questions[(questions['part']==6) & (questions['difficulty']==3)].reset_index()
    part_7_diff_1 = questions[(questions['part']==7) & (questions['difficulty']==1)].reset_index()
    part_7_diff_2 = questions[(questions['part']==7) & (questions['difficulty']==2)].reset_index()
    part_7_diff_3 = questions[(questions['part']==7) & (questions['difficulty']==3)].reset_index()
    
    i_1_1 = np.random.randint(0, len(part_1_diff_1), n_quest)
    i_1_2 = np.random.randint(0, len(part_1_diff_2), n_quest)
    i_1_3 = np.random.randint(0, len(part_1_diff_3), n_quest)
    i_2_1 = np.random.randint(0, len(part_2_diff_1), n_quest)
    i_2_2 = np.random.randint(0, len(part_2_diff_2), n_quest)
    i_2_3 = np.random.randint(0, len(part_2_diff_3), n_quest)
    i_3_1 = np.random.randint(0, len(part_3_diff_1), n_quest)
    i_3_2 = np.random.randint(0, len(part_3_diff_2), n_quest)
    i_3_3 = np.random.randint(0, len(part_3_diff_3), n_quest)
    i_4_1 = np.random.randint(0, len(part_4_diff_1), n_quest)
    i_4_2 = np.random.randint(0, len(part_4_diff_2), n_quest)
    i_4_3 = np.random.randint(0, len(part_4_diff_3), n_quest)
    i_5_1 = np.random.randint(0, len(part_5_diff_1), n_quest)
    i_5_2 = np.random.randint(0, len(part_5_diff_2), n_quest)
    i_5_3 = np.random.randint(0, len(part_5_diff_3), n_quest)
    i_6_1 = np.random.randint(0, len(part_6_diff_1), n_quest)
    i_6_2 = np.random.randint(0, len(part_6_diff_2), n_quest)
    i_6_3 = np.random.randint(0, len(part_6_diff_3), n_quest)
    i_7_1 = np.random.randint(0, len(part_7_diff_1), n_quest)
    i_7_2 = np.random.randint(0, len(part_7_diff_2), n_quest)
    i_7_3 = np.random.randint(0, len(part_7_diff_3), n_quest)
    
    sequence_part_diff_1 = []
    for i in i_1_1:
        sequence_part_diff_1.append(part_1_diff_1.loc[i, 'content_id'])
    for i in i_2_1:
        sequence_part_diff_1.append(part_2_diff_1.loc[i, 'content_id'])    
    for i in i_3_1:
        sequence_part_diff_1.append(part_3_diff_1.loc[i, 'content_id'])
    for i in i_4_1:
        sequence_part_diff_1.append(part_4_diff_1.loc[i, 'content_id'])
    for i in i_5_1:
        sequence_part_diff_1.append(part_5_diff_1.loc[i, 'content_id'])
    for i in i_6_1:
        sequence_part_diff_1.append(part_6_diff_1.loc[i, 'content_id'])
    for i in i_7_1:
        sequence_part_diff_1.append(part_7_diff_1.loc[i, 'content_id'])
       
    sequence_part_diff_2 = []
    for i in i_1_2:
        sequence_part_diff_2.append(part_1_diff_2.loc[i, 'content_id'])
    for i in i_2_2:
        sequence_part_diff_2.append(part_2_diff_2.loc[i, 'content_id'])
    for i in i_3_2:
        sequence_part_diff_2.append(part_3_diff_2.loc[i, 'content_id'])
    for i in i_4_2:
        sequence_part_diff_2.append(part_4_diff_2.loc[i, 'content_id'])
    for i in i_5_2:
        sequence_part_diff_2.append(part_5_diff_2.loc[i, 'content_id'])
    for i in i_6_2:
        sequence_part_diff_2.append(part_6_diff_2.loc[i, 'content_id'])
    for i in i_7_2:
        sequence_part_diff_2.append(part_7_diff_2.loc[i, 'content_id'])
        
    sequence_part_diff_3 = []
    for i in i_1_3:
        sequence_part_diff_3.append(part_1_diff_3.loc[i, 'content_id'])
    for i in i_2_3:
        sequence_part_diff_3.append(part_2_diff_3.loc[i, 'content_id'])
    for i in i_3_3:
        sequence_part_diff_3.append(part_3_diff_3.loc[i, 'content_id'])
    for i in i_4_3:
        sequence_part_diff_3.append(part_4_diff_3.loc[i, 'content_id'])    
    for i in i_5_3:
        sequence_part_diff_3.append(part_5_diff_3.loc[i, 'content_id'])
    for i in i_6_3:
        sequence_part_diff_3.append(part_6_diff_3.loc[i, 'content_id'])
    for i in i_7_3:
        sequence_part_diff_3.append(part_7_diff_3.loc[i, 'content_id'])

    
    return [sequence_part_diff_1,sequence_part_diff_2,sequence_part_diff_3]

In [37]:
def get_training_path(df, number_of_sequences, n_quest):
    X = []

    for i in range(number_of_sequences):
        xi = sample_sequence(df, n_quest)
        X.append(xi)
        
    X = np.array(X)
        
    return X

In [38]:
evolutive_path = get_training_path(textbook, 8, 30)

In [39]:
evolutive_path.shape

(8, 3, 210)

In [40]:
sequence_sorted = pd.DataFrame(evolutive_path.reshape(24,210).reshape(5040,))

In [41]:
sequence_sorted.columns = ['content_id']

In [42]:
sequence_sorted_merged = sequence_sorted.merge(questions, on='content_id', how='left')

In [43]:
sequence_sorted_merged

Unnamed: 0,content_id,part,tag_lecture,qstats_count,qstats_answered_correctly,difficulty
0,111,1,101.0,12231,0.8527,1
1,12656,1,93.0,153,0.9739,1
2,12684,1,61.0,168,0.9286,1
3,43,1,13.0,8622,0.9284,1
4,7982,1,9.0,6925,0.8660,1
5,10605,1,68.0,6987,0.8845,1
6,10466,1,130.0,5812,0.9326,1
7,10553,1,99.0,6651,0.8645,1
8,84,1,36.0,8368,0.8384,1
9,70,1,51.0,6537,0.9674,1


In [44]:
sequence_random_merged.to_csv('sequence_random.csv', index=False)

In [46]:
sequence_sorted_merged.to_csv('sequence_sorted.csv', index=False)

In [47]:
toeic_quest.to_csv('toeic_question.csv', index=False)