In [188]:
import pandas as pd
import numpy as np

In [189]:
questions = pd.read_csv('../data/simple_qstats_v2.csv')

In [190]:
questions.head()

Unnamed: 0,part,tag_lecture,content_id,qstats_count,qstats_answered_correctly
0,1,5.0,100,15386,0.3536
1,1,5.0,12882,161,0.3602
2,1,5.0,7859,47486,0.3812
3,1,5.0,157,33138,0.566
4,1,5.0,10426,8606,0.5681


In [191]:
questions_per_parts = pd.DataFrame(questions.groupby('part')['content_id'].count()).reset_index().set_index('part')

In [192]:
questions_per_parts['ratio %'] = round(questions_per_parts['content_id'] / 13523 * 100, 2)

In [193]:
questions_per_parts

Unnamed: 0_level_0,content_id,ratio %
part,Unnamed: 1_level_1,Unnamed: 2_level_1
1,992,7.34
2,1647,12.18
3,1562,11.55
4,1439,10.64
5,5511,40.75
6,1212,8.96
7,1160,8.58


In [194]:
def difficulty(question):
    if question < 0.66:
        difficulty = 3
    elif question < 0.83:
        difficulty = 2
    else:
        difficulty = 1
    return difficulty

In [195]:
questions['difficulty'] = questions['qstats_answered_correctly'].apply(difficulty)

In [196]:
questions.head()

Unnamed: 0,part,tag_lecture,content_id,qstats_count,qstats_answered_correctly,difficulty
0,1,5.0,100,15386,0.3536,3
1,1,5.0,12882,161,0.3602,3
2,1,5.0,7859,47486,0.3812,3
3,1,5.0,157,33138,0.566,3
4,1,5.0,10426,8606,0.5681,3


In [197]:
questions_per_parts_and_difficulty = pd.DataFrame(questions.groupby(['part','difficulty'])['content_id'].count())

In [198]:
questions_per_parts_and_difficulty['ratio %'] = round(questions_per_parts_and_difficulty['content_id'] / 13523 * 100, 2)

In [199]:
def toeic_questions(df):
    random_n = np.random.randint(0, 13524, 1000)
    return df['content_id'].get(random_n)

In [200]:
toeic_quest = pd.DataFrame(toeic_questions(questions))

In [201]:
toeic_quest = toeic_quest.merge(questions, on="content_id", how="inner")

In [202]:
toeic_quest_nb = pd.DataFrame(toeic_quest.groupby(['part','difficulty'])['content_id'].count())

In [203]:
toeic_quest_nb

Unnamed: 0_level_0,Unnamed: 1_level_0,content_id
part,difficulty,Unnamed: 2_level_1
1,1,42
1,2,21
1,3,19
2,1,27
2,2,67
2,3,31
3,1,42
3,2,42
3,3,34
4,1,33


In [204]:
textbook = pd.DataFrame([question for question in questions.content_id if question not in list(toeic_quest['content_id'])])

In [205]:
toeic_quest['content_id'].nunique()

965

In [206]:
textbook.set_axis(['content_id'], axis='columns', inplace=True)

In [207]:
textbook = textbook.merge(questions, on='content_id', how='inner')

In [208]:
textbook_per_parts_and_difficulty = pd.DataFrame(textbook.groupby(['part','difficulty'])['content_id'].count())

In [209]:
textbook_per_parts_and_difficulty['ratio %'] = round(textbook_per_parts_and_difficulty['content_id'] / 12523 * 100, 2)

In [210]:
comparison = pd.DataFrame(questions_per_parts_and_difficulty.merge(textbook_per_parts_and_difficulty, on=['part','difficulty']))

In [211]:
comparison.columns = ['total','ratio_total','textbook','ratio_textbook']

In [212]:
comparison

Unnamed: 0_level_0,Unnamed: 1_level_0,total,ratio_total,textbook,ratio_textbook
part,difficulty,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,559,4.13,518,4.14
1,2,295,2.18,274,2.19
1,3,138,1.02,119,0.95
2,1,449,3.32,423,3.38
2,2,796,5.89,733,5.85
2,3,402,2.97,374,2.99
3,1,553,4.09,511,4.08
3,2,591,4.37,550,4.39
3,3,418,3.09,384,3.07
4,1,512,3.79,480,3.83


In [213]:
print('nb questions total :', comparison['total'].sum())
print('nb questions toeic :', toeic_quest['content_id'].nunique())
print('nb questions textbook :', comparison['textbook'].sum())

nb questions total : 13523
nb questions toeic : 965
nb questions textbook : 12558


# Random Textbook

### Selection of random questions equally distributed among each part

In [214]:
def random_sample_sequence(df, n_quest):
    
    questions = pd.DataFrame(df.groupby(['part','content_id','difficulty'])['tag_lecture'].count()).reset_index()
    
    part_1 = questions[questions['part']==1].reset_index()
    part_2 = questions[questions['part']==2].reset_index()
    part_3 = questions[questions['part']==3].reset_index()
    part_4 = questions[questions['part']==4].reset_index()
    part_5 = questions[questions['part']==5].reset_index()
    part_6 = questions[questions['part']==6].reset_index()
    part_7 = questions[questions['part']==7].reset_index()
    
    i_1 = np.random.randint(0, len(part_1), n_quest)
    i_2 = np.random.randint(0, len(part_2), n_quest)
    i_3 = np.random.randint(0, len(part_3), n_quest)
    i_4 = np.random.randint(0, len(part_4), n_quest)
    i_5 = np.random.randint(0, len(part_5), n_quest)
    i_6 = np.random.randint(0, len(part_6), n_quest)
    i_7 = np.random.randint(0, len(part_7), n_quest)
    
    sequence_part = []
    for i in i_1:
        sequence_part.append(part_1.loc[i, 'content_id'])
    
    for i in i_2:
        sequence_part.append(part_2.loc[i, 'content_id'])
        
    for i in i_3:
        sequence_part.append(part_3.loc[i, 'content_id'])    

    for i in i_4:
        sequence_part.append(part_4.loc[i, 'content_id'])
        
    for i in i_5:
        sequence_part.append(part_5.loc[i, 'content_id']) 
        
    for i in i_6:
        sequence_part.append(part_6.loc[i, 'content_id'])
    
    for i in i_7:
        sequence_part.append(part_7.loc[i, 'content_id'])
    
    return np.array(sequence_part)

In [215]:
def get_random_training_path(df, number_of_sequences, n_quest):
    X = []

    for i in range(number_of_sequences):
        xi = random_sample_sequence(df, n_quest)
        X.append(xi)
        
    X = np.array(X)
        
    return X

In [219]:
random_path = get_random_training_path(textbook, 24, 30)

In [221]:
random_path.shape

(24, 210)

In [222]:
sequence_random = pd.DataFrame(random_path.reshape(5040,))

In [223]:
sequence_random.columns = ['content_id']

In [233]:
sequence_random_merged = sequence_random.merge(questions, on="content_id", how='left')

In [234]:
sequence_random_merged

Unnamed: 0,content_id,part,tag_lecture,qstats_count,qstats_answered_correctly,difficulty
0,12705,1,61.0,194,0.2835,3
1,12766,1,99.0,178,0.9719,1
2,10636,1,9.0,6044,0.8375,1
3,10633,1,130.0,7538,0.8768,1
4,7883,1,178.0,11122,0.9301,1
5,54,1,9.0,9500,0.7114,2
6,12702,1,9.0,179,0.7039,2
7,10618,1,5.0,5645,0.9725,1
8,10559,1,51.0,6324,0.9058,1
9,10665,1,104.0,5982,0.9285,1


# Evolutive Textbook based on difficulty

### Selection of questions equally distributed among each part & with progressive difficulty

In [225]:
def sample_sequence(df, n_quest):
  
    questions = pd.DataFrame(df.groupby(['part','content_id','difficulty'])['tag_lecture'].count()).reset_index()
    
    part_1_diff_1 = questions[(questions['part']==1) & (questions['difficulty']==1)].reset_index()
    part_1_diff_2 = questions[(questions['part']==1) & (questions['difficulty']==2)].reset_index()
    part_1_diff_3 = questions[(questions['part']==1) & (questions['difficulty']==3)].reset_index()
    part_2_diff_1 = questions[(questions['part']==2) & (questions['difficulty']==1)].reset_index()
    part_2_diff_2 = questions[(questions['part']==2) & (questions['difficulty']==2)].reset_index()
    part_2_diff_3 = questions[(questions['part']==2) & (questions['difficulty']==3)].reset_index()
    part_3_diff_1 = questions[(questions['part']==3) & (questions['difficulty']==1)].reset_index()
    part_3_diff_2 = questions[(questions['part']==3) & (questions['difficulty']==2)].reset_index()
    part_3_diff_3 = questions[(questions['part']==3) & (questions['difficulty']==3)].reset_index()
    part_4_diff_1 = questions[(questions['part']==4) & (questions['difficulty']==1)].reset_index()
    part_4_diff_2 = questions[(questions['part']==4) & (questions['difficulty']==2)].reset_index()
    part_4_diff_3 = questions[(questions['part']==4) & (questions['difficulty']==3)].reset_index()
    part_5_diff_1 = questions[(questions['part']==5) & (questions['difficulty']==1)].reset_index()
    part_5_diff_2 = questions[(questions['part']==5) & (questions['difficulty']==2)].reset_index()
    part_5_diff_3 = questions[(questions['part']==5) & (questions['difficulty']==3)].reset_index()
    part_6_diff_1 = questions[(questions['part']==6) & (questions['difficulty']==1)].reset_index()
    part_6_diff_2 = questions[(questions['part']==6) & (questions['difficulty']==2)].reset_index()
    part_6_diff_3 = questions[(questions['part']==6) & (questions['difficulty']==3)].reset_index()
    part_7_diff_1 = questions[(questions['part']==7) & (questions['difficulty']==1)].reset_index()
    part_7_diff_2 = questions[(questions['part']==7) & (questions['difficulty']==2)].reset_index()
    part_7_diff_3 = questions[(questions['part']==7) & (questions['difficulty']==3)].reset_index()
    
    i_1_1 = np.random.randint(0, len(part_1_diff_1), n_quest)
    i_1_2 = np.random.randint(0, len(part_1_diff_2), n_quest)
    i_1_3 = np.random.randint(0, len(part_1_diff_3), n_quest)
    i_2_1 = np.random.randint(0, len(part_2_diff_1), n_quest)
    i_2_2 = np.random.randint(0, len(part_2_diff_2), n_quest)
    i_2_3 = np.random.randint(0, len(part_2_diff_3), n_quest)
    i_3_1 = np.random.randint(0, len(part_3_diff_1), n_quest)
    i_3_2 = np.random.randint(0, len(part_3_diff_2), n_quest)
    i_3_3 = np.random.randint(0, len(part_3_diff_3), n_quest)
    i_4_1 = np.random.randint(0, len(part_4_diff_1), n_quest)
    i_4_2 = np.random.randint(0, len(part_4_diff_2), n_quest)
    i_4_3 = np.random.randint(0, len(part_4_diff_3), n_quest)
    i_5_1 = np.random.randint(0, len(part_5_diff_1), n_quest)
    i_5_2 = np.random.randint(0, len(part_5_diff_2), n_quest)
    i_5_3 = np.random.randint(0, len(part_5_diff_3), n_quest)
    i_6_1 = np.random.randint(0, len(part_6_diff_1), n_quest)
    i_6_2 = np.random.randint(0, len(part_6_diff_2), n_quest)
    i_6_3 = np.random.randint(0, len(part_6_diff_3), n_quest)
    i_7_1 = np.random.randint(0, len(part_7_diff_1), n_quest)
    i_7_2 = np.random.randint(0, len(part_7_diff_2), n_quest)
    i_7_3 = np.random.randint(0, len(part_7_diff_3), n_quest)
    
    sequence_part_diff_1 = []
    for i in i_1_1:
        sequence_part_diff_1.append(part_1_diff_1.loc[i, 'content_id'])
    for i in i_2_1:
        sequence_part_diff_1.append(part_2_diff_1.loc[i, 'content_id'])    
    for i in i_3_1:
        sequence_part_diff_1.append(part_3_diff_1.loc[i, 'content_id'])
    for i in i_4_1:
        sequence_part_diff_1.append(part_4_diff_1.loc[i, 'content_id'])
    for i in i_5_1:
        sequence_part_diff_1.append(part_5_diff_1.loc[i, 'content_id'])
    for i in i_6_1:
        sequence_part_diff_1.append(part_6_diff_1.loc[i, 'content_id'])
    for i in i_7_1:
        sequence_part_diff_1.append(part_7_diff_1.loc[i, 'content_id'])
       
    sequence_part_diff_2 = []
    for i in i_1_2:
        sequence_part_diff_2.append(part_1_diff_2.loc[i, 'content_id'])
    for i in i_2_2:
        sequence_part_diff_2.append(part_2_diff_2.loc[i, 'content_id'])
    for i in i_3_2:
        sequence_part_diff_2.append(part_3_diff_2.loc[i, 'content_id'])
    for i in i_4_2:
        sequence_part_diff_2.append(part_4_diff_2.loc[i, 'content_id'])
    for i in i_5_2:
        sequence_part_diff_2.append(part_5_diff_2.loc[i, 'content_id'])
    for i in i_6_2:
        sequence_part_diff_2.append(part_6_diff_2.loc[i, 'content_id'])
    for i in i_7_2:
        sequence_part_diff_2.append(part_7_diff_2.loc[i, 'content_id'])
        
    sequence_part_diff_3 = []
    for i in i_1_3:
        sequence_part_diff_3.append(part_1_diff_3.loc[i, 'content_id'])
    for i in i_2_3:
        sequence_part_diff_3.append(part_2_diff_3.loc[i, 'content_id'])
    for i in i_3_3:
        sequence_part_diff_3.append(part_3_diff_3.loc[i, 'content_id'])
    for i in i_4_3:
        sequence_part_diff_3.append(part_4_diff_3.loc[i, 'content_id'])    
    for i in i_5_3:
        sequence_part_diff_3.append(part_5_diff_3.loc[i, 'content_id'])
    for i in i_6_3:
        sequence_part_diff_3.append(part_6_diff_3.loc[i, 'content_id'])
    for i in i_7_3:
        sequence_part_diff_3.append(part_7_diff_3.loc[i, 'content_id'])

    
    return [sequence_part_diff_1,sequence_part_diff_2,sequence_part_diff_3]

In [226]:
def get_training_path(df, number_of_sequences, n_quest):
    X = []

    for i in range(number_of_sequences):
        xi = sample_sequence(df, n_quest)
        X.append(xi)
        
    X = np.array(X)
        
    return X

In [227]:
evolutive_path = get_training_path(textbook, 8, 30)

In [228]:
evolutive_path.shape

(8, 3, 210)

In [229]:
sequence_sorted = pd.DataFrame(evolutive_path.reshape(24,210).reshape(5040,))

In [230]:
sequence_sorted.columns = ['content_id']

In [231]:
sequence_sorted_merged = sequence_sorted.merge(questions, on='content_id', how='left')

In [232]:
sequence_sorted_merged

Unnamed: 0,content_id,part,tag_lecture,qstats_count,qstats_answered_correctly,difficulty
0,12852,1,93.0,158,0.9177,1
1,10578,1,5.0,5701,0.9533,1
2,12891,1,187.0,154,0.9870,1
3,73,1,93.0,7388,0.8506,1
4,10594,1,68.0,6438,0.9163,1
5,7959,1,111.0,3081,0.8734,1
6,166,1,93.0,9609,0.8731,1
7,10472,1,9.0,5854,0.9318,1
8,12891,1,187.0,154,0.9870,1
9,127,1,13.0,8478,0.9079,1


In [238]:
sequence_random_merged.to_csv('sequence_random.csv', index=False)

In [239]:
sequence_sorted_merged.to_csv('sequence_sorted.csv', index=False)