In [1]:
import pandas as pd
import numpy as np
import random

# Import Questions Stats

In [2]:
questions = pd.read_csv('../data/qstats_for_M1')

In [3]:
questions['interval'] = pd.cut(questions['qstats_answered_correctly'], 5)

In [4]:
def difficulty(question):
    if question < 0.66:
        difficulty = 3
    elif question < 0.83:
        difficulty = 2
    else:
        difficulty = 1
    return difficulty

In [5]:
questions['difficulty'] = questions['qstats_answered_correctly'].apply(difficulty)

In [6]:
questions.groupby(['difficulty','part'])['content_id'].count()

difficulty  part
1           1        559
            2        449
            3        553
            4        512
            5        807
            6        337
            7        340
2           1        295
            2        796
            3        590
            4        465
            5       2304
            6        461
            7        454
3           1        138
            2        402
            3        419
            4        462
            5       2400
            6        414
            7        366
Name: content_id, dtype: int64

In [7]:
questions.groupby('interval')['content_id'].count()

interval
(-0.001, 0.2]      34
(0.2, 0.4]        695
(0.4, 0.6]       2454
(0.6, 0.8]       5771
(0.8, 1.0]       4569
Name: content_id, dtype: int64

In [8]:
questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Data columns (total 13 columns):
Unnamed: 0                                                      13523 non-null int64
content_id                                                      13523 non-null int64
bundle_id                                                       13523 non-null int64
correct_answer                                                  13523 non-null int64
part                                                            13523 non-null int64
tags                                                            13522 non-null object
qstats_count                                                    13523 non-null int64
qstats_answered_correctly                                       13523 non-null float64
qstats_prior_question_had_explanation                           13523 non-null float64
qstats_answered_correctly_knowing_having_had_explanation        13523 non-null float64
qstats_answered_correctly_knowing_

# Import Model

In [9]:
import pickle
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
my_pipeline = pickle.load(open('../models/xgboost_pipe_M1.pkl',"rb"))

In [10]:
pipeline_features_list=pd.read_csv('../models/xgboost_pipe_M1_features_list')

In [11]:
[pipeline_features_list.feature.to_list()]

[['user_personalized_qstat_knowing_had_explanation_or_not',
  'user_avg_score_cum_on_this_part',
  'user_correct_answers_cum_on_this_part',
  'qstats_answered_correctly',
  'user_avg_score_cum',
  'qstats_prior_question_had_explanation',
  'already_seen',
  'user_activity_cumcount',
  'at_least_one_lesson']]

In [12]:
qstats=pd.read_csv('../data/qstats_for_M1')

# Initializing new student

In [13]:
user_history_empty=pd.DataFrame({#following columns are the impute of each loop
                             ### TO BE IMPUTED ###
                             'content_id':[-1],
                             'content_type_id':[-1],
                             'prior_question_had_explanation':False,
                             # following columns depend of previous history of the user : 
                             ### TO BE UPDATED WHATEVER THE CONTENT_TYPE ###
                             'user_activity_cumcount':[-1],
                             'at_least_one_lesson':[0],
                             ### TO BE UPDATED IF LAST WAS QUESTION , ELSE COPIED ###
                             'user_avg_score_cum':[0.499],
                             'user_correct_answers_cum':[0],
                             'user_avg_score_cum_part1':[0.499],
                             'user_avg_score_cum_part2':[0.499],
                             'user_avg_score_cum_part3':[0.499],
                             'user_avg_score_cum_part4':[0.499],
                             'user_avg_score_cum_part5':[0.499],
                             'user_avg_score_cum_part6':[0.499],
                             'user_avg_score_cum_part7':[0.499],
                             'user_correct_answers_cum_part1':[0],
                             'user_correct_answers_cum_part2':[0],
                             'user_correct_answers_cum_part3':[0],
                             'user_correct_answers_cum_part4':[0],
                             'user_correct_answers_cum_part5':[0],
                             'user_correct_answers_cum_part6':[0],
                             'user_correct_answers_cum_part7':[0],
                             # following columns are pure question stats : 
                             ### TO BE IMPORTED ###
                             'part':[-1],
                             'qstats_answered_correctly':[-1],
                             'qstats_prior_question_had_explanation':[-1],
                             'qstats_answered_correctly_knowing_having_had_explanation':[-1],
                             'qstats_answered_correctly_knowing_having_not_had_explanation':[-1],
                             # following columns depend of the current question AND the hisory of user
                             ### TO BE COMPUTED ###
                             'user_personalized_qstat_knowing_had_explanation_or_not':[-1],
                             'already_seen':[-1],
                             'user_avg_score_cum_on_this_part':[-1],
                             'user_correct_answers_cum_on_this_part':[-1],
                             # the following line is the prediction to be made
                             ### TO BE PREDICTED ###
                             'answered_correctly':[-1]
                          })

# Import Functions

In [14]:
def user_history_update(content_type_id,
                        content_id,
                        data_qstats,
                        user_history=None,
                        mode='training',    #autre choix : mode exam
                        prior_question_had_explanation=False):
    '''Crée ou met à jour l'hisorique d'un utilisateur, stockée dans un df'''

    if not type(user_history)==pd.DataFrame:
        user_history=pd.DataFrame({
                             #following columns are the impute of each loop
                             ### TO BE IMPUTED ###
                             'content_id':[-1],
                             'content_type_id':[-1],
                             'prior_question_had_explanation':False,
                             'mode':'n/a',
                             # following columns depend of previous history of the user : 
                             ### TO BE UPDATED WHATEVER THE CONTENT_TYPE ###
                             'user_activity_cumcount':[-1],
                             ### TO BE UPDATED IF LAST WAS LECTURE ###
                             'at_least_one_lesson':[0],
                             ### TO BE UPDATED IF LAST WAS QUESTION ###
                             'user_avg_score_cum':[0.499],
                             'user_correct_answers_cum':[0],
                             'user_avg_score_cum_part1':[0.499],
                             'user_avg_score_cum_part2':[0.499],
                             'user_avg_score_cum_part3':[0.499],
                             'user_avg_score_cum_part4':[0.499],
                             'user_avg_score_cum_part5':[0.499],
                             'user_avg_score_cum_part6':[0.499],
                             'user_avg_score_cum_part7':[0.499],
                             'user_correct_answers_cum_part1':[0],
                             'user_correct_answers_cum_part2':[0],
                             'user_correct_answers_cum_part3':[0],
                             'user_correct_answers_cum_part4':[0],
                             'user_correct_answers_cum_part5':[0],
                             'user_correct_answers_cum_part6':[0],
                             'user_correct_answers_cum_part7':[0],
                             # following columns are pure question stats : 
                             ### TO BE IMPORTED FROM QUESTIONS ###
                             'part':[-1],
                             'qstats_answered_correctly':[-1],
                             'qstats_prior_question_had_explanation':[-1],
                             'qstats_answered_correctly_knowing_having_had_explanation':[-1],
                             'qstats_answered_correctly_knowing_having_not_had_explanation':[-1],
                             # following columns depend of the current question AND the hisory of user
                             ### TO BE COMPUTED ###
                             'user_personalized_qstat_knowing_had_explanation_or_not':[-1],
                             'already_seen':[-1],
                             'user_avg_score_cum_on_this_part':[-1],
                             'user_correct_answers_cum_on_this_part':[-1],
                             # the following line is the prediction to be made
                             ### TO BE PREDICTED ###
                             'answered_correctly':[-1]
                          })
    

    last_line=user_history.iloc[-1]
    new_line =last_line.copy()
    
    last_content_type_id=user_history.iloc[-1]['content_type_id']
    
    ### TO BE IMPUTED ###
    new_line['content_id']=content_id
    new_line['content_type_id']=content_type_id
    new_line['prior_question_had_explanation']=prior_question_had_explanation
    new_line['mode']=mode
    ### TO BE UPDATED WHATEVER THE CONTENT_TYPE ###
    new_line['user_activity_cumcount'] += 1
    
    if mode=='training':
        if last_content_type_id==0:
            part=last_line['part']
            ### TO BE UPDATED IF LAST WAS QUESTION ###
            new_line['user_correct_answers_cum'] = last_line['user_correct_answers_cum']\
                                                 + last_line['answered_correctly']
            new_user_questions_count             = last_line['user_correct_answers_cum']\
                                                 / last_line['user_avg_score_cum']\
                                                 + 1
            new_line['user_avg_score_cum']       = new_line['user_correct_answers_cum']\
                                                 / new_user_questions_count

            new_line[f'user_correct_answers_cum_part{part}'] = last_line[f'user_correct_answers_cum_part{part}']\
                                                             + last_line['answered_correctly']
            vars()[f'new_user_questions_count_part{part}']   = last_line[f'user_correct_answers_cum_part{part}']\
                                                             / last_line[f'user_avg_score_cum_part{part}']\
                                                             + 1
            new_line[f'user_avg_score_cum_part{part}']       = new_line[f'user_correct_answers_cum_part{part}']\
                                                             / vars()[f'new_user_questions_count_part{part}']

        elif last_content_type_id==1:
            ### TO BE UPDATED IF LAST WAS LECTURE ###
            new_line['at_least_one_lesson']=1
        
    if content_type_id==0:        
        currect_question_stats=qstats.loc[qstats.content_id==content_id].iloc[-1]
        ### TO BE IMPORTED FROM QUESTIONS ###
        new_line['part']\
              = currect_question_stats['part']
        new_line['qstats_answered_correctly']\
              = currect_question_stats['qstats_answered_correctly']
        new_line['qstats_prior_question_had_explanation']\
              = currect_question_stats['qstats_prior_question_had_explanation']
        new_line['qstats_answered_correctly_knowing_having_had_explanation']\
              = currect_question_stats['qstats_answered_correctly_knowing_having_had_explanation']
        new_line['qstats_answered_correctly_knowing_having_not_had_explanation']\
              = currect_question_stats['qstats_answered_correctly_knowing_having_not_had_explanation']
        ### TO BE COMPUTED ###
        new_line['user_personalized_qstat_knowing_had_explanation_or_not']\
              = new_line['qstats_answered_correctly_knowing_having_had_explanation']\
             if prior_question_had_explanation\
           else new_line['qstats_answered_correctly_knowing_having_not_had_explanation']
        new_line['already_seen']\
              = 1 if content_id in user_history.loc[user_history.content_type_id==0,'content_id']\
           else 0
        new_line['user_avg_score_cum_on_this_part']=new_line[f'user_avg_score_cum_part{new_line["part"]}']
        new_line['user_correct_answers_cum_on_this_part']=new_line[f'user_correct_answers_cum_part{new_line["part"]}']
                                              
    elif content_type_id==1:
        
        ### TO BE IMPORTED ###
        new_line['part']= -1 # TODO : si on veut utiliser la partie de la lecture, il faut importer la base des lectures
        new_line['qstats_answered_correctly']= -1
        new_line['qstats_prior_question_had_explanation']= -1
        new_line['qstats_answered_correctly_knowing_having_had_explanation']= -1
        new_line['qstats_answered_correctly_knowing_having_not_had_explanation']= -1
        ### TO BE COMPUTED ###
        new_line['user_personalized_qstat_knowing_had_explanation_or_not']= -1
        new_line['already_seen']= -1
        new_line['user_avg_score_cum_on_this_part']= -1
        new_line['user_correct_answers_cum_on_this_part']= -1
        
    ### TO BE PREDICTED ###
    new_line['answered_correctly']= -1
                                
    user_history=user_history.append(new_line,ignore_index=True)

    return user_history

In [37]:
def TOEIC_scoring (user_history, number_of_questions=100, TOEIC_strategy='random'):
    for i in range(number_of_questions):
        ### CHOIX DE LA QUESTION ###
        if TOEIC_strategy=='random':
            next_question_id=random.choice(qstats.content_id.to_list())


        user_history=user_history_update(0,
                                         next_question_id,
                                         qstats,
                                         user_history,
                                         mode='exam',
                                         prior_question_had_explanation=False)

        ### PREDICTION ###
        user_history.iloc[-1,-1]\
            = my_pipeline.predict_proba(user_history[pipeline_features_list.feature.to_list()].iloc[-2:-1])[0,1]
    
    return user_history.iloc[-number_of_questions:].answered_correctly.mean()

# Defining questions selection strategy

In [16]:
def test_questions(answers_start):
    
    """ First round of test questions to assess the level of the student
        21 questions : 3 randomly selected questions per part for each level of difficulty"""
    
    questions_test = []
    questions_test.append(questions[(questions['part']==1) & (questions['difficulty']==1)].reset_index()\
                                            .loc[np.random.randint(0,559,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==2) & (questions['difficulty']==1)].reset_index()\
                                            .loc[np.random.randint(0,449,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==3) & (questions['difficulty']==1)].reset_index()\
                                            .loc[np.random.randint(0,553,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==4) & (questions['difficulty']==1)].reset_index()\
                                            .loc[np.random.randint(0,512,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==5) & (questions['difficulty']==1)].reset_index()\
                                            .loc[np.random.randint(0,807,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==6) & (questions['difficulty']==1)].reset_index()\
                                            .loc[np.random.randint(0,337,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==7) & (questions['difficulty']==1)].reset_index()\
                                            .loc[np.random.randint(0,340,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==1) & (questions['difficulty']==2)].reset_index()\
                                            .loc[np.random.randint(0,295,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==2) & (questions['difficulty']==2)].reset_index()\
                                            .loc[np.random.randint(0,796,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==3) & (questions['difficulty']==2)].reset_index()\
                                            .loc[np.random.randint(0,590,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==4) & (questions['difficulty']==2)].reset_index()\
                                            .loc[np.random.randint(0,465,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==5) & (questions['difficulty']==2)].reset_index()\
                                            .loc[np.random.randint(0,2304,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==6) & (questions['difficulty']==2)].reset_index()\
                                            .loc[np.random.randint(0,461,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==7) & (questions['difficulty']==2)].reset_index()\
                                            .loc[np.random.randint(0,454,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==1) & (questions['difficulty']==3)].reset_index()\
                                            .loc[np.random.randint(0,138,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==2) & (questions['difficulty']==3)].reset_index()\
                                            .loc[np.random.randint(0,402,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==3) & (questions['difficulty']==3)].reset_index()\
                                            .loc[np.random.randint(0,419,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==4) & (questions['difficulty']==3)].reset_index()\
                                            .loc[np.random.randint(0,462,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==5) & (questions['difficulty']==3)].reset_index()\
                                            .loc[np.random.randint(0,2400,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==6) & (questions['difficulty']==3)].reset_index()\
                                            .loc[np.random.randint(0,414,1), 'content_id'].values)
    questions_test.append(questions[(questions['part']==7) & (questions['difficulty']==3)].reset_index()\
                                            .loc[np.random.randint(0,366,1), 'content_id'].values)
    
    questions_test = [item for sublist in questions_test for item in sublist]
        
    user_history=None
    
    for i in range(21):
        question_id=questions_test[i]
        user_history=user_history_update(0,
                                 question_id,
                                 qstats,
                                 user_history,
                                 mode='training',
                                 prior_question_had_explanation=False)
        user_history.loc[i+1, 'answered_correctly'] = answers_start[i]
    
    return user_history

In [51]:
answers_start = np.random.randint(0,2,21) ### random answers for new student

In [55]:
def knowledge_tracing_path(answers_start, n_quest=50):
    """ Profile of the student initially determined based on his 21 answers, then reassessment and question selection
    based on the model prediction for next answer """
    
    start_profile = test_questions(answers_start)
    user_history = start_profile
    evaluation_start = start_profile.iloc[-len(answers_start):].answered_correctly.mean() ### TOEIC scoring
    
    if evaluation_start < 0.5:
        profile = 'beginner'
    elif evaluation_start < 0.75:
        profile = 'intermediate'
    else:
        profile = 'expert'
    
    
    while len(user_history) <= n_quest+len(answers_start):
 
        if profile == 'beginner':
            if my_pipeline.predict_proba(user_history[pipeline_features_list.feature.to_list()].iloc[-2:-1])[0,1] < 0.50:
                next_question_id = int(questions[questions['difficulty']==1].reset_index()\
                                                .loc[np.random.randint(0,3557,1), 'content_id'].values)
                user_history=user_history_update(0,
                                         next_question_id,
                                         qstats,
                                         user_history,
                                         prior_question_had_explanation=True)
                user_history.loc[(len(user_history)-1), 'answered_correctly'] = \
                        my_pipeline.predict_proba(user_history[pipeline_features_list.feature.to_list()].iloc[-2:-1])[0,1]

            elif my_pipeline.predict_proba(user_history[pipeline_features_list.feature.to_list()].iloc[-2:-1])[0,1] < 0.75:
                next_question_id = int(questions[questions['difficulty']==2].reset_index()\
                                                .loc[np.random.randint(0,5368,1), 'content_id'].values)
                user_history=user_history_update(0,
                                         next_question_id,
                                         qstats,
                                         user_history,
                                         prior_question_had_explanation=True)
                user_history.loc[(len(user_history)-1), 'answered_correctly'] = \
                        my_pipeline.predict_proba(user_history[pipeline_features_list.feature.to_list()].iloc[-2:-1])[0,1]

            else:
                next_question_id = int(questions[questions['difficulty']==3].reset_index()\
                                                .loc[np.random.randint(0,4598,1), 'content_id'].values)
                user_history=user_history_update(0,
                                         next_question_id,
                                         qstats,
                                         user_history,
                                         prior_question_had_explanation=True)
                user_history.loc[(len(user_history)-1), 'answered_correctly'] = \
                        my_pipeline.predict_proba(user_history[pipeline_features_list.feature.to_list()].iloc[-2:-1])[0,1]

        elif profile == 'intermediate':
            if my_pipeline.predict_proba(user_history[pipeline_features_list.feature.to_list()].iloc[-2:-1])[0,1] < 0.75:
                next_question_id = int(questions[questions['difficulty']==2].reset_index()\
                                                .loc[np.random.randint(0,5368,1), 'content_id'].values)
                user_history=user_history_update(0,
                                         next_question_id,
                                         qstats,
                                         user_history,
                                         prior_question_had_explanation=True)
                user_history.loc[(len(user_history)-1), 'answered_correctly'] = \
                        my_pipeline.predict_proba(user_history[pipeline_features_list.feature.to_list()].iloc[-2:-1])[0,1]

            else:
                next_question_id = int(questions[questions['difficulty']==3].reset_index()\
                                                .loc[np.random.randint(0,4598,1), 'content_id'].values)
                user_history=user_history_update(0,
                                         next_question_id,
                                         qstats,
                                         user_history,
                                         prior_question_had_explanation=True) 
                user_history.loc[(len(user_history)-1), 'answered_correctly'] = \
                        my_pipeline.predict_proba(user_history[pipeline_features_list.feature.to_list()].iloc[-2:-1])[0,1]

        else:
                next_question_id = int(questions[questions['difficulty']==3].reset_index()\
                                                .loc[np.random.randint(0,4598,1), 'content_id'].values)
                user_history=user_history_update(0,
                                         next_question_id,
                                         qstats,
                                         user_history,
                                         prior_question_had_explanation=True)
                user_history.loc[(len(user_history)-1), 'answered_correctly'] = \
                        my_pipeline.predict_proba(user_history[pipeline_features_list.feature.to_list()].iloc[-2:-1])[0,1]

    
    ### PREDICTION ###
    user_history.iloc[-1,-1]\
        = my_pipeline.predict_proba(user_history[pipeline_features_list.feature.to_list()].iloc[-2:-1])[0,1]
    
    return user_history

In [69]:
user_history = knowledge_tracing_path(answers_start, n_quest=50)



In [70]:
user_history.tail()

Unnamed: 0,content_id,content_type_id,prior_question_had_explanation,mode,user_activity_cumcount,at_least_one_lesson,user_avg_score_cum,user_correct_answers_cum,user_avg_score_cum_part1,user_avg_score_cum_part2,...,part,qstats_answered_correctly,qstats_prior_question_had_explanation,qstats_answered_correctly_knowing_having_had_explanation,qstats_answered_correctly_knowing_having_not_had_explanation,user_personalized_qstat_knowing_had_explanation_or_not,already_seen,user_avg_score_cum_on_this_part,user_correct_answers_cum_on_this_part,answered_correctly
67,4260,0,True,training,66,0,0.613889,40.5167,0.810787,,...,5,0.749644,0.98363,0.750362,0.706522,0.750362,0,,17.469081,0.716657
68,13372,0,True,training,67,0,0.615423,41.233357,0.810787,,...,5,0.811966,0.995116,0.81227,0.75,0.81227,0,,18.185738,0.748552
69,4947,0,True,training,68,0,0.617381,41.981909,0.810787,,...,5,0.690153,0.945671,0.70018,0.515625,0.70018,0,,18.93429,0.820296
70,1306,0,True,training,69,0,0.620322,42.802205,0.810787,,...,2,0.591813,0.949708,0.591133,0.604651,0.591133,0,,6.611615,0.686282
71,479,0,True,training,70,0,0.621264,43.488487,0.810787,,...,2,0.72099,0.980987,0.720658,0.738095,0.720658,0,,7.297897,0.567418


# TOEIC scoring

In [40]:
TOEIC_test_set = pd.read_csv('toeic_question.csv')

In [93]:
TOEIC_selection = [random.choice(TOEIC_test_set.content_id) for i in range(100)]

In [94]:
TOEIC_df = pd.DataFrame(TOEIC_selection)

In [95]:
TOEIC_df.columns = ['content_id']

In [96]:
TOEIC_df.merge(questions, on="content_id").groupby(['part','difficulty'])['content_id'].count()

part  difficulty
1     1              5
      2              1
      3              1
2     1              4
      2              7
      3              3
3     1              5
      2              9
      3              2
4     1              3
      2              2
      3              3
5     1              6
      2             20
      3             16
6     1              1
      2              3
      3              2
7     1              2
      2              3
      3              2
Name: content_id, dtype: int64

In [47]:
def TOEIC_scoring_2 (user_history, number_of_questions=100):
    for i in range(number_of_questions):
        ### CHOIX DE LA QUESTION ###
        next_question_id=TOEIC_selection[i]

        user_history=user_history_update(0,
                                         next_question_id,
                                         qstats,
                                         user_history,
                                         mode='exam',
                                         prior_question_had_explanation=False)

        ### PREDICTION ###
        user_history.iloc[-1,-1]\
            = my_pipeline.predict_proba(user_history[pipeline_features_list.feature.to_list()].iloc[-2:-1])[0,1]
    
    return user_history.iloc[-number_of_questions:].answered_correctly.mean()

In [99]:
TOEIC_scoring_2(user_history)

0.6804301981621