In [51]:
import pandas as pd
import numpy as np

In [52]:
# Load the question statistics table (its columns include part, content_id,
# qstats_count and qstats_answered_correctly, as shown by questions.info() below).
questions = pd.read_csv('../data/simple_qstats_v2.csv')

In [53]:
# Bin the correct-answer rate into 5 equal-width intervals; the 'interval'
# column is used below to inspect how questions are distributed.
questions['interval'] = pd.cut(questions['qstats_answered_correctly'], 5)

In [54]:
def difficulty(question):
    """Map a correct-answer rate to a difficulty level.

    Lower rates get higher levels: a rate below 0.66 gives 3, a rate below
    0.83 gives 2, and anything else (including values that compare false
    against both bounds) gives 1.
    """
    # (exclusive upper bound, level) pairs, checked from the lowest bound up.
    thresholds = ((0.66, 3), (0.83, 2))
    for upper_bound, level in thresholds:
        if question < upper_bound:
            return level
    return 1

In [55]:
# Assign each question its difficulty level from its correct-answer rate.
questions['difficulty'] = questions['qstats_answered_correctly'].map(difficulty)

In [56]:
# Preview the first rows, including the new 'interval' and 'difficulty' columns.
questions.head()

Unnamed: 0,part,tag_lecture,content_id,qstats_count,qstats_answered_correctly,interval,difficulty
0,1,5.0,100,15386,0.3536,"(0.2, 0.4]",3
1,1,5.0,12882,161,0.3602,"(0.2, 0.4]",3
2,1,5.0,7859,47486,0.3812,"(0.2, 0.4]",3
3,1,5.0,157,33138,0.566,"(0.4, 0.6]",3
4,1,5.0,10426,8606,0.5681,"(0.4, 0.6]",3


In [81]:
# Number of questions in each difficulty level.
questions.groupby(['difficulty'])['content_id'].count()

difficulty
1    3557
2    5368
3    4598
Name: content_id, dtype: int64

In [58]:
# Number of questions in each equal-width correct-rate interval.
questions.groupby('interval')['content_id'].count()

interval
(-0.001, 0.2]      34
(0.2, 0.4]        695
(0.4, 0.6]       2454
(0.6, 0.8]       5772
(0.8, 1.0]       4568
Name: content_id, dtype: int64

In [59]:
# Column dtypes and non-null counts of the questions table.
questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Data columns (total 7 columns):
part                         13523 non-null int64
tag_lecture                  13522 non-null float64
content_id                   13523 non-null int64
qstats_count                 13523 non-null int64
qstats_answered_correctly    13523 non-null float64
interval                     13523 non-null category
difficulty                   13523 non-null int64
dtypes: category(1), float64(2), int64(4)
memory usage: 647.3 KB


In [60]:
import pickle
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
# Load the pickled prediction pipeline (used further down via predict_proba).
# The context manager closes the file handle as soon as loading is done; the
# previous bare open() call never closed it explicitly.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files that come from a trusted source.
with open('../models/xgboost_pipe_M1.pkl', "rb") as model_file:
    my_pipeline = pickle.load(model_file)

In [61]:
# Table listing, in its 'feature' column, the feature names passed to the pipeline.
pipeline_features_list=pd.read_csv('../models/xgboost_pipe_M1_features_list')

In [62]:
# Display the feature names; the outer brackets wrap them in a one-element list.
[pipeline_features_list.feature.to_list()]

[['user_personalized_qstat_knowing_had_explanation_or_not',
  'user_avg_score_cum_on_this_part',
  'user_correct_answers_cum_on_this_part',
  'qstats_answered_correctly',
  'user_avg_score_cum',
  'qstats_prior_question_had_explanation',
  'already_seen',
  'user_activity_cumcount',
  'at_least_one_lesson']]

In [63]:
# Question statistics that user_history_update looks up by content_id.
qstats=pd.read_csv('../data/qstats_for_M1')

In [64]:
# Seed row of a user's history: a single placeholder line that holds the
# starting value of every column maintained by user_history_update.
user_history_empty = pd.DataFrame({
    # --- inputs of each step (to be filled in) ---
    'content_id': [-1],
    'content_type_id': [-1],
    'prior_question_had_explanation': False,
    # --- depend on the user's previous history ---
    # updated whatever the content type
    'user_activity_cumcount': [-1],
    'at_least_one_lesson': [0],
    # updated if the last content was a question, otherwise copied
    'user_avg_score_cum': [0.499],
    'user_correct_answers_cum': [0],
    **{f'user_avg_score_cum_part{part_no}': [0.499] for part_no in range(1, 8)},
    **{f'user_correct_answers_cum_part{part_no}': [0] for part_no in range(1, 8)},
    # --- pure question statistics (to be imported) ---
    **{stat_name: [-1] for stat_name in (
        'part',
        'qstats_answered_correctly',
        'qstats_prior_question_had_explanation',
        'qstats_answered_correctly_knowing_having_had_explanation',
        'qstats_answered_correctly_knowing_having_not_had_explanation',
    )},
    # --- depend on the current question AND the user's history (to be computed) ---
    **{computed_name: [-1] for computed_name in (
        'user_personalized_qstat_knowing_had_explanation_or_not',
        'already_seen',
        'user_avg_score_cum_on_this_part',
        'user_correct_answers_cum_on_this_part',
    )},
    # --- the value to be predicted ---
    'answered_correctly': [-1],
})

In [65]:
def _questions_answered_count(correct_cum, avg_cum, history_count):
    '''Number of questions answered so far, including the last one.

    The history stores only the cumulated correct answers and the running
    average, so the count is normally recovered as ``correct / average + 1``.
    That is impossible once the average is 0 (it would divide by zero); in
    that case the count of question rows found in the history is used.
    '''
    if avg_cum != 0:
        return correct_cum / avg_cum + 1
    return history_count


def user_history_update(content_type_id,
                        content_id,
                        data_qstats,
                        user_history=None,
                        prior_question_had_explanation=False):
    '''Create or update a user's interaction history, stored in a DataFrame.

    One row is appended for the new interaction and the extended history is
    returned; the frame passed in is not modified.

    Parameters
    ----------
    content_type_id : int
        0 for a question, 1 for a lecture. For any other value only the
        generic columns of the new row are filled in.
    content_id : int
        Identifier of the content. For a question it is looked up in
        ``data_qstats``.
    data_qstats : pandas.DataFrame
        Question statistics: a ``content_id`` column plus the ``part`` and
        ``qstats_*`` columns that are copied into the new row. ``None`` falls
        back to the module-level ``qstats`` table.
    user_history : pandas.DataFrame, optional
        History returned by a previous call. Anything that is not a DataFrame
        is replaced by the module-level ``user_history_empty`` seed row.
    prior_question_had_explanation : bool, default False
        Selects which of the two "knowing (not) having had explanation"
        statistics becomes the personalized question statistic.

    Returns
    -------
    pandas.DataFrame
        The history with one more row. ``answered_correctly`` of the new row
        is -1; the caller is expected to overwrite it once the answer is known.

    Raises
    ------
    IndexError
        If a question's ``content_id`` has no row in ``data_qstats``.
    '''
    if not isinstance(user_history, pd.DataFrame):
        user_history = user_history_empty
    if data_qstats is None:
        # Backward compatibility: the lookup table used to be read from the
        # global regardless of this argument.
        data_qstats = qstats

    last_line = user_history.iloc[-1]
    new_line = last_line.copy()

    ### TO BE IMPUTED ###
    new_line['content_id'] = content_id
    new_line['content_type_id'] = content_type_id
    new_line['prior_question_had_explanation'] = prior_question_had_explanation
    ### TO BE UPDATED WHATEVER THE CONTENT_TYPE ###
    new_line['user_activity_cumcount'] = last_line['user_activity_cumcount'] + 1
    new_line['at_least_one_lesson'] = last_line['at_least_one_lesson']

    if last_line['content_type_id'] == 0:
        ### TO BE UPDATED IF LAST WAS QUESTION , ELSE COPIED ###
        # Fold the answer given to the previous question into the running totals.
        last_part = int(last_line['part'])
        is_question = user_history['content_type_id'] == 0
        last_answer = last_line['answered_correctly']

        new_line['user_correct_answers_cum'] = last_line['user_correct_answers_cum'] + last_answer
        questions_count = _questions_answered_count(last_line['user_correct_answers_cum'],
                                                    last_line['user_avg_score_cum'],
                                                    is_question.sum())
        new_line['user_avg_score_cum'] = new_line['user_correct_answers_cum'] / questions_count

        part_correct_col = f'user_correct_answers_cum_part{last_part}'
        part_avg_col = f'user_avg_score_cum_part{last_part}'
        on_this_part = is_question & (user_history['part'] == last_part)
        new_line[part_correct_col] = last_line[part_correct_col] + last_answer
        part_questions_count = _questions_answered_count(last_line[part_correct_col],
                                                         last_line[part_avg_col],
                                                         on_this_part.sum())
        new_line[part_avg_col] = new_line[part_correct_col] / part_questions_count

    if content_type_id == 0:
        # Last matching row of the statistics table; raises IndexError if none.
        current_question_stats = data_qstats.loc[data_qstats.content_id == content_id].iloc[-1]
        ### TO BE IMPORTED ###
        for stat_name in ('part',
                          'qstats_answered_correctly',
                          'qstats_prior_question_had_explanation',
                          'qstats_answered_correctly_knowing_having_had_explanation',
                          'qstats_answered_correctly_knowing_having_not_had_explanation'):
            new_line[stat_name] = current_question_stats[stat_name]
        ### TO BE COMPUTED ###
        if prior_question_had_explanation:
            personalized = new_line['qstats_answered_correctly_knowing_having_had_explanation']
        else:
            personalized = new_line['qstats_answered_correctly_knowing_having_not_had_explanation']
        new_line['user_personalized_qstat_knowing_had_explanation_or_not'] = personalized

        # Compare against the VALUES: `x in series` would test the index labels.
        asked_ids = user_history.loc[user_history.content_type_id == 0, 'content_id'].values
        new_line['already_seen'] = 1 if content_id in asked_ids else 0

        # int() keeps the column name valid when 'part' comes back as a float.
        current_part = int(new_line['part'])
        new_line['user_avg_score_cum_on_this_part'] = new_line[f'user_avg_score_cum_part{current_part}']
        new_line['user_correct_answers_cum_on_this_part'] = new_line[f'user_correct_answers_cum_part{current_part}']

    elif content_type_id == 1:
        ### TO BE UPDATED WHATEVER THE CONTENT_TYPE ###
        new_line['at_least_one_lesson'] = 1
        ### TO BE IMPORTED ###
        # TODO: to use the part of the lecture, the lectures table must be imported
        new_line['part'] = -1
        new_line['qstats_answered_correctly'] = -1
        new_line['qstats_prior_question_had_explanation'] = -1
        new_line['qstats_answered_correctly_knowing_having_had_explanation'] = -1
        new_line['qstats_answered_correctly_knowing_having_not_had_explanation'] = -1
        ### TO BE COMPUTED ###
        new_line['user_personalized_qstat_knowing_had_explanation_or_not'] = -1
        new_line['already_seen'] = -1
        new_line['user_avg_score_cum_on_this_part'] = -1
        new_line['user_correct_answers_cum_on_this_part'] = -1

    ### TO BE PREDICTED ###
    new_line['answered_correctly'] = -1

    # DataFrame.append was removed in pandas 2.0; this is the equivalent concat
    # (row Series -> one-row frame with inferred dtypes).
    new_row = new_line.to_frame().T.infer_objects()
    user_history = pd.concat([user_history, new_row], ignore_index=True)

    return user_history

In [72]:
def test_questions(answers_start):
    """First round of test questions to assess the level of the student.

    One question is drawn at random for every (difficulty, part) pair, in the
    order difficulty 1 parts 1-7, then difficulty 2, then difficulty 3, which
    gives 21 questions. Each one is appended to a fresh user history and the
    matching entry of ``answers_start`` is recorded as its answer.

    Parameters
    ----------
    answers_start : sequence
        Answers to the drawn questions, indexed by position; entry ``i`` is
        stored as ``answered_correctly`` of the i-th question.

    Returns
    -------
    pandas.DataFrame
        The user history: the seed row followed by one row per question.

    Raises
    ------
    ValueError
        If a (difficulty, part) pair has no question to draw from.
    """
    questions_test = []
    for level in (1, 2, 3):
        for part in range(1, 8):
            in_group = (questions['part'] == part) & (questions['difficulty'] == level)
            pool = questions.loc[in_group, 'content_id'].values
            if len(pool) == 0:
                raise ValueError(f'no question available for part {part}, difficulty {level}')
            # The group size comes from the data instead of a hard-coded count,
            # so every question of the group stays reachable if the table changes.
            questions_test.append(pool[np.random.randint(0, len(pool))])

    user_history = None

    for answer_rank, question_id in enumerate(questions_test):
        user_history = user_history_update(0,
                                           question_id,
                                           qstats,
                                           user_history,
                                           prior_question_had_explanation=False)
        # Record the answer on the row that was just appended.
        user_history.loc[user_history.index[-1], 'answered_correctly'] = answers_start[answer_rank]

    return user_history

In [84]:
def knowledge_tracing_path(answers_start, n_quest):
    """Run the placement quiz, then append one next question to the history.

    The student's profile is derived from ``toeic(history)`` (below 0.5:
    beginner, below 0.75: intermediate, otherwise expert). The difficulty of
    the next question then depends on the profile and on the pipeline's
    predicted probability of a correct answer:

    * beginner: difficulty 1 below 0.50, difficulty 2 below 0.75, else 3;
    * intermediate: difficulty 2 below 0.75, else 3;
    * expert: always difficulty 3 (no prediction is made).

    ``toeic``, ``beginner``, ``intermediate`` and ``expert`` are not defined
    in this function and must exist at module level.

    Parameters
    ----------
    answers_start : sequence
        Answers to the placement questions, forwarded to ``test_questions``.
    n_quest : int
        Currently unused.
        NOTE(review): presumably the number of questions to chain - confirm.

    Returns
    -------
    pandas.DataFrame
        The user history extended with the chosen question. The function
        previously ended in ``pass`` and discarded this result.
    """
    user_history = test_questions(answers_start)
    evaluation_start = toeic(user_history)
    if evaluation_start < 0.5:
        profile = beginner
    elif evaluation_start < 0.75:
        profile = intermediate
    else:
        profile = expert

    def predicted_success():
        # NOTE(review): iloc[-2:-1] selects the second-to-last row of the
        # history, not the last one - kept as is, confirm this is intended.
        features = pipeline_features_list.feature.to_list()
        return my_pipeline.predict_proba(user_history[features].iloc[-2:-1])[0, 1]

    # Exactly one branch runs. The second test used to be a plain `if`, so a
    # beginner also fell into the final `else` and got an extra difficulty-3
    # question.
    if profile == beginner:
        success_probability = predicted_success()
        if success_probability < 0.50:
            next_difficulty = 1
        elif success_probability < 0.75:
            next_difficulty = 2
        else:
            next_difficulty = 3
    elif profile == intermediate:
        next_difficulty = 2 if predicted_success() < 0.75 else 3
    else:
        next_difficulty = 3

    # Draw a scalar id (not a one-element array) from the whole difficulty
    # group; the group size is taken from the data.
    pool = questions.loc[questions['difficulty'] == next_difficulty, 'content_id'].values
    next_question_id = pool[np.random.randint(0, len(pool))]

    user_history = user_history_update(0,
                                       next_question_id,
                                       qstats,
                                       user_history,
                                       prior_question_had_explanation=False)
    return user_history

In [77]:
# Smoke test: run the placement quiz with 21 random 0/1 answers.
test_questions(np.random.randint(0,2,21))



Unnamed: 0,content_id,content_type_id,prior_question_had_explanation,user_activity_cumcount,at_least_one_lesson,user_avg_score_cum,user_correct_answers_cum,user_avg_score_cum_part1,user_avg_score_cum_part2,user_avg_score_cum_part3,...,part,qstats_answered_correctly,qstats_prior_question_had_explanation,qstats_answered_correctly_knowing_having_had_explanation,qstats_answered_correctly_knowing_having_not_had_explanation,user_personalized_qstat_knowing_had_explanation_or_not,already_seen,user_avg_score_cum_on_this_part,user_correct_answers_cum_on_this_part,answered_correctly
0,-1,-1,False,-1,0,0.499,0,0.499,0.499,0.499,...,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1,-1
1,10539,0,False,0,0,0.499,0,0.499,0.499,0.499,...,1,0.918702,0.98951,0.918668,0.921875,0.921875,0,0.499,0,1
2,1166,0,False,1,0,1.0,1,1.0,0.499,0.499,...,2,0.83489,0.980458,0.83578,0.790244,0.790244,0,0.499,0,1
3,2406,0,False,2,0,1.0,2,1.0,1.0,0.499,...,3,0.885714,0.977778,0.883117,1.0,1.0,0,0.499,0,0
4,3459,0,False,3,0,0.666667,2,1.0,1.0,0.0,...,4,0.842636,0.96779,0.841209,0.885496,0.885496,0,0.499,0,1
5,8926,0,False,4,0,0.75,3,1.0,1.0,0.0,...,5,0.839427,0.992731,0.839805,0.787879,0.787879,0,0.499,0,1
6,6485,0,False,5,0,0.8,4,1.0,1.0,0.0,...,6,0.926997,0.955997,0.928571,0.892794,0.892794,0,0.499,0,0
7,7541,0,False,6,0,0.666667,4,1.0,1.0,0.0,...,7,0.960212,0.965075,0.958772,1.0,1.0,0,0.499,0,1
8,12749,0,False,7,0,0.714286,5,1.0,1.0,0.0,...,1,0.807229,0.981928,0.809816,0.666667,0.666667,0,1.0,1,1
9,870,0,False,8,0,0.75,6,1.0,1.0,0.0,...,2,0.806625,0.984046,0.807488,0.753363,0.753363,0,1.0,1,0


user.user_activity_cumcount = 0
    user.user_avg_score_cum = 0
    while user.user_activity_cumcount < 1000:
        quest