In [None]:
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
from scipy.stats import gaussian_kde
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
%matplotlib inline

## How to use

Make sure you run through all the blocks below to initialize the necessary functions.

In [2]:
def word_extraction(sentence):
    """Normalize a sentence into a lowercase, stop-word-free string.

    Every non-word character is replaced by a space, the text is lowercased,
    English stop words are dropped, and the remaining tokens are joined with
    single spaces. A trailing space follows the last token (kept so that the
    output format of non-stop-word tokens is unchanged).

    Parameters
    ----------
    sentence : str
        Raw text of an option or a student answer.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives.
    """
    ignore = set(stopwords.words('english'))
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    tokens = re.sub(r"[^\w]", " ", sentence).lower().split()
    # Lowercase BEFORE the stop-word test; testing the raw token let
    # capitalised stop words such as "The" slip through the filter.
    return ''.join(token + ' ' for token in tokens if token not in ignore)

In [3]:
def choose_type(q_id):
    """Choose the format ('MC' or 'SA') in which question ``q_id`` is re-asked.

    Reads the module-level frames ``df_ans`` (student answers) and ``df_qns``
    (questions).

    Parameters
    ----------
    q_id
        Value matched against the ``Question_id`` column of both frames.

    Returns
    -------
    str or None
        'MC' or 'SA'. ``None`` when at least two students scored 1 and the
        question's ``Question_type`` is neither 'MC' nor 'SA'.
    """
    ans_data = df_ans[df_ans['Question_id'] == q_id]
    q_row = df_qns[df_qns['Question_id'] == q_id].to_dict(orient='records')[0]
    correct_answer = q_row['Correct_answer_choice']
    num_students = len(ans_data)

    num_correct = len(ans_data[ans_data['Student_score_on_question'] == 1])
    # If only a few students answered correctly, we go for MC as we may not
    # have enough correct options for SA.
    if num_correct < 2:
        return 'MC'

    if q_row['Question_type'] == 'MC':
        num_correct = q_row['Students_select_' + correct_answer]
        # If no students answered correctly, we go for MC as we have one
        # correct option in df_qns.
        if num_correct == 0:
            return 'MC'

        clusters = np.unique(ans_data['Student_choice_on_question'].values)
        # Three or more distinct choices means the incorrect answers come from
        # at least 2 different clusters -> MC. Otherwise the incorrect answers
        # all relate to the same option -> SA.
        return 'MC' if len(clusters) >= 3 else 'SA'

    if q_row['Question_type'] == 'SA':
        picked, counts = np.unique(ans_data['Student_choice_on_question'].values,
                                   return_counts=True)
        # Strip each letter so that both 'A,C' and 'A, C' spellings compare equal.
        correct_letters = {c.strip() for c in correct_answer.split(',')}

        subset_of_correct = 0   # clusters containing only correct letters
        num_part_correct = 0    # students inside those clusters
        for choice, count in zip(picked, counts):
            letters = [c.strip() for c in choice.split(',')]
            if all(letter in correct_letters for letter in letters):
                subset_of_correct += 1
                num_part_correct += count

        # `and`, not `&`: the bitwise operator binds tighter than `==` and `<`,
        # which turned the old test into a chained comparison against
        # `1 & num_part_correct`.
        if subset_of_correct == 1 and num_part_correct < num_students / 4:
            return 'MC'
        return 'SA'

    return None

In [4]:
def sample_candidates(target, candidates, k, regime = 'dir'):
    """Randomly draw ``k`` student answers, weighted by similarity to ``target``.

    Relies on the module-level ``opt_dict`` / ``ans_dict`` (cleaned text ->
    row index), the ``cosine_similarities`` matrix and ``word_extraction``.

    Parameters
    ----------
    target : list of str
        Option texts the candidates are compared with; must be non-empty and
        every cleaned text must be a key of ``opt_dict``.
    candidates : sequence of str
        Student answers to draw from. Answers with more than 5
        space-separated words are preferred; when fewer than ``k`` of those
        exist, all candidates are used.
    k : int
        Number of answers to draw.
    regime : str, default 'dir'
        'dir' favours answers similar to the targets, 'inv' favours
        dissimilar ones.

    Returns
    -------
    list of str
        The drawn answers. Drawn without replacement unless the pool holds
        fewer than ``k`` answers.

    Raises
    ------
    ValueError
        If ``k`` is positive and there is nothing to sample from.
    """
    candidates_post = [x for x in candidates if len(x.split(' ')) > 5]
    if len(candidates_post) < k:
        candidates_post = list(candidates)

    if k > 0 and len(candidates_post) == 0:
        raise ValueError('sample_candidates: no candidate answers to sample from')

    i1s = [opt_dict[word_extraction(t)] for t in target]
    i2s = [ans_dict[word_extraction(x)] for x in candidates_post]

    # Summed similarity to all targets; the 0.01 offset keeps every candidate
    # selectable even when its similarity is zero.
    weights = np.asarray(sum(cosine_similarities[i1, i2s] for i1 in i1s), dtype=float) + 0.01
    weights = weights / weights.sum()

    # (1 - p) / (n - 1) is again a probability vector, but only for n > 1.
    # With a single candidate it is 0 / 0 = NaN, so keep p = [1.0] there.
    if regime == 'inv' and len(weights) > 1:
        weights = (1 - weights) / (len(weights) - 1)

    # Sampling with replacement is the only option for a pool smaller than k.
    with_replacement = len(candidates_post) < k

    picked = np.random.choice(len(weights), k, replace=with_replacement, p=weights)
    return [candidates_post[i] for i in picked]

In [5]:
def select_options_mcmc(q_id):
    """Select the options for an MC question that is re-asked as MC.

    Reads the module-level ``df_ans`` / ``df_qns`` frames and draws student
    answers through ``sample_candidates``.

    Parameters
    ----------
    q_id
        Value matched against the ``Question_id`` column of both frames.

    Returns
    -------
    dict
        'attempt1.correct' / 'attempt2.correct': one sampled correct student
        answer each; 'attempt1.wrong': sampled incorrect student answers;
        'attempt2.wrong': the question's own incorrect choice texts;
        'feedback': the question's correct choice text.
    """
    ans_data = df_ans[df_ans['Question_id'] == q_id].reset_index(drop=True)
    q_row = df_qns[df_qns['Question_id'] == q_id].to_dict(orient='records')[0]
    correct_answer = q_row['Correct_answer_choice']

    # We use the correct option as feedback in case a student answers the
    # question incorrectly twice. We also show this if a student answered the
    # question correctly.
    feedback = [q_row['Choice_' + correct_answer + '_text']]

    ################# Choosing correct options #######################
    # One correct option per attempt, so two in total.
    correct_pool = ans_data[ans_data['Student_score_on_question'] == 1]['Answer_text'].values
    correct_options_texts = sample_candidates(feedback, correct_pool, 2)

    ################# Choosing incorrect options #####################
    incorrect_groups = [x for x in np.unique(ans_data['Student_choice_on_question'].values)
                        if x != correct_answer]
    choices = {x: ans_data[ans_data['Student_choice_on_question'] == x]['Answer_text'].values
               for x in incorrect_groups}

    def sample_group(group, k):
        # Draw k answers of students who picked `group`, steered by that option's text.
        return list(sample_candidates([q_row['Choice_' + group + '_text']], choices[group], k))

    incorrect_options_texts2 = [q_row['Choice_' + x + '_text']
                                for x in ['A', 'B', 'C', 'D'] if x != correct_answer]

    if len(incorrect_groups) == 3:
        # One answer from each incorrect option.
        incorrect_options_texts1 = []
        for group in incorrect_groups:
            incorrect_options_texts1 += sample_group(group, 1)
    elif len(incorrect_groups) == 2:
        first, second = incorrect_groups
        # Compare the number of ANSWERS per group (not the length of the
        # option letter); the bigger group contributes two answers. On a tie
        # the second group is treated as the bigger one.
        if len(choices[first]) > len(choices[second]):
            major, minor = first, second
        else:
            major, minor = second, first
        incorrect_options_texts1 = sample_group(major, 2) + sample_group(minor, 1)
    elif incorrect_groups:
        incorrect_options_texts1 = sample_group(incorrect_groups[0], 3)
    else:
        # No student picked a wrong option: fall back to the question's own
        # incorrect choices instead of failing with an IndexError.
        incorrect_options_texts1 = list(incorrect_options_texts2)

    return {'attempt1.correct': [correct_options_texts[0]],
            'attempt2.correct': [correct_options_texts[1]],
            'attempt1.wrong': incorrect_options_texts1,
            'attempt2.wrong': incorrect_options_texts2,
            'feedback': feedback}

In [6]:
def select_options_sasa(q_id):
    """Select the options for an SA question that is re-asked as SA.

    Reads the module-level ``df_ans`` / ``df_qns`` frames and draws student
    answers through ``sample_candidates``. Student choices are grouped into
    clusters by their ``Student_choice_on_question`` value: *correct* (every
    letter is a correct one), *wrong* (no letter is correct) and *partial*.

    Parameters
    ----------
    q_id
        Value matched against the ``Question_id`` column of both frames.

    Returns
    -------
    dict
        'attempt1.correct' / 'attempt2.correct': sampled answers from the
        correct clusters (``n_corr`` per attempt); 'attempt1.wrong' /
        'attempt2.wrong': incorrect options; 'feedback': the texts of the
        question's correct choices. All values are lists.

    Raises
    ------
    ValueError
        If the answers form no correct cluster or more than three.
    """
    ans_data = df_ans[df_ans['Question_id'] == q_id].reset_index(drop=True)
    q_row = df_qns[df_qns['Question_id'] == q_id].to_dict(orient='records')[0]
    correct_answer = [c.strip() for c in q_row['Correct_answer_choice'].split(',')]
    num_students = len(ans_data)

    def letters(choice):
        # 'A, C' / 'A,C' -> ['A', 'C']
        return [c.strip() for c in choice.split(',')]

    def answers_for(choice):
        # Answer texts of the students whose choice equals `choice`.
        return ans_data[ans_data['Student_choice_on_question'] == choice]['Answer_text'].values

    def texts_for(choice):
        # Official option texts for every letter in `choice`.
        return [q_row['Choice_' + c + '_text'] for c in letters(choice)]

    # We use the correct options as feedback in case a student answers the
    # question incorrectly twice. We also show this if a student answered the
    # question correctly.
    feedback = [q_row['Choice_' + x + '_text'] for x in correct_answer]

    correct, wrong, partial = [], [], []
    for x in np.unique(ans_data['Student_choice_on_question'].values):
        hits = [y in correct_answer for y in letters(x)]
        if all(hits):
            correct.append(x)
        elif not any(hits):
            wrong.append(x)
        else:
            partial.append(x)

    c_count = sum(len(answers_for(x)) for x in correct)
    n_corr = 3 if c_count > num_students * 0.8 else 2

    ############################## Correct options ###################################

    # Biggest cluster first. The sort is stable, so equally sized clusters keep
    # their original order, and each cluster stays paired with its own target
    # texts and its own answer pool.
    ranked = sorted(correct, key=lambda c: len(answers_for(c)), reverse=True)

    if len(ranked) == 1:
        # Sample from the answers of the single correct cluster (selected by
        # CHOICE, not by score), steered by the feedback texts.
        plan = [(feedback, ranked[0], 2 * n_corr)]
    elif len(ranked) == 2:
        smaller = len(answers_for(ranked[1]))
        if smaller < n_corr:
            quotas = [2 * n_corr - smaller, smaller]
        else:
            quotas = [n_corr, n_corr]
        plan = [(texts_for(c), c, quota) for c, quota in zip(ranked, quotas)]
    elif len(ranked) == 3:
        quotas = [2 * n_corr - 3, 2, 1]
        plan = [(texts_for(c), c, quota) for c, quota in zip(ranked, quotas)]
    else:
        raise ValueError('select_options_sasa: expected 1 to 3 correct answer clusters, found '
                         + str(len(ranked)))

    correct_options_texts = []
    for target, cluster, quota in plan:
        correct_options_texts += list(sample_candidates(target, answers_for(cluster), quota))

    np.random.shuffle(correct_options_texts)

    ############################## Incorrect options #################################

    incorrect_options_texts2 = [q_row['Choice_' + x + '_text'] for x in ['A', 'B', 'C', 'D']
                                if x not in correct_answer]
    if n_corr == 3:
        # Slices instead of [0] / [1]: a question with a single incorrect
        # choice yields an empty second list rather than an IndexError.
        return {'attempt1.correct': correct_options_texts[:n_corr],
                'attempt2.correct': correct_options_texts[n_corr:],
                'attempt1.wrong': incorrect_options_texts2[:1],
                'attempt2.wrong': incorrect_options_texts2[1:2],
                'feedback': feedback}

    wrong_options_texts = []
    comp_wrong_options = [text for x in wrong for text in answers_for(x)]
    if len(comp_wrong_options) > 1:
        # .tolist() keeps the result a plain list of str like every other entry.
        wrong_options_texts = np.random.choice(comp_wrong_options, 4 - n_corr,
                                               replace=False).tolist()
    elif comp_wrong_options:
        wrong_options_texts = [comp_wrong_options[0]]

    r_opt = 4 - n_corr - len(wrong_options_texts)
    if r_opt > 0:
        part_wrong_options = [text for x in partial for text in answers_for(x)]
        if part_wrong_options:
            wrong_options_texts += list(sample_candidates(incorrect_options_texts2,
                                                          part_wrong_options, r_opt))
        else:
            # No partially correct student answers to top up with: fall back
            # to the question's own incorrect choices.
            wrong_options_texts += incorrect_options_texts2[:r_opt]

    return {'attempt1.correct': correct_options_texts[0:n_corr],
            'attempt2.correct': correct_options_texts[n_corr:],
            'attempt1.wrong': wrong_options_texts,
            'attempt2.wrong': incorrect_options_texts2,
            'feedback': feedback}

In [7]:
def select_options_mcsa(q_id):
    """Select the options for an MC question that is re-asked as SA.

    Reads the module-level ``df_ans`` / ``df_qns`` frames and draws student
    answers through ``sample_candidates``.

    Parameters
    ----------
    q_id
        Value matched against the ``Question_id`` column of both frames.

    Returns
    -------
    dict
        'attempt1.correct' / 'attempt2.correct': the first two and the
        remaining sampled correct student answers; 'attempt1.wrong': one
        sampled incorrect student answer (when any exists) plus the first
        incorrect official choice; 'attempt2.wrong': the remaining incorrect
        official choices; 'feedback': the correct official choice texts.
    """
    ans_data = df_ans[df_ans['Question_id'] == q_id].reset_index(drop=True)
    q_row = df_qns[df_qns['Question_id'] == q_id].to_dict(orient='records')[0]
    correct_answer = q_row['Correct_answer_choice'].split(',')

    def answers_with_score(score):
        # Prefer answers longer than 5 words; when none exist, use every
        # answer with that score rather than handing an empty pool on.
        texts = list(ans_data[ans_data['Student_score_on_question'] == score]['Answer_text'].values)
        long_texts = [x for x in texts if len(x.split(' ')) > 5]
        return long_texts if long_texts else texts

    # We use the correct option as feedback in case a student answers the
    # question incorrectly twice. We also show this if a student answered the
    # question correctly.
    feedback = [q_row['Choice_' + x + '_text'] for x in correct_answer]

    ############################## Choosing correct options ############################

    cor_ans = sample_candidates(feedback, answers_with_score(1), 4)

    ############################## Choosing incorrect options ##########################

    incor_pool = answers_with_score(0)
    # Every student may have answered correctly; then there is no student
    # distractor and only the official incorrect choice is used.
    if incor_pool:
        incor_ans = list(sample_candidates(feedback, incor_pool, 1, 'inv'))
    else:
        incor_ans = []

    wrong_opt = [q_row['Choice_' + x + '_text'] for x in ['A', 'B', 'C', 'D']
                 if x not in correct_answer]

    incor_ans += wrong_opt[:1]

    return {'attempt1.correct': cor_ans[0:2],
            'attempt2.correct': cor_ans[2:],
            'attempt1.wrong': incor_ans,
            'attempt2.wrong': wrong_opt[1:],
            'feedback': feedback}

In [8]:
def select_options_samc(q_id):
    """Select the options for an SA question that is re-asked as MC.

    Reads the module-level ``df_ans`` / ``df_qns`` frames and draws student
    answers through ``sample_candidates``. Student choices are grouped by
    their ``Student_choice_on_question`` value into correct (every letter is
    correct), incorrect (no letter is correct) and partial clusters.

    Parameters
    ----------
    q_id
        Value matched against the ``Question_id`` column of both frames.

    Returns
    -------
    dict
        'attempt1.correct' / 'attempt2.correct': one sampled answer from the
        correct clusters each; 'attempt1.wrong': up to three student
        distractors; 'attempt2.wrong': the official incorrect choices plus
        the fourth student distractor when one is available; 'feedback': the
        correct official choice texts.
    """
    ans_data = df_ans[df_ans['Question_id'] == q_id].reset_index(drop=True)
    q_row = df_qns[df_qns['Question_id'] == q_id].to_dict(orient='records')[0]
    # Strip each letter so that 'A,C' and 'A, C' spellings compare equal.
    correct_answer = [c.strip() for c in q_row['Correct_answer_choice'].split(',')]

    feedback = [q_row['Choice_' + x + '_text'] for x in correct_answer]

    def answers_for(cluster_names):
        # All answer texts of the students in the given choice clusters.
        texts = []
        for name in cluster_names:
            texts += list(ans_data[ans_data['Student_choice_on_question'] == name]['Answer_text'].values)
        return texts

    subset_of_correct = []
    subset_of_incorrect = []
    subset_part = []
    for x in np.unique(ans_data['Student_choice_on_question'].values):
        # One split for both tests; the old code split on ', ' in one test and
        # on ',' in the other, so partial answers were filed as incorrect.
        hits = [y.strip() in correct_answer for y in x.split(',')]
        if all(hits):
            subset_of_correct.append(x)
        elif not any(hits):
            subset_of_incorrect.append(x)
        else:
            subset_part.append(x)

    ############################## Choosing correct options ############################

    correct_options_texts = sample_candidates(feedback, answers_for(subset_of_correct), 2)

    ############################## Choosing incorrect options ##########################

    part_cand = answers_for(subset_part)
    wrong_cand = answers_for(subset_of_incorrect)

    wrong_options_given = [q_row['Choice_' + x + '_text'] for x in ['A', 'B', 'C', 'D']
                           if x not in correct_answer]

    if len(wrong_cand) >= 4:
        wrong_options_text = list(sample_candidates(wrong_options_given, wrong_cand, 4))
    else:
        wrong_options_text = list(wrong_cand)

    # Top up to four distractors from the partially correct answers, if any.
    r_opt = 4 - len(wrong_options_text)
    if r_opt > 0 and part_cand:
        wrong_options_text += list(sample_candidates(wrong_options_given, part_cand, r_opt))

    return {'attempt1.correct': [correct_options_texts[0]],
            'attempt2.correct': [correct_options_texts[1]],
            'attempt1.wrong': wrong_options_text[:3],
            # Slice, not [3]: fewer than four distractors must not raise.
            'attempt2.wrong': wrong_options_given + wrong_options_text[3:4],
            'feedback': feedback}

In [9]:
def print_questions(q):
    """Render the selected options of every question as one plain-text report.

    Parameters
    ----------
    q : dict
        Maps a question id to a dict with the keys 'type',
        'attempt1.correct', 'attempt1.wrong', 'attempt2.correct',
        'attempt2.wrong' and 'feedback' (sequences of str, 'type' a str).

    Returns
    -------
    str
        For each question: a header, the numbered options of attempt 1 and
        attempt 2 (correct ones first, numbering continues into the incorrect
        ones) and the feedback options. Empty string for an empty ``q``.
    """
    def attempt_lines(entry, attempt):
        # Correct options are numbered from 1; incorrect ones continue after them.
        lines = []
        right = entry[attempt + '.correct']
        for pos, option_text in enumerate(right):
            lines.append('Option' + str(pos + 1) + " (correct): " + option_text + '\n\n')
        for pos, option_text in enumerate(entry[attempt + '.wrong']):
            lines.append('Option' + str(pos + 1 + len(right)) + " (incorrect): " + option_text + '\n\n')
        return lines

    pieces = []
    for number in q:
        entry = q[number]
        pieces.append('Question ' + str(number) + '(' + entry['type'] + ')' + '\n\n' + 'Attempt 1' + '\n\n')
        pieces.extend(attempt_lines(entry, 'attempt1'))
        pieces.append('Attempt 2 \n\n')
        pieces.extend(attempt_lines(entry, 'attempt2'))
        pieces.append('Option(s) kept for feedback : ')
        pieces.extend('(+)' + kept + ' ' for kept in entry['feedback'])
        pieces.append(' \n\n')

    return ''.join(pieces)

In [10]:
#df_qns and df_ans, cosine_similarities, ans_dict and opt_dict  must be globally defined!
def main():
    """Build the option sets for every question in the global ``df_qns``.

    For each row the new question type comes from ``choose_type``; the pair
    (new type, original ``Question_type``) decides which ``select_options_*``
    function produces the options. The new type is stored in the result under
    the key 'type'. Rows whose pair matches none of the four combinations are
    left out.

    Returns
    -------
    dict
        Maps ``Question_id`` to the dict returned by the selector.
    """
    # (new type, original type) -> selector
    selectors = {
        ('MC', 'MC'): select_options_mcmc,
        ('SA', 'MC'): select_options_mcsa,
        ('MC', 'SA'): select_options_samc,
        ('SA', 'SA'): select_options_sasa,
    }

    questions = {}
    for _, row in df_qns.iterrows():
        question_id = row['Question_id']
        new_type = choose_type(question_id)
        selector = selectors.get((new_type, row['Question_type']))
        if selector is None:
            continue
        selected = selector(question_id)
        selected['type'] = new_type
        questions[question_id] = selected
    return questions

## Entry point

In the block below, set:
- regime: either development or validation
- df_ans: must be the dataset with answers
- df_qns: must be the dataset with questions

Run all the blocks and you will see which options the algorithm selects.

In [11]:
# Which data layout is loaded below: 'development' or 'validation'.
regime = 'development'
df_ans = pd.read_csv('Answers_data_prj3_update2.csv')
df_qns = pd.read_csv('Questions_data_prj3.csv')

# Validation-set files (use these two reads instead when regime is 'validation'):
#df_ans = pd.read_csv('ans_val.csv')
#df_qns = pd.read_csv('q_val.csv')


# The development and validation sets do not follow the same format, so each
# one gets its own patch-up.

if regime == 'development':
    # Development set only: set the question type of the first two rows.
    for row_label, question_type in ((0, 'MC'), (1, 'SA')):
        df_qns.at[row_label, 'Question_type'] = question_type

if regime == 'validation':
    df_qns.at[0, 'Correct_answer_choice'] = 'A,C'

    # Rewrite two- and three-letter choices from 'A,C' to 'A, C'.
    choice_col = 'Student_choice_on_question'
    for i in range(len(df_ans)):
        parts = df_ans.at[i, choice_col].split(',')
        if len(parts) in (2, 3):
            df_ans.at[i, choice_col] = ', '.join(parts)

In [12]:
# MAKE SURE YOUR RUN THIS
allsentences = []
for i in range(len(df_qns)):
    allsentences += df_qns.iloc[i][['Choice_A_text', 'Choice_B_text', 'Choice_C_text', 'Choice_D_text']].tolist()
allsentences += df_ans['Answer_text'].tolist()

allsentences = [word_extraction(x) for x in allsentences]
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(allsentences)

ans_dict = {x : i for (i, x) in enumerate(allsentences)}

opt_dict = {x : i for (i, x) in enumerate(allsentences[: 4 * len(df_qns)])}

vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(allsentences)

cosine_similarities = linear_kernel(tf_idf, tf_idf)

In [13]:
# Build the option sets for every question in df_qns. The selection samples
# with np.random and no seed is set in this notebook, so results differ
# between runs.
q = main()

In [14]:
# Show the selected options for both attempts of every question as plain text.
print(print_questions(q))

Question 1(SA)

Attempt 1

Option1 (correct): Rajeev is getting a lot of practice, but he should ask an art instructor each day for feedback on how well he drew the day’s portrait

Option2 (correct): Despite doing so much practice, there is no one to give Rajeev feedback on his errors and remedial training based on that feedback. So while Rajeev is practicing a lot, he is not doing deliberate practice which focuses on correcting his errors and then improving his skill.

Option3 (incorrect): The task of creating a portrait is a complex one and should be studied step by step rather than in full. Hence, Rajeev should identify what parts of the portrait are problematic and practice them regularly instead of drawing new full portrait every day.

Option4 (incorrect): Rajeev is practicing different tasks every day, so his practice isn’t deliberate. He should try and draw the same portrait every day

Attempt 2 

Option1 (correct): He is not doing deliberate practice. B/c there is no feedback p

In [None]:
import pickle

# Optional: save the generated questions so they can be reloaded later without
# re-running the selection. Uncomment to use.
#with open('dev.pickle', 'wb') as handle:
#   pickle.dump(q, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Optional: reload previously saved questions. Only unpickle files you created
# yourself, because pickle.load can execute arbitrary code from the file.
#with open('dev.pickle', 'rb') as handle:
#    q = pickle.load(handle)