This notebook serves two purposes:
1. It curates the data for the 3D grader.
2. It builds models for grading EiPE answers (the data is 3D: every answer is labeled as unambiguous, correct, and high-level).

In [47]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.mixture import GaussianMixture
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold


from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier


import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay


nltk.download('punkt')
nltk.download('words')

from nltk.corpus import words




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\chine\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [48]:
# CSV containing the graded student responses; the cells below read the
# per-grader label columns (max_*, chinny_*, binglin_*) and the "qid" column from it.
data_file = "3d_label_fullset_merged.csv"
# Load the whole file into one DataFrame that the rest of the notebook works on.
alldata_df = pd.read_csv(data_file, encoding="utf-8")

In [49]:
%%capture

#generate the final labels for the 3d grader - currently done based on majority grade
# Per-row majority votes, one list per grading dimension. Every row contributes
# one entry to each list (the entry is -1 when the vote could not be reconciled).
unambig = []
corr = []
high_lvl = []
# (unambig, correct, high level) tuples, stored only for rows in which all
# three dimensions were reconciled.
majority_3d_labels = []

def get_final_vote(person1label, person2label, person3label):
    '''
    Return the majority binary label among three graders.

    Only labels equal to 0 or 1 are counted as votes. Returns 0 when zeros
    outnumber ones, 1 when ones outnumber zeros, and -1 when the two counts
    are tied (which can only happen if some label is neither 0 nor 1).
    '''
    votes = [person1label, person2label, person3label]

    # Positive margin -> more ones, negative margin -> more zeros.
    margin = votes.count(1) - votes.count(0)

    if margin < 0:
        return 0
    if margin > 0:
        return 1
    return -1
        

        

for index, row in alldata_df.iterrows():

    # Majority vote for each dimension, in (unambig, correct, high level) order.
    votes = tuple(
        get_final_vote(row["max_" + dim], row["chinny_" + dim], row["binglin_" + dim])
        for dim in ("una", "c", "hl")
    )
    majority_unambig, majority_correct, majority_highlvl = votes

    if -1 in votes: #something wrong with data
        print("Could not reconcile row {}".format(index))
    else:
        majority_3d_labels.append(votes) #store the data in other format

    # The per-dimension lists get an entry for every row, reconciled or not.
    unambig.append(majority_unambig)
    corr.append(majority_correct)
    high_lvl.append(majority_highlvl)


# Attach the majority votes to the frame as three new columns.
for column_name, column_votes in (("majority_una", unambig),
                                  ("majority_c", corr),
                                  ("majority_hl", high_lvl)):
    alldata_df[column_name] = pd.Series(column_votes)


# Distinct question ids, in order of first appearance.
qids = alldata_df["qid"].unique()

# Persist the frame, including the new majority columns, without the index.
alldata_df.to_csv(path_or_buf = "majority_vote_added.csv", index = False)


In [50]:
#determine the amount of agreement between the human graders

# Number of rows in which at least 1 / at least 2 / all 3 graders gave exactly
# the same (unambig, correct, high level) triple as the majority vote.
at_least_one_agrees = 0
at_least_two_agree = 0
at_least_three_agree = 0

num_rows_total = 0


for index, row in alldata_df.iterrows():

    num_rows_total += 1

    max_labels = (row["max_una"], row["max_c"], row["max_hl"])
    chinny_labels = (row["chinny_una"], row["chinny_c"], row["chinny_hl"])
    binglin_labels = (row["binglin_una"], row["binglin_c"], row["binglin_hl"])

    all_human_labels = [max_labels, chinny_labels, binglin_labels]
    cur_majority_vote = (row["majority_una"], row["majority_c"], row["majority_hl"])

    # How many graders produced the full majority triple for this row.
    num_matching_graders = all_human_labels.count(cur_majority_vote)

    if num_matching_graders >= 1:
        at_least_one_agrees += 1
    if num_matching_graders >= 2:
        at_least_two_agree += 1
    if num_matching_graders >= 3:
        at_least_three_agree += 1


# Scale to a percentage BEFORE rounding: multiplying an already rounded ratio
# by 100 can reintroduce floating-point noise (e.g. 56.89999999999999).
print("At least one human grader matched the majority vote {}% of the time".format(round(100 * at_least_one_agrees/num_rows_total, 1)))
print("At least two human graders matched the majority vote {}% of the time".format(round(100 * at_least_two_agree/num_rows_total, 1)))
print("All three human graders matched the majority vote {}% of the time".format(round(100 * at_least_three_agree/num_rows_total, 1)))

        
        

At least one human grader matched the majority vote 99.5% of the time
At least two human graders matched the majority vote 88.3% of the time
All three human graders matched the majority vote 45.7% of the time


In [4]:
# The eight possible (unambig, correct, highlevel) combinations, in display order.
buckets = [(0,0,0), (0,0,1), (0,1,0), (0,1,1), (1,0,0), (1,1,0), (1,0,1), (1,1,1)]
# Only fully reconciled rows are present in majority_3d_labels.
lendata = len(majority_3d_labels)


print("3D groupings = (unambig, correct, highlevel)\n\n")
print("3D label,  Freq. (Num Times Occur)")


# One output line per combination: label, share of all reconciled rows, raw count.
for bucket in buckets:
    bucket_count = majority_3d_labels.count(bucket)
    bucket_pct = round(100*bucket_count/lendata, 2)
    print("{}  {:>5.2f}% ({})".format(bucket, bucket_pct, bucket_count))



    
def display_class_dist(given_df):
    '''
    Print the distribution of majority labels in given_df over the 8 possible
    (unambig, correct, highlevel) bins.

    given_df: DataFrame with the columns "majority_una", "majority_c" and
              "majority_hl".

    Rows whose label triple is not one of the 8 bins (for example rows holding
    a -1 "could not reconcile" vote) are skipped and do not count towards the
    percentages. Nothing is returned; the table is written to stdout.
    '''
    ordering = [(0,0,0), (0,0,1), (0,1,0), (0,1,1), (1,0,0), (1,1,0), (1,0,1), (1,1,1)]
    classes = {curbin: 0 for curbin in ordering}

    print("3D groupings = (unambig, correct, highlevel)\n\n")
    print("3D label,  Freq. (Num Times Occur)")

    numrowsdata = 0

    # Walk the three label columns of the frame that was passed in (not a global).
    label_triples = zip(given_df["majority_una"], given_df["majority_c"], given_df["majority_hl"])
    for cur_label in label_triples:
        if cur_label not in classes:
            continue #unreconciled or malformed row - not part of any bin
        classes[cur_label] += 1
        numrowsdata += 1

    for curbin in ordering:
        curbincount = classes[curbin] #get the number of occurrences of a particular labeling configuration within the data
        # Guard against an empty frame so that we print 0.00% instead of dividing by zero.
        curbinpct = 100*curbincount/numrowsdata if numrowsdata else 0.0
        print("{}  {:>5.2f}% ({})".format(curbin, round(curbinpct, 2), curbincount)) #print group name, group freq and raw count of group occur.


    
        
        


3D groupings = (unambig, correct, highlevel)


3D label,  Freq. (Num Times Occur)
(0, 0, 0)   5.76% (157)
(0, 0, 1)  13.00% (354)
(0, 1, 0)   1.25% (34)
(0, 1, 1)   9.36% (255)
(1, 0, 0)   4.66% (127)
(1, 1, 0)   6.61% (180)
(1, 0, 1)  20.56% (560)
(1, 1, 1)  38.80% (1057)


In [5]:
#preview of the data
alldata_df.head()

Unnamed: 0,pl_qid,qid,code,assumption,example_correct_answers,response,max_una,max_c,max_hl,chinny_una,chinny_c,chinny_hl,binglin_una,binglin_c,binglin_hl,majority_una,majority_c,majority_hl
0,cdrd_exam2_manual/cdrd_exam2_1,return the location of an element if it is in ...,"def f(x, y):\n if y in x:\n return x...",Assume that variable x is a list of strings an...,['Return the position of a given element in a ...,returns the index of a string in a list if it ...,0,1,1,0,1,1.0,1,0,1,0,1,1
1,cdrd_exam1_manual/cdrd_exam1_combined,print_items_out_smaller_then_larger,"def f(x, y):\n if x < y:\n print(x, ...",Assume that the variables x and y are integers.,['prints two given numbers in numberical order...,Prints lesser number then greater number,0,1,1,1,1,1.0,1,1,1,1,1,1
2,cdrd_final_manual/cdrd_final_1,find_largest_number_in_file,def f(x):\n l = open(x).readlines()\n n ...,Assume that the variable x is a string contain...,['Return the largest number from a given file'...,returns n if less than or equal to i,1,0,0,1,0,0.0,0,0,0,1,0,0
3,cdrd_final_manual/cdrd_final_2,a33_replace_all_ys_with_zs,"def f(x, y, z):\n for i in range(len(x)):\n...",Assume that the variable x is a list of intege...,['Replace every element equal to y in the give...,"if a value in list x is equal to y, value is c...",0,1,1,1,1,1.0,0,1,1,0,1,1
4,cdrd_exam2_manual/cdrd_exam2_2,increase_all_numbers_by_y,"def f(x,y):\n for k in range(len(x)):\n ...",Assume that the variable x is a list of number...,"['Given a list and a number, increase every el...",add y to each number of a list,1,1,1,1,1,1.0,1,1,1,1,1,1


In [6]:
# Single stemmer instance reused for every response.
stemmer = PorterStemmer()

def preprocess_text(studentresponse):
    '''
    Normalize one student response into a space-separated string of tokens.

    The response is tokenized, every token is stemmed and lowercased, and
    tokens that are not purely alphanumeric (e.g. punctuation) are dropped.
    '''
    tokens = word_tokenize(studentresponse)

    # Stem first, then lowercase, matching the order the tokens are processed in.
    normalized_tokens = (stemmer.stem(token).lower() for token in tokens)

    #TODO - need to handle stop words by passing in my own list. (related to tf-idf weighting)

    return " ".join(token for token in normalized_tokens if token.isalnum())

In [37]:
def plot_confusion_matrix(confusion_matrix, what_to_predict, accuracy ):
    '''
    Draw a confusion matrix as an annotated heatmap on a new 9x9 figure.

    confusion_matrix: the matrix of values to draw; the y axis is labelled
                      "Actual label" and the x axis "Predicted label"
    what_to_predict:  text placed in the title to name what was predicted
    accuracy:         accuracy score, shown in the title with two decimals
    '''
    plt.figure(figsize=(9,9))
    sns.heatmap(
        confusion_matrix,
        annot=True,
        fmt=".3f",
        linewidths=.5,
        square = True,
        cmap = 'Blues_r',
    )
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    title_template = 'Accuracy Score for {thingtopredict}: {accscore:.2f}'
    all_sample_title = title_template.format(thingtopredict = what_to_predict, accscore = accuracy)
    plt.title(all_sample_title, size = 15)
        

def create_input_representation(given_df, pct_for_train = 0.8, display_model_info = False):
    '''
    Turn the "response" column of given_df into a dense matrix of bigram counts.

    given_df:           DataFrame with a "response" column of student answers
    pct_for_train:      fraction of the rows (taken from the top, in row order)
                        used to fit the vocabulary
    display_model_info: when True, print the vocabulary, the matrix shape and
                        the vocabulary indices

    Returns a 2-D numpy array with one row per response and one column per
    bigram that occurs in at least two of the training responses.
    '''
    processed_studentanswers = [preprocess_text(studentanswer) for studentanswer in given_df["response"]]
    numsamples = len(processed_studentanswers)

    # Bigrams only (ngram_range = (2,2)); min_df = 2 drops bigrams seen in a single response.
    vectorizer = CountVectorizer(ngram_range = (2,2), min_df = 2)

    #fit only on the leading training rows so that no vocabulary is learned from
    #the rows at the end of the frame; all rows are then transformed with it
    vec_fitter = vectorizer.fit(processed_studentanswers[:int(numsamples *  pct_for_train)])
    bigram_counts = vec_fitter.transform(processed_studentanswers)

    if display_model_info:
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only on releases that predate it.
        if hasattr(vectorizer, "get_feature_names_out"):
            feature_names = list(vectorizer.get_feature_names_out())
        else:
            feature_names = vectorizer.get_feature_names()
        print("Unique words in vocab = ", feature_names)
        print("\n\nShape of processed input vector = ", bigram_counts.shape)
        print("\n\nVocab and indices = ", vectorizer.vocabulary_)#indices of each word

    return bigram_counts.toarray()

def split_data_and_train(given_Xdf, given_ydf, given_train_pct, qname = ""):
    '''
    Train and evaluate one logistic regression model per label column.

    given_Xdf:       2-D array of input features, one row per sample
    given_ydf:       2-D array of labels whose columns 0, 1, 2 are reported as
                     "Unambig", "Correct" and "High level"
    given_train_pct: fraction of the rows (taken from the top) used for
                     training; the remaining rows form the withheld test set
    qname:           only referenced by the commented-out plotting call

    Prints the accuracy and the row-normalized confusion matrix of every model
    and returns the three test accuracies as a list, in column order.
    '''
    # No shuffling: the first `divider` rows train the models, the rest test them.
    divider = int(given_Xdf.shape[0] * given_train_pct)

    X_train, X_test = given_Xdf[:divider], given_Xdf[divider:]
    y_train, y_test = given_ydf[:divider], given_ydf[divider:]

    #NOTE: Order for data is [unambig, correct, high level]
    labels_order = ["Unambig", "Correct", "High level"]
    accuracy_l = []

    for i, label_name in enumerate(labels_order):

        # using standard log. regression model, trained on a single label column
        model = LogisticRegression(random_state=0).fit(X_train, y_train[:, i])

        #use the withheld set for testing the model
        binary_y_predictions = model.predict(X_test)
        print("{} Log reg. model performance on the withheld test set:\n\n".format(label_name))

        #compare the predictions against the true labels of column i
        true_labels = y_test[:, i]
        dim_accuracy = metrics.accuracy_score(true_labels, binary_y_predictions)
        print("Accuracy is = ", dim_accuracy)
        accuracy_l.append(dim_accuracy)

        print("\n\nConfusion Matrix:")

        # normalize="true" turns every row (actual class) into fractions
        confusion_matrix = metrics.confusion_matrix(true_labels, binary_y_predictions, normalize="true")
        print(confusion_matrix)

        #uncomment next line to print the confusion matrix
        #plot_confusion_matrix(confusion_matrix, labels_order[i] + qname,  dim_accuracy)
        print("\n\n")

    return accuracy_l #accuracy for the three dimensions for a given problem

        


In [39]:
#for each question: build the bigram input representation, train one model
#per label dimension and keep the three test accuracies that come back

train_pct = 0.8
qids_accuracy = []

# Label columns in the order [unambig, correct, high level].
label_columns = ["majority_una", "majority_c", "majority_hl"]

for cur_qid in qids:

    # Restrict the data to the responses of the current question.
    cur_question_df = alldata_df[alldata_df["qid"] == cur_qid]
    num_labeled = cur_question_df.shape[0]
    print("QID = \'{}.\' Amount of labeled data = {} samples\n\n".format(cur_qid, num_labeled))

    X = create_input_representation(cur_question_df, pct_for_train = train_pct, display_model_info = False)
    y = cur_question_df[label_columns].to_numpy(dtype=int)

    qname_formatted = ": " + cur_qid #format cur_qid name so it looks nice when printed with other function
    cur_qid_accuracy = split_data_and_train(X, y, train_pct, qname = qname_formatted)
    qids_accuracy.append(cur_qid_accuracy)
    print("\n\n")


  

QID = 'return the location of an element if it is in the list.' Amount of labeled data = 295 samples


Unambig Log reg. model performance on the withheld test set:


Accuracy is =  0.5932203389830508


Confusion Matrix:
[[0.04545455 0.95454545]
 [0.08108108 0.91891892]]



Correct Log reg. model performance on the withheld test set:


Accuracy is =  0.864406779661017


Confusion Matrix:
[[0.85185185 0.14814815]
 [0.125      0.875     ]]



High level Log reg. model performance on the withheld test set:


Accuracy is =  1.0


Confusion Matrix:
[[1.]]






QID = 'print_items_out_smaller_then_larger.' Amount of labeled data = 466 samples


Unambig Log reg. model performance on the withheld test set:


Accuracy is =  0.6276595744680851


Confusion Matrix:
[[0.14814815 0.85185185]
 [0.17910448 0.82089552]]



Correct Log reg. model performance on the withheld test set:


Accuracy is =  0.723404255319149


Confusion Matrix:
[[0.42424242 0.57575758]
 [0.1147541  0.8852459 ]]



High level Lo

Unambig Log reg. model performance on the withheld test set:


Accuracy is =  0.6666666666666666


Confusion Matrix:
[[0.25  0.75 ]
 [0.125 0.875]]



Correct Log reg. model performance on the withheld test set:


Accuracy is =  0.75


Confusion Matrix:
[[0.8        0.2       ]
 [0.28571429 0.71428571]]



High level Log reg. model performance on the withheld test set:


Accuracy is =  0.9166666666666666


Confusion Matrix:
[[0.66666667 0.33333333]
 [0.         1.        ]]






QID = 'is_even.' Amount of labeled data = 48 samples


Unambig Log reg. model performance on the withheld test set:


Accuracy is =  0.6


Confusion Matrix:
[[0.   1.  ]
 [0.25 0.75]]



Correct Log reg. model performance on the withheld test set:


Accuracy is =  0.8


Confusion Matrix:
[[0.66666667 0.33333333]
 [0.14285714 0.85714286]]



High level Log reg. model performance on the withheld test set:


Accuracy is =  0.9


Confusion Matrix:
[[0. 1.]
 [0. 1.]]








In [41]:
#show the accuracy of the classifier for each column across all data

# One row per qid, one column per dimension in [unambig, correct, high level] order.
qids_accuracy = np.array(qids_accuracy)
print("Averaging results for all qids in each of the respective dimensions\n")

# Mean over the qids of each individual dimension.
avg_unambig_accuracy = qids_accuracy[:, 0].mean()
print("Avg accuracy for Unambig: {:.2f}".format(avg_unambig_accuracy))

avg_corr_accuracy = qids_accuracy[:, 1].mean()
print("Avg accuracy for Correct column: {:.2f}".format(avg_corr_accuracy))

avg_highlvl_accuracy = qids_accuracy[:, 2].mean()
print("Avg accuracy for High-level column: {:.2f}\n\n".format(avg_highlvl_accuracy))

# Mean over every qid and every dimension at once.
avg_classifier_perf_overall = qids_accuracy.mean()
print("Average results across all dimensions for all qids: {:.2f}".format(avg_classifier_perf_overall))

Averaging results for all qids in each of the respective dimensions

Avg accuracy for Unambig: 0.69
Avg accuracy for Correct column: 0.84
Avg accuracy for High-level column: 0.89


Average results across all dimensions for all qids: 0.81
