In [3]:
import os
import re
from time import time

import numpy as np
from numpy.linalg import norm
import pandas as pd
from sklearn.model_selection import train_test_split

from utils import GradeXML2DataFrame
from utils import get_hash
from utils import get_reference_answers
from utils import Featurizer


In [12]:
# Load the graded instances from the munge directory, report how many rows
# were read, and preview the first few of them as the cell output.
data = pd.read_csv(os.path.join("munge", "grade_data.csv"))
print('Found %d instances in modeling set.' % data.shape[0])
data.head()

Found 898 instances in modeling set.


Unnamed: 0,instance_id,student_id,task_id,problem_description,question,answer,reference_answers,label
0,1,DTSU040,LP03_PR09.bLK.sh,"A car windshield collides with a mosquito, squ...",How does Newton's third law apply to this situ...,the windshield will apply a force to the mosqu...,1: Since the windshield exerts a force on the...,1
1,2,DTSU035,FM_LV04_PR05.sh,Two hockey players pass a puck between them on...,What forces are acting on the puck while the p...,The normal force coming from the ice and the g...,1: The forces acting on the puck while it is ...,0
2,3,DTSU021,FM_LVxx_PR01,A rocket pushes a meteor with constant force. ...,Can you articulate Newton's second law?,"if there is a zero net force on the object, th...",1: Newton's 2nd Law says that the net force i...,3
3,4,DTSU033,LP03_PR09.bLK.sh,"A car windshield collides with a mosquito, squ...",Can you articulate a principle or definition w...,An equal force always balancing it out regardl...,"1: For every action, there is an equal and op...",3
4,5,DTSU015,FM_LV04_PR05,Two hockey players pass a puck between them on...,"Based on Newton's first law, what can you say ...",The speed of the puck will equal to the net fo...,1: The puck will move in a straight line with...,3


In [1]:

def landmarks(reference_landmarks, student_landmarks=None):
    """Write landmark answers, their labels and their embeddings to disk.

    Reference answers and student answers are read from csv, keyed by the
    hashes of their problem description and question, de-duplicated and
    embedded with a ``Featurizer``. Every landmark whose embedding has a
    non-zero norm is written as one tab-separated row to
    ``munge/landmarks.txt`` (columns: pd_hash, qu_hash, label, answer,
    embedding, where the embedding is a comma-separated list of values).

    Args:
        reference_landmarks: path to a csv file with the columns
            ``problem_description``, ``question``, ``reference_answers``
            and ``label``. Every reference answer is stored with label 0.
        student_landmarks: path to a csv file with the columns
            ``problem_description``, ``question``, ``answer`` and
            ``label``. When omitted, the student answers are read from
            ``reference_landmarks`` as well, so a single csv that holds
            both kinds of answers can be passed on its own.

    Returns:
        None. Progress and counts are printed; the result is the file
        written to ``munge/landmarks.txt``.
    """
    if student_landmarks is None:
        student_landmarks = reference_landmarks

    # Read a dataset containing all reference answers
    ra_data = pd.read_csv(reference_landmarks)

    # Create hash keys for problem description and question
    ra_data['pd_hash'] = ra_data['problem_description'].apply(get_hash)
    ra_data['qu_hash'] = ra_data['question'].apply(get_hash)

    # Create a dataframe of reference answers one per row
    ra_data['ra_list'] = ra_data['reference_answers'].apply(get_reference_answers)
    landmarks_ra = (
        ra_data[['pd_hash', 'qu_hash', 'label', 'ra_list']]
        .explode('ra_list')
        .rename(columns={'ra_list': 'answer'})
    )
    landmarks_ra['label'] = 0 # these are possible correct answers (class 0)
    landmarks_ra = landmarks_ra.drop_duplicates()
    print("Found {} distinct reference landmark answers."\
        .format(len(landmarks_ra)))

    # Create a dataframe of student answers
    sa_data = pd.read_csv(student_landmarks)

    # Create hash keys for problem description and question
    sa_data['pd_hash'] = sa_data['problem_description'].apply(get_hash)
    sa_data['qu_hash'] = sa_data['question'].apply(get_hash)

    landmarks_sa = sa_data[['pd_hash', 'qu_hash', 'label', 'answer']]
    landmarks_sa = landmarks_sa.drop_duplicates()
    print("Found {} distinct student landmark answers."\
        .format(len(landmarks_sa)))

    # Create the landmarks dataframe with distinct answers and their labels.
    # pd.concat is used because DataFrame.append no longer exists in
    # pandas >= 2.0.
    all_landmarks = pd.concat([landmarks_ra, landmarks_sa]).drop_duplicates()

    # Create a featurizer object that converts a phrase into embedding vector
    emb_file = os.path.join("munge", "GoogleNews-vectors-negative300.bin")
    featurizer = Featurizer(emb_file)

    # Save the embeddings and labels to disk
    start = time()
    n_landmarks = 0
    columns = ['pd_hash', 'qu_hash', 'label', 'answer']
    with open(os.path.join("munge", "landmarks.txt"), 'w') as f:
        f.write('pd_hash\tqu_hash\tlabel\tanswer\tembedding\n')
        # NOTE(review): answers are written verbatim, so an answer that
        # contains a tab or a newline would break the row layout.
        rows = all_landmarks[columns].itertuples(index=False, name=None)
        for pd_hash, qu_hash, label, answer in rows:
            emb = featurizer.doc2vec(answer)
            # Landmarks with an all-zero embedding are not written.
            if norm(emb) == 0:
                continue
            n_landmarks += 1
            emb_txt = ','.join(map(str, emb))
            f.write("%s\t%s\t%s\t%s\t%s\n"\
                %(pd_hash, qu_hash, label, answer, emb_txt))
    print('Generating landmark embeddings took %.2f seconds.'\
        %(time() - start))
    print("Found {} non zero landmarks in total.".format(n_landmarks))



In [None]:
# landmarks() takes the reference-answer csv and the student-answer csv as
# two arguments. The same file is passed for both on the assumption that
# modeling.csv carries the `reference_answers` and the `answer` columns
# side by side -- TODO confirm against the file that is actually in munge/.
modeling_csv = os.path.join("munge", "modeling.csv")
landmarks(modeling_csv, modeling_csv)