In [1]:
import nltk
import glob
import plistlib
from typing import Union, List, Dict, Tuple, Iterable
import pandas as pd

In [2]:
# Download the NLTK resources used by this notebook: the "punkt" tokenizer
# models (needed by nltk.word_tokenize) and the averaged-perceptron POS tagger.
# NOTE(review): no POS tagging is visible in this notebook — confirm the
# tagger download is still required.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/jackwang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jackwang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
def read_plist(path: str) -> Tuple[List[Dict], str]:
    """Load a ``.plist`` file and return its contents with the file's base name.

    Args:
        path: Path to the plist file.

    Returns:
        A ``(data, name)`` tuple where ``data`` is whatever ``plistlib.load``
        decodes from the file and ``name`` is the file name with its directory
        part and final extension removed.
    """
    import os  # local import: the notebook's import cell does not bring in ``os``

    with open(path, 'rb') as f:
        data = plistlib.load(f)
    # os.path understands the platform's separator (glob yields backslashes on
    # Windows) and strips only the last extension, so dots inside the name survive.
    name = os.path.splitext(os.path.basename(path))[0]
    return data, name

In [4]:
# Collect every *.plist file under Result/. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them to keep the run reproducible.
file_list = sorted(glob.glob('Result/*.plist'))
# Lazy, single-pass iterator of (data, file_name) pairs; a file is only read
# when the iterator is advanced.
file_generator = map(read_plist, file_list)

In [5]:
# Iterator that the sampling cell pulls files from via get_next_item.
# map() already returns an iterator, so iter() hands back the same
# single-pass object: items consumed here are gone from file_generator too.
it = iter(file_generator)

In [6]:
def create_passage_dict(csv_path: str) -> Dict:
    """Build a ``passage_id -> passage`` lookup from a CSV file.

    Args:
        csv_path: Path to a CSV file with ``passage_id`` and ``passage`` columns.

    Returns:
        A dict mapping each ``passage_id`` to its ``passage`` value. If an id
        occurs more than once, the last row wins.

    Raises:
        KeyError: If either required column is missing from the CSV.
    """
    df = pd.read_csv(csv_path)
    # Zip the two columns directly instead of DataFrame.iterrows(), which
    # materialises a Series per row and is needlessly slow for a plain lookup.
    return dict(zip(df['passage_id'], df['passage']))

In [7]:
# Build the passage lookup once; read_and_tokenize_file reads this
# module-level dict by passage id.
passage_dict = create_passage_dict("yrs12_passages.csv")

In [8]:
def parse_files_name(string):
    """Split a file name into its student, passage and random-number fields.

    The name is expected to follow the schema
    ``student_{student_id}_passage_{passage_id}_{random_number}``. Only the
    passage id is converted to ``int``; the other two fields stay strings.
    """
    fields = string.split('_')
    parsed = {'student_id': fields[1]}
    parsed['passage_id'] = int(fields[3])
    parsed['random_number'] = fields[4]
    return parsed

In [9]:
def read_and_tokenize_file(file_name):
    """Return the reference passage for ``file_name`` as a list of token records.

    ``file_name`` follows the schema
    ``student_{student_id}_passage_{passage_id}_{random_number}``. The passage
    text is looked up in the module-level ``passage_dict`` by passage id and
    tokenized with ``nltk.word_tokenize``. Only purely alphabetic tokens are
    kept (so punctuation and digits are dropped; stop words are NOT removed).
    Each kept token is lower-cased and stored as
    ``{"tString": token, "tConfidence": -1}``.

    Raises ``KeyError`` when the passage id is not present in ``passage_dict``.
    """
    passage_id = parse_files_name(file_name)['passage_id']
    passage_text = passage_dict[passage_id]
    records = []
    for token in nltk.word_tokenize(passage_text):
        if not token.isalpha():
            continue
        records.append({"tString": token.lower(), "tConfidence": -1})
    return records

In [10]:
def get_next_item(it):
    """Advance ``it`` until an item can be paired with its reference passage.

    ``it`` yields ``(data, file_name)`` tuples. For each one the reference
    passage is tokenized with ``read_and_tokenize_file``; an item that raises
    ``KeyError`` is skipped and the next one is tried.

    Returns ``((tokenized_text, data), file_name)`` for the first item that
    succeeds. ``StopIteration`` from ``next(it)`` is not caught, so it
    propagates once the iterator is exhausted.
    """
    while True:
        try:
            data, file_name = next(it)
            reference_tokens = read_and_tokenize_file(file_name)
        except KeyError:
            # Skip this item and try the following one.
            continue
        else:
            return (reference_tokens, data), file_name

In [11]:
def lcss(ref_text: List[Dict], student_input: List[Dict],
         min_confidence: float = 0.01) -> List[Tuple[Dict, str]]:
    """Align reference tokens with student tokens using Longest Common Subsequence.

    Args:
        ref_text: Reference records, each with a ``tString`` key. The strings
            are compared as-is, so they should already be lower-cased.
        student_input: Student records, each with ``tString`` and
            ``tConfidence`` keys. ``tString`` is lower-cased before comparison.
        min_confidence: A student record can only match a reference record if
            its ``tConfidence`` is at least this value (default ``0.01``).

    Returns:
        A list of ``(record, status)`` tuples in sequence order covering every
        input record: ``"M"`` with the student record for a matched pair,
        ``"R"`` with a reference record that has no match, and ``"A"`` with a
        student record that has no match.

    Raises:
        KeyError: If a record lacks a required key.
    """
    n_ref, n_student = len(ref_text), len(student_input)
    # lengths[i][j] = LCS length of ref_text[:i] and student_input[:j];
    # row 0 and column 0 stay 0.
    lengths = [[0] * (n_student + 1) for _ in range(n_ref + 1)]
    for i, rec_x in enumerate(ref_text):
        x = rec_x['tString']
        for j, rec_y in enumerate(student_input):
            y = rec_y['tString'].lower()
            if x == y and rec_y['tConfidence'] >= min_confidence:
                lengths[i + 1][j + 1] = lengths[i][j] + 1
            else:
                lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])

    # Walk back from the bottom-right corner, collecting records in reverse.
    result = []
    x, y = n_ref, n_student
    while x != 0 and y != 0:
        if lengths[x][y] == lengths[x - 1][y]:
            result.append((ref_text[x - 1], "R"))
            x -= 1
        elif lengths[x][y] == lengths[x][y - 1]:
            result.append((student_input[y - 1], "A"))
            y -= 1
        else:
            # The cell is larger than both neighbours, which the fill step
            # above only produces for a matching pair.
            result.append((student_input[y - 1], "M"))
            x -= 1
            y -= 1
    # One sequence may still have leading records left once the other is
    # exhausted; emit them instead of silently dropping them.
    while x != 0:
        result.append((ref_text[x - 1], "R"))
        x -= 1
    while y != 0:
        result.append((student_input[y - 1], "A"))
        y -= 1
    return result[::-1]

In [12]:
# Align one sample: get_next_item returns ((tokenized_text, data), file_name),
# and element [0] is unpacked into lcss as (ref_text, student_input).
common = lcss(*get_next_item(it)[0])

In [13]:
def get_string_with_status(lcs_matching_result):
    """Render an alignment as one space-separated string of tagged words.

    ``lcs_matching_result`` is a list of ``(record, status)`` pairs where each
    record has ``tString`` and ``tConfidence`` keys. A pair with status ``M``
    is rendered as ``word<confidence>`` with the confidence formatted to two
    decimals; any other pair is rendered as ``word<status>``.
    """
    pieces = []
    for record, status in lcs_matching_result:
        word = record['tString']
        confidence = record['tConfidence']
        if status == 'M':
            tag = f'{confidence:.2f}'
        else:
            tag = status
        pieces.append(f'{word}<{tag}>')
    return ' '.join(pieces)

In [14]:
# Display the sample alignment as a single annotated string.
get_string_with_status(common)

'monte<R> loved<0.17> and<A> cooking<0.59> for<0.67> people<0.69> his<0.65> favorite<0.64> thing<0.69> to<R> make<R> was<0.83> tacos<0.86> and<0.86> refried<0.85> beans<0.89> then<A> when<R> he<0.86> made<0.85> them<0.87> and<A> he<0.86> felt<0.85> like<0.81> a<0.46> master<0.80> chief<A> shift<A> chef<R>'

In [15]:
# Inspect the raw (record, status) pairs of the sample alignment.
common

[({'tString': 'monte', 'tConfidence': -1}, 'R'),
 ({'tConfidence': 0.16599999368190765,
   'tDuration': 0.48,
   'tString': 'loved',
   'tTime': 3.33},
  'M'),
 ({'tConfidence': 0.014999999664723873,
   'tDuration': 0.1499999999999999,
   'tString': 'and',
   'tTime': 3.81},
  'A'),
 ({'tConfidence': 0.5920000076293945,
   'tDuration': 0.6500000000000004,
   'tString': 'cooking',
   'tTime': 3.96},
  'M'),
 ({'tConfidence': 0.6660000085830688,
   'tDuration': 0.25,
   'tString': 'for',
   'tTime': 4.61},
  'M'),
 ({'tConfidence': 0.6919999718666077,
   'tDuration': 0.5699999999999994,
   'tString': 'people',
   'tTime': 4.86},
  'M'),
 ({'tConfidence': 0.652999997138977,
   'tDuration': 0.33000000000000007,
   'tString': 'his',
   'tTime': 5.55},
  'M'),
 ({'tConfidence': 0.6389999985694885,
   'tDuration': 0.4500000000000002,
   'tString': 'favorite',
   'tTime': 5.88},
  'M'),
 ({'tConfidence': 0.6890000104904175,
   'tDuration': 0.2999999999999998,
   'tString': 'thing',
   'tTime':

In [16]:
def save_match_result(file_list):
    """Write one alignment text file per plist into the ``Match`` folder.

    For every path in ``file_list`` the plist is read, its reference passage is
    tokenized, both are aligned with ``lcss`` and the rendered string from
    ``get_string_with_status`` is written to ``Match/{file_name}.txt``. An
    existing file with that name is overwritten. A file whose processing
    raises ``KeyError`` is skipped and nothing is written for it.
    """
    import os  # local import: the notebook's import cell does not bring in ``os``

    # The output folder may not exist on a fresh run; without this, open()
    # fails with FileNotFoundError.
    os.makedirs('Match', exist_ok=True)
    for file_path in file_list:
        try:
            data, file_name = read_plist(file_path)
            tokenized_text = read_and_tokenize_file(file_name)
            # Render before opening the file so a skipped item cannot leave
            # an empty output file behind.
            rendered = get_string_with_status(lcss(tokenized_text, data))
        except KeyError:
            continue
        # Explicit encoding keeps the output independent of the platform default.
        with open(f'Match/{file_name}.txt', 'w', encoding='utf-8') as f:
            f.write(rendered)

In [17]:
# Write the per-file alignment text files for every plist found in Result/.
save_match_result(file_list)

In [18]:
def save_result_csv(file_list):
    """Collect the alignment string of every plist into one gzip-compressed CSV.

    Each path in ``file_list`` is read, aligned against its reference passage
    with ``lcss`` and rendered with ``get_string_with_status``. A file whose
    processing raises ``KeyError`` is skipped. The rows are sorted by
    ``file_name`` and written to ``result.csv.gz`` (columns ``file_name`` and
    ``result``, no index), replacing any existing file.
    """
    rows = []
    for plist_path in file_list:
        try:
            student_records, name = read_plist(plist_path)
            reference_tokens = read_and_tokenize_file(name)
            alignment = lcss(reference_tokens, student_records)
            rows.append((name, get_string_with_status(alignment)))
        except KeyError:
            continue
    table = pd.DataFrame(rows, columns=['file_name', 'result']).sort_values(by=['file_name'])
    table.to_csv('result.csv.gz', index=False, compression='gzip')

In [19]:
# Write the combined, sorted alignment table to result.csv.gz.
save_result_csv(file_list)