In [1]:
import glob
from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple

import nltk
import pandas as pd

import sequence_matching

In [2]:


# Download the NLTK 'punkt' and 'averaged_perceptron_tagger' packages.
# NOTE(review): nothing in this notebook uses them directly; presumably they are
# needed by sequence_matching -- confirm.
# nltk.download reports "already up-to-date" when the package is present locally
# (see the log output of this cell).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/jackwang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jackwang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Build the reference-text lookup that is later handed to
# get_c_w_p_m_for_all_files; the CSV location is named so it is easy to change.
PASSAGE_CSV_PATH = "../DataFolder/Original_CSV/yrs12_passages.csv"
passage_dict = sequence_matching.create_passage_dict(PASSAGE_CSV_PATH)

In [4]:
# Collect every *.plist path under Result/.
# glob.glob returns matches in arbitrary (filesystem-dependent) order, so the
# list is sorted to keep the row order of the generated CSVs reproducible.
file_list = sorted(glob.glob('Result/*.plist'))
# Lazy iterator applying read_plist to each path; nothing is read until it is
# consumed.
file_generator = map(sequence_matching.read_plist, file_list)

In [5]:
def get_c_w_p_m(common: List[Tuple[Dict, str]]) -> Tuple[float, int, float]:
    """Compute correct words per minute from (word info, status) pairs.

    Only pairs whose status is 'M' are counted. Each counted word-info dict
    must provide numeric 'tTime' and 'tDuration' entries.
    NOTE(review): the division by 60 implies these are in seconds -- confirm
    against the plist producer.

    Args:
        common: list of (word_info, status) tuples.

    Returns:
        A tuple (time_diff, total_word, c_w_p_m) where
        * time_diff is the span in minutes from the earliest matched word's
          'tTime' to the end ('tTime' + 'tDuration') of the matched word with
          the latest 'tTime';
        * total_word is the number of matched words;
        * c_w_p_m is total_word / time_diff.
        (0.0, 0, 0.0) is returned when nothing matched. When the span is zero
        or negative the rate is undefined, so c_w_p_m is reported as 0.0
        instead of raising ZeroDivisionError.
    """
    # sorted() is stable, so ties on 'tTime' keep their input order.
    matched = sorted(
        (word for word, status in common if status == 'M'),
        key=lambda word: word['tTime'],
    )
    if not matched:
        return 0., 0, 0.

    first, last = matched[0], matched[-1]
    time_diff = (last['tTime'] + last['tDuration'] - first['tTime']) / 60
    total_word = len(matched)
    if time_diff <= 0:
        # Degenerate span (e.g. a single word with zero duration): no rate.
        return time_diff, total_word, 0.
    return time_diff, total_word, total_word / time_diff

In [6]:
def get_c_w_p_m_for_all_files(
    file_list: Iterable[str],
    passage_dict: Dict[str, str],
    prob: float = 0.1,
    stemmer: Optional["nltk.stem.api.StemmerI"] = None,
    path: str = 'c_w_p_m.csv',
) -> Tuple[pd.DataFrame, Set[str]]:
    """Score every plist file and save the per-file results as a CSV.

    For each path in ``file_list``:
      1. read the data and file name with ``sequence_matching.read_plist``;
      2. rewrite the 4th '_'-separated field of the file name as its integer
         value modulo 100000 (NOTE(review): presumably this maps the id in the
         file name onto the passage ids used by ``passage_dict`` -- confirm);
      3. tokenize the reference text via
         ``sequence_matching.read_and_tokenize_file``;
      4. match reference and student text with ``sequence_matching.lcss``;
      5. summarise the match with ``get_c_w_p_m``.

    A file is skipped, and recorded in the returned error set, when
      * a ``KeyError`` is raised while processing it (the missing key is
        recorded), or
      * its name has fewer than four '_'-separated fields or a non-integer
        fourth field (the file name is recorded). Previously this case raised
        IndexError/ValueError and aborted the whole batch before the CSV was
        written.

    Args:
        file_list: paths of the plist files to process.
        passage_dict: reference passages, forwarded to read_and_tokenize_file.
        prob: forwarded to ``sequence_matching.lcss``.
        stemmer: optional stemmer forwarded to ``sequence_matching.lcss``.
        path: destination of the CSV (written without the index).

    Returns:
        (df, errors) where ``df`` has the columns 'file', 'time_diff',
        'total_word' and 'c_w_p_m', and ``errors`` is the set described above.
    """
    rows = []
    errors_list = set()
    for file in file_list:
        try:
            data, filename = sequence_matching.read_plist(file)
            name_fields = filename.split('_')
            try:
                name_fields[3] = str(int(name_fields[3]) % 100000)
            except (IndexError, ValueError):
                # Malformed name: record it and keep processing the batch.
                errors_list.add(filename)
                continue
            cleaned_filename = '_'.join(name_fields)
            tokenized_text = sequence_matching.read_and_tokenize_file(cleaned_filename, passage_dict)
            common = sequence_matching.lcss(tokenized_text, data, prob=prob, stemmer=stemmer)
            time_diff, total_word, c_w_p_m = get_c_w_p_m(common)
            rows.append([cleaned_filename, time_diff, total_word, c_w_p_m])
        except KeyError as e:
            # A KeyError raised without arguments has no key to report; fall
            # back to the path being processed.
            errors_list.add(e.args[0] if e.args else file)
    df = pd.DataFrame(rows, columns=['file', 'time_diff', 'total_word', 'c_w_p_m'])
    df.to_csv(path, index=False)
    return df, errors_list



In [7]:
# Score all files twice with the same match probability: once with Lancaster
# stemming and once without, each run writing its own CSV.
lancaster_stemmer = nltk.stem.LancasterStemmer()
df, err1 = get_c_w_p_m_for_all_files(
    file_list,
    passage_dict,
    prob=0.1,
    stemmer=lancaster_stemmer,
    path='c_w_p_m_stemmed.csv',
)
df2, err2 = get_c_w_p_m_for_all_files(
    file_list,
    passage_dict,
    prob=0.1,
    stemmer=None,
    path='c_w_p_m.csv',
)

In [8]:
# Disabled debug snippet: run the matching pipeline on a single recording and
# inspect the matched words.
# data, fname= sequence_matching.read_plist('Result/student_1_passage_24000_56df1ab8df17d.wav.plist')
# tokenized_text = sequence_matching.read_and_tokenize_file(fname, passage_dict)
# result_tuple = sequence_matching.lcss(tokenized_text, data, prob = 0.1, stemmer = nltk.stem.LancasterStemmer())
# list_correct_word = [x for x, c in result_tuple if c == 'M']
# len(list_correct_word)
# get_c_w_p_m(result_tuple)
# list_correct_word
# len(tokenized_text)

In [9]:
# Keys collected from KeyErrors during the stemmed run; an empty set means no
# file was skipped for that reason.
print(err1)

set()


In [10]:
# Scratch check of str.join: the separator is placed between the list items.
'-'.join(['1', '2', '3'])

'1-2-3'