# mor-annotation to construction types

Intended functionality:
- read dataframe with tidied CHILDES data
- determine construction types on basis of morphological annotation (mor-tier)
- add first one/two/three lemmas of every utterance for later analysis of lexical specificity

In [None]:
import os, os.path, re
import pandas as pd 

# Load transcription df

In [None]:
# Load 'manual-anno.csv' as a tab-separated table, using the first column as
# the index; na_filter=False keeps empty cells as "" instead of NaN.
df = pd.read_csv('manual-anno.csv', sep="\t", index_col=0, na_filter = False)

In [None]:
# Bare expression: shows the loaded frame when run as a notebook cell.
df

# Construction types

[Cameron-Faulkner et al. (2003)](https://www.eva.mpg.de/documents/Wiley-Blackwell/Cameron-Faulkner_Construction_CogScience_2003_1555820.pdf) define the following construction categories:
- fragments (utterances without subject and predicate)
    - x(fow) one word 
    - x(fnp) noun phrase
    - x(fvp) verb phrase
    - x(fpp) prepositional phrase
    - (fom) other multi-word
- questions
    - x(qwh) wh-questions
    - x(qyn) yes/no-questions
- (imp) imperatives
- x(cop) copula
- subject-predicate
    - x(spt) transitive
    - (spi) intransitive
    - (spo) other
- x(com) complex (two lexical verbs)

### Construction finder

Based on the debugger (`find_cxn_debug`) and its `match_*` sub-functions, applied in a fixed priority order.

In [None]:
def find_cxn(mor_line, clean_line):
    """Assign a single construction-type label to one utterance.

    The matchers are tried in a fixed priority order; the first one that
    fires decides the label.

    Args:
        mor_line: mor-tier string of the utterance, made of ``pos|...``
            tokens separated by whitespace or ``~``.
        clean_line: cleaned utterance text; only used by the
            response-particle and formulaic-interjection checks.

    Returns:
        The label of the first matching construction type, or ``"NA"`` when
        nothing matches (including an empty mor tier).
    """
    # After this rename, substring searches for "v|" in the matchers no
    # longer hit adverb tokens.
    mor_line = mor_line.replace("adv|", "adve|")
    # "~" is treated as a token separator; keep only the part before "|".
    clean_pos_list = [
        token.partition('|')[0] for token in mor_line.replace("~", " ").split()
    ]

    if match_frp(clean_line):
        return "frp"
    if match_ffi(clean_line):
        return "ffi"
    if match_fow(mor_line, clean_pos_list):
        return "fow"

    # endswith() rather than mor_line[-1]: an empty mor tier must fall
    # through to "NA" instead of raising IndexError.
    if mor_line.endswith("?"):
        if match_qwh(mor_line, clean_pos_list):
            return "qwh"
        return match_qyn(mor_line, clean_pos_list)

    # Remaining matchers in priority order, paired with the label to return.
    ordered_matchers = (
        (match_cop, "cop"),
        (match_com2, "com"),
        (match_spt, "spt"),
        (match_spi, "spi"),
        (match_imp, "imp"),
        (match_fvp, "fvp"),
        (match_fnp, "fnp"),
        (match_fpp, "fpp"),
        (match_fom, "fom"),
    )
    for matcher, label in ordered_matchers:
        if matcher(mor_line, clean_pos_list):
            return label
    return "NA"

### Construction debugger regex:

In [None]:
def find_cxn_debug(mor_line, clean_line):
    """Run every matcher on one utterance and collect all results.

    Unlike ``find_cxn`` no priority is applied: each matcher is called once
    and its return value (a label or ``None``) is appended in call order.

    Intended heuristics behind the matchers:
      * utterance ends in "?" -> question
          - begins with interrogative pronoun (optional preposition) -> qwh
          - begins with aux/mod/cop -> qyn
      * subject, no main verb, but a copula -> cop
      * begins with a main verb and has no subject in front -> imp
      * two or more lexical verbs with something subject-like -> com
      * one lexical verb with something subject-like -> subject-predicate
          - something object-like after the verb -> spt
          - nothing object-like after the verb -> spi
      * only one word -> fow
      * something noun-like without predicate -> fnp
      * something verb-like without subject -> fvp
      * nothing verb-like and utterance begins with preposition -> fpp
      * multi-word, but odd repetition etc. -> fom
      * empty mor tier -> NA

    Args:
        mor_line: mor-tier string of the utterance.
        clean_line: cleaned utterance text.

    Returns:
        List with one entry per matcher, in the order they are called.
    """
    mor_line = mor_line.replace("adv|", "adve|")
    clean_pos_list = [
        token.partition('|')[0] for token in mor_line.replace("~", " ").split()
    ]

    # Matchers that look at the cleaned text only.
    text_matchers = (match_frp, match_ffi)
    # Matchers that look at the mor tier and its POS tags.
    mor_matchers = (
        match_fow, match_fnp, match_fvp, match_fpp, match_fom,
        match_qwh, match_qyn, match_imp, match_cop,
        match_spt, match_spi, match_spo, match_com, match_NA,
    )

    candidates = [matcher(clean_line) for matcher in text_matchers]
    for matcher in mor_matchers:
        candidates.append(matcher(mor_line, clean_pos_list))
    return candidates

In [None]:
response_particles = ["yes","no","yeah","yup","yip","nah"]
punc = [".","?","!"]
def match_frp(clean_line):
    """Return "frp" if the utterance is just a response particle.

    Each mark in ``punc`` that occurs in the text is stripped from both
    ends (followed by whitespace), in list order, before the comparison
    with ``response_particles``. Returns ``None`` otherwise.
    """
    text = clean_line
    for mark in punc:
        if mark not in text:
            continue
        text = text.strip(mark).strip()
    return "frp" if text in response_particles else None

In [None]:
formulaic_interjections = ["ok", "okay", "okey dokey", "okey", "please", "thank you", "thanks", "hello", "hi", "hiya", "goodbye", "good bye", "bye", "byebye", "bye-bye", "bye bye"]
punc = [".","?","!"]
def match_ffi(clean_line):
    """Return "ffi" if the utterance is just a formulaic interjection.

    Each mark in ``punc`` that occurs in the text is stripped from both
    ends (followed by whitespace), in list order, before the comparison
    with ``formulaic_interjections``. Returns ``None`` otherwise.
    """
    text = clean_line
    for mark in punc:
        if mark not in text:
            continue
        text = text.strip(mark).strip()
    return "ffi" if text in formulaic_interjections else None

In [None]:
def match_fow(line, clean_pos_list):
    """Return "fow" when the mor line contains exactly one "|", else None."""
    pipe_total = line.count("|")
    return "fow" if pipe_total == 1 else None

In [None]:
def match_fnp(line, clean_pos_list):
    """Return "fnp" if the mor line contains any noun-like tag substring.

    The check is a plain substring search, so e.g. "n|" is also found
    inside longer tags that end in "n|". Returns ``None`` otherwise.
    """
    nominal_markers = ("n|", "n:prop|", "pro:per|", "pro:indef|", "n:let", "on|")
    if any(marker in line for marker in nominal_markers):
        return "fnp"
    return None

In [None]:
def match_fvp(line, clean_pos_list):
    """Return "fvp" if the mor line contains a verb-like tag, else None.

    Occurrences of "adv|" are subtracted because each of them is also
    counted by the "v|" substring search.
    """
    verbal_markers = ("v|", "aux|", "cop|", "mod|")
    verbal_hits = sum(line.count(marker) for marker in verbal_markers)
    adverb_hits = line.count("adv|")
    if verbal_hits - adverb_hits > 0:
        return "fvp"
    return None

In [None]:
def match_fpp(line, clean_pos_list):
    """Return "fpp" if the mor line contains a "prep|" tag, else None."""
    return "fpp" if "prep|" in line else None

In [None]:
def match_fom(line, clean_pos_list):
    """Return "fom" if the utterance has more than one POS token, else None."""
    token_total = len(clean_pos_list)
    return "fom" if token_total > 1 else None

In [None]:
def match_qwh(line, clean_pos_list):
    """Return "qwh" for utterances that look like wh-questions.

    The mor line must contain an interrogative/relative word or conjunction
    (optionally preceded by "co" tokens), followed by any number of
    n/adj/pro:per tokens and then a mod/aux/cop/v token. In addition the
    first POS tag of the utterance must not be mod, aux or cop.

    Args:
        line: mor-tier string of the utterance.
        clean_pos_list: POS tags of the utterance, in order.

    Returns:
        ``"qwh"`` on a match, otherwise ``None``.
    """
    # Raw string: the pattern is unchanged, but "\|", "\w" and "\~" are
    # invalid escape sequences in a normal string literal and trigger a
    # SyntaxWarning on recent Python versions.
    wh_pattern = r"((co)(\|\w*|\|\w*)( |\~))*(pro:int|det:int|pro:rel|conj)(\|\w*|\|\w*)( |\~)((n|adj|pro:per)(\|\w*|\|\w*)( |\~))*(mod|aux|cop|v)(\|\w*|\|\w*)"
    if re.search(wh_pattern, line) is None:
        return None
    if clean_pos_list[0] in ("mod", "aux", "cop"):
        return None
    return "qwh"

In [None]:
def match_qyn(line, clean_pos_list):
    """Classify a question as yes/no ("qyn") or wh-question ("qwh").

    Only the first POS tag is inspected: an interrogative or relative word
    in first position gives "qwh", anything else gives "qyn".

    Args:
        line: mor-tier string of the utterance (not used).
        clean_pos_list: POS tags of the utterance, in order.

    Returns:
        ``"qwh"`` or ``"qyn"``; ``None`` when there are no POS tags at all.
    """
    # An empty mor tier has no first tag; indexing it would raise IndexError.
    if not clean_pos_list:
        return None
    if clean_pos_list[0] in ("pro:int", "det:int", "pro:rel"):
        return "qwh"
    return "qyn"

In [None]:
def match_imp(line, clean_pos_list):
    """Return "imp" for utterances that look like imperatives.

    The mor line must contain a v/mod token (optionally with "&" suffixes),
    followed by any number of "qn" tokens and then a token starting with
    adve, pro:, prep, det or dia.

    Args:
        line: mor-tier string of the utterance.
        clean_pos_list: POS tags of the utterance (not used).

    Returns:
        ``"imp"`` on a match, otherwise ``None``.
    """
    # Raw string: the pattern is unchanged, but its backslash escapes are
    # invalid in a normal string literal and trigger a SyntaxWarning on
    # recent Python versions.
    imp_pattern = r"(v|mod)(\|\w*|\|\w*)(\&\w*)*( |\~)((qn)(\|\w*|\|\w*)( |\~))*(adve|pro:\d*|prep|det|dia)"
    if re.search(imp_pattern, line):
        return "imp"
    return None

In [None]:
def match_cop(line, clean_pos_list):
    """Return "cop" for copula utterances.

    Fires when the mor line has at least one "cop|be", no "v|" hit beyond
    those that belong to "adv|", and does not start with "cop|".
    Returns ``None`` otherwise.
    """
    # "adv|" contains "v|", so subtract adverbs from the raw verb count.
    verb_hits = line.count("v|") - line.count("adv|")
    copula_hits = line.count("cop|be")
    if copula_hits < 1 or verb_hits >= 1:
        return None
    if line.startswith("cop|"):
        return None
    return "cop"

In [None]:
def match_spt(line, clean_pos_list):
    """Return "spt" for transitive subject-predicate utterances.

    Works on the space-joined POS tags: a subject-like tag, later a
    v/mod/aux/co tag, and later still an object-like tag. Utterances whose
    mor line starts with "v|" are excluded. Returns ``None`` otherwise.
    """
    if line.startswith("v|"):
        return None
    pos_sequence = ' '.join(clean_pos_list)
    transitive_pattern = "(pro:per|pro:sub|n|pro:int|n:prop|det:dem) (.)*(v|mod|aux|co) (.)*(pro:per|pro:obj|n |pro:dem|dia|pro:sub|n:prop|pro:indef|pro:refl)"
    if re.search(transitive_pattern, pos_sequence) is not None:
        return "spt"
    return None

In [None]:
def match_spi(line, clean_pos_list):
    """Return "spi" for intransitive subject-predicate utterances.

    The mor line must contain a subject-like token directly followed by a
    v/cop/mod/aux token, which in turn is followed by a space or "~".
    Utterances whose mor line starts with "v|" are excluded.

    Args:
        line: mor-tier string of the utterance.
        clean_pos_list: POS tags of the utterance (not used).

    Returns:
        ``"spi"`` on a match, otherwise ``None``.
    """
    if line.startswith("v|"):
        return None
    # Raw string: the pattern is unchanged, but its backslash escapes are
    # invalid in a normal string literal and trigger a SyntaxWarning on
    # recent Python versions.
    spi_pattern = r"(pro:per|pro:sub|n|pro:int|n:prop|det:dem)(\|\w*|\|\w*)(\&\w*)*( |\~)(v|cop|mod|aux)(\|\w*|\|\w*)(\&\w*)*( |\~)"
    if re.search(spi_pattern, line):
        return "spi"
    return None

In [None]:
def match_spo(line, clean_pos_list):
    """Placeholder matcher: always returns "spo", whatever the input."""
    return "spo"

In [None]:
def match_com(line, clean_pos_list):
    """Return "com" if the mor line has more than one verb-like token.

    Verb-like tokens are "v|" hits (minus the "adv|" hits that the "v|"
    search also counts) plus "part|" hits. Returns ``None`` otherwise.
    """
    verb_like = line.count("v|") - line.count("adv|") + line.count("part|")
    return "com" if verb_like > 1 else None

In [None]:
def match_com2(line, clean_pos_list):
    """Return "com" for complex utterances.

    Three checks are applied in order; the first that holds gives "com":
      1. more than one (pronoun/noun-like token + v/mod/aux token) sequence,
      2. more than two verb-like tokens ("v|" minus "adv|" plus "part|"),
      3. an "if"-like token directly followed by a pronoun tag.

    Args:
        line: mor-tier string of the utterance.
        clean_pos_list: POS tags of the utterance (not used).

    Returns:
        ``"com"`` if any check holds, otherwise ``None``.
    """
    # Raw strings: both patterns are unchanged, but their backslash escapes
    # are invalid in normal string literals and trigger a SyntaxWarning on
    # recent Python versions.
    clause_pattern = r"(pro:per|pro:obj|n |pro:dem|dia|pro:sub|n:prop|pro:indef|pro:refl)(\|\w*|\|\w*)(\&\w*)*( |\~)(v|mod|aux)(\|\w*|\|\w*)(\&\w*)*( |\~)"
    conditional_pattern = r"(conj\|if|comp|if) (pro:dem|pro:per|pro:sub)"

    if len(re.findall(clause_pattern, line)) > 1:
        return "com"

    # "adv|" contains "v|", so subtract adverbs from the raw verb count.
    verb_like = line.count("v|") - line.count("adv|") + line.count("part|")
    if verb_like > 2:
        return "com"

    if re.search(conditional_pattern, line):
        return "com"
    return None

In [None]:
def match_NA(line, clean_pos_list):
    """Fallback matcher: always returns "NA", whatever the input."""
    return "NA"

## Add CXNs

In [None]:
def add_cxns(dataframe):
    """Compute a construction label for every row of ``dataframe``.

    Rows whose 'speaker' is 'CHI' get "NA"; every other row is labelled by
    ``find_cxn`` from its 'mor_utterance' and 'clean_utterance' columns.

    Returns:
        List of labels, one per row, in row order.
    """
    return [
        find_cxn(row['mor_utterance'], row['clean_utterance'])
        if row['speaker'] != 'CHI'
        else "NA"
        for _, row in dataframe.iterrows()
    ]

### Test rules:

In [None]:
def eval_cxns(dataframe):
    """Compare parsed construction labels with the manual annotation.

    Every row whose 'speaker' is not 'CHI' is parsed with ``find_cxn`` and
    compared with its 'cxn_manual' value. Mismatches that have a non-empty
    manual label are printed, followed by a summary and the match rate.

    Args:
        dataframe: frame with the columns 'speaker', 'cxn_manual',
            'mor_utterance' and 'clean_utterance'.

    Returns:
        List of parsed labels, one per row ("NA" for 'CHI' rows).
    """
    count = 0
    right = 0
    wrong = 0
    parses = []
    for _, row in dataframe.iterrows():
        if row['speaker'] == 'CHI':
            parses.append("NA")
            continue
        count += 1
        anno = row['cxn_manual']
        parse = find_cxn(row['mor_utterance'], row['clean_utterance'])
        parses.append(parse)
        if anno == parse:
            right += 1
            continue
        wrong += 1
        # Only report rows that actually carry a manual label.
        if anno and parse is not None:
            print(row['clean_utterance'])
            print("Manual anno: " + anno)
            print(parse)
    print("For "+str(count)+" sentences, "+str(right)+ " were parsed correctly, and "+str(wrong)+" not.")
    if count:
        rate = 100 / count * right
        print("Match rate: "+str(rate))
    else:
        # No evaluated rows: avoid dividing by zero and still return parses.
        print("Match rate: n/a (no non-CHI utterances)")
    return parses

In [None]:
# Evaluate the parser on df: prints mismatches and the match rate, and keeps
# the per-row labels returned by eval_cxns.
my_parses = eval_cxns(df)

In [None]:
# Store the parsed labels as a new column (eval_cxns returns one per row).
df["cxn_parsed"] = my_parses

In [None]:
# Write the frame, including the parsed labels, as tab-separated UTF-8.
df.to_csv("parse_anno.csv", sep='\t', encoding='utf-8')

In [None]:
def eval_cxns_debug(dataframe):
    """Check whether the manual label is among the debug candidates.

    Every row whose 'speaker' is not 'CHI' is passed to ``find_cxn_debug``;
    the row counts as correct when its 'cxn_manual' value occurs in the
    returned candidate list. Misses that have a non-empty manual label are
    printed, followed by a summary and the match rate.

    Args:
        dataframe: frame with the columns 'speaker', 'cxn_manual',
            'mor_utterance' and 'clean_utterance'.

    Returns:
        None; results are only printed.
    """
    count = 0
    right = 0
    wrong = 0
    for _, row in dataframe.iterrows():
        if row['speaker'] == 'CHI':
            continue
        count += 1
        anno = row['cxn_manual']
        parse = find_cxn_debug(row['mor_utterance'], row['clean_utterance'])
        if anno in parse:
            right += 1
            continue
        wrong += 1
        # Only report rows that actually carry a manual label.
        if anno and parse is not None:
            print(row['clean_utterance'])
            print("Manual anno: " + anno)
            print(parse)
    print("For "+str(count)+" sentences, "+str(right)+ " were parsed correctly, and "+str(wrong)+" not.")
    if count:
        rate = 100 / count * right
        print("Match rate: "+str(rate))
    else:
        # No evaluated rows: avoid dividing by zero.
        print("Match rate: n/a (no non-CHI utterances)")

In [None]:
# Report how often the manual label is among the candidates of the debugger.
eval_cxns_debug(df)

# Full parsing run

In [None]:
# Collect the paths of all .csv files below the conversion folder.
input_root = "C:/Users/User/Desktop/converted_temp/"
file_list = []
for dirpath, dirnames, filenames in os.walk(input_root):
    if dirpath == input_root:
        # The annotated results are written to "annotated/" inside this same
        # folder; prune it in place so a re-run does not pick up (and
        # re-annotate) earlier output files.
        dirnames[:] = [d for d in dirnames if d != "annotated"]
    file_list.extend(
        os.path.join(dirpath, filename)
        for filename in filenames
        if filename.endswith(".csv")
    )

In [None]:
# Bare expression: shows the collected paths when run as a notebook cell.
file_list

In [None]:
# Annotate every collected file and write "<name>-annotated.csv" to the
# output folder.
output_dir = "C:/Users/User/Desktop/converted_temp/annotated/"
# to_csv does not create missing folders.
os.makedirs(output_dir, exist_ok=True)
for file in file_list:
    anno_dataframe = pd.read_csv(file, sep="\t", index_col=0, na_filter = False)
    parses = add_cxns(anno_dataframe)
    anno_dataframe["cxn_parsed"] = parses
    # basename() copes with the separator os.path.join inserted for files in
    # sub-folders (a backslash on Windows, which split("/") would miss), and
    # splitext() removes only the final extension, so names that contain
    # further dots are not truncated.
    file_name = os.path.basename(file)
    output_file_name = os.path.splitext(file_name)[0] + "-annotated.csv"
    anno_dataframe.to_csv(output_dir+output_file_name, sep='\t', encoding='utf-8')