In [1]:
import os
import pickle
import codecs
import spacy
from pattern.en import conjugate
# Pipeline used by coreference() to obtain coreference clusters
# (doc._.has_coref / doc._.coref_clusters).
# NOTE(review): 'en_coref_md' is presumably the neuralcoref-enabled English
# model -- confirm it is installed in the target environment.
coref_nlp = spacy.load('en_coref_md')
# General-purpose English pipeline; transform_pronoun() and get_dialogues()
# read token text, POS, tag, dependency label and lemma from it.
nlp = spacy.load('en_core_web_sm')

In [2]:
def coreference(text):
    """Replace coreferent mentions in ``text`` with one representative name.

    The text is run through the module-level ``coref_nlp`` pipeline. For each
    coreference cluster a representative string is chosen:

    * the cluster's main mention, when it is not one of the listed pronouns;
    * otherwise the last mention of the cluster that is not a listed pronoun;
    * if the cluster consists of listed pronouns only, it is skipped.

    Every other mention of the cluster is then substituted wherever it occurs
    in ``text`` surrounded by single spaces.

    Args:
        text: The text to resolve (str).

    Returns:
        str: ``text`` with the substitutions applied; unchanged when the
        pipeline reports no coreference.
    """
    pronouns = {'we all', 'i', 'you', 'we', 'they', 'he', 'she', 'it', 'me',
                'your', 'his', 'her', 'him', 'myself', 'our', 'himself',
                'ourselves', 'this guy'}
    doc = coref_nlp(text)
    if not doc._.has_coref:
        return text
    for cluster in doc._.coref_clusters:
        # Default to the cluster's own main mention. Previously ``main`` stayed
        # None in this case and the literal text "None" was written out.
        main = str(cluster.main)
        if main.lower() in pronouns:
            named = [str(m) for m in cluster.mentions
                     if str(m).lower() not in pronouns]
            if not named:
                # Pronoun-only cluster: nothing meaningful to substitute.
                continue
            main = named[-1]
        for mention in cluster.mentions:
            mention_text = str(mention)
            if mention_text == main:
                continue
            # Keep the trailing space so the following word is not glued on.
            # NOTE(review): str.replace is global, so an identical pronoun that
            # belongs to a different cluster is rewritten too.
            text = text.replace(f" {mention_text} ", f" {main} ")
    return text

In [3]:
def transform_verb(token):
    """Return the token's text, conjugated to '3sg' for VBP/VB tags.

    Args:
        token: An object exposing ``tag_`` and ``text``.

    Returns:
        The result of ``conjugate(token.text, '3sg')`` when ``token.tag_`` is
        'VBP' or 'VB'; otherwise ``token.text`` unchanged.
    """
    if token.tag_ in ('VBP', 'VB'):
        return conjugate(token.text, '3sg')
    return token.text

In [4]:
def transform_pronoun(speaker, text):
    """Rewrite first/second-person references in an utterance as speaker names.

    Three fixed courtesy phrases are returned untouched. Otherwise a few
    tokenised contractions are expanded, "your"/"Your" is replaced textually,
    the text is parsed with the module-level ``nlp`` pipeline, and each token
    tagged PRON is mapped as follows:

    * i / me             -> ``speaker``
    * we / us / our      -> both speaker names joined by " and "
    * you                -> the other speaker (unless preceded by "thank")
    * my / your          -> possessive form of the respective speaker
    * myself / yourself  -> "by <speaker>" / "by <other speaker>"

    When such a pronoun is an ``nsubj`` (and is not "we"), an adjacent VBP/VB
    verb is re-conjugated with ``transform_verb`` and glued to the subject.

    Args:
        speaker: "Mr.One" or "Mr.Two".
        text: The whitespace-tokenised utterance (str).

    Returns:
        str: The rewritten utterance, tokens joined by single spaces.

    Raises:
        ValueError: If ``speaker`` is not one of the two known speakers.
    """
    # Fixed courtesy phrases are passed through untouched.
    if text in ("You're welcome .", 'Thank you .', 'The pleasure is mine .'):
        return text
    speakers = ["Mr.One", "Mr.Two"]
    you = speakers[(speakers.index(speaker) + 1) % 2]
    # Derived from ``speakers`` so the plural form always names the same two
    # participants (it used to be the unrelated literal "Mr.A and Mr.B").
    both = " and ".join(speakers)

    # Expand contractions that arrive split around a typographic apostrophe.
    text = text.replace('don ’ t', 'do not')
    text = text.replace('your ', f"{you}'s ")
    text = text.replace('Your ', f"{you}'s ")
    text = text.replace(" ’ s ", "'s ")
    text = text.replace(" ’ Ve ", " have ")
    text = text.replace(" ’ re ", " are ")

    doc = nlp(text)
    skip_next = False
    text_list = []
    for token in doc:
        if skip_next:
            # This token is the verb already conjugated and appended to the
            # preceding subject; emitting it again would duplicate it.
            skip_next = False
            continue
        word = str(token.text)
        index = token.i
        if word == "n't":
            word = 'not'
        if word == "'ll":
            word = 'will'
        if word == "'s" and token.pos_ == 'VERB':
            word = 'is'
        if word == "'d":
            word = token.lemma_
        if str(token.pos_) == 'PRON':
            lowered = word.lower()
            if lowered in ('i', 'me'):
                word = f'{speaker}'
            elif lowered in ('we', 'us', 'our'):
                # NOTE(review): "our" gets no possessive suffix, unlike
                # "my"/"your" below -- confirm whether that is intended.
                word = both
            elif lowered == 'you':
                # Explicit bounds check: doc[-1] is the LAST token, so the old
                # try/except never fired and index 0 inspected the wrong token.
                follows_thank = index > 0 and doc[index - 1].text.lower() == 'thank'
                if not follows_thank:
                    word = f'{you}'
            elif lowered == 'my':
                word = f"{speaker}'s"
            elif lowered == 'myself':
                word = f'by {speaker}'
            elif lowered == 'yourself':
                word = f'by {you}'
            elif lowered == 'your':
                word = f"{you}'s"
            if str(token.dep_) == 'nsubj' and str(token.text.lower()) != 'we':
                if index != 0:
                    if doc[index - 1].tag_ in ['VBP', 'VB']:
                        # A space in the previous output item means it was
                        # already rewritten (name or subject+verb); leave it.
                        if ' ' not in text_list[-1]:
                            f_verb = transform_verb(doc[index - 1])
                            text_list = text_list[:-1]
                            word = f_verb + ' ' + word
                if index != len(doc) - 1:
                    is_question = False
                    if index != 0:
                        is_question = (str(doc[index - 1].tag_) == 'MD')
                        is_question = is_question or (str(doc[index - 1].text).lower() in ['does', 'is'])
                    if not is_question:
                        if doc[index + 1].tag_ in ['VBP', 'VB']:
                            verb = transform_verb(doc[index + 1])
                            word = word + ' ' + verb
                            skip_next = True

        text_list.append(str(word))
    return " ".join(text_list)

In [5]:
def get_dialogues(lines):
    """Turn raw dialogue lines into a list of per-turn annotation dicts.

    Each non-blank line is read as follows: its first character is the speaker
    initial ('A' -> "Mr.One", 'B' -> "Mr.Two"), the text between the first '('
    and the first ')' is the dialogue act, and the utterance starts two
    characters after the first ':'.

    All utterances are joined with newlines and passed through
    ``coreference`` in one call, then each resolved utterance is rewritten by
    ``transform_pronoun`` and tokenised with the module-level ``nlp`` pipeline.

    Args:
        lines: Iterable of raw dialogue lines (str).

    Returns:
        list[dict]: One dict per turn with keys 'speaker', 'turn' (1-based),
        'act', 'text' (token strings), 'tags' and 'pos'. Empty list when there
        are no non-blank lines.

    Raises:
        ValueError: If a non-blank line lacks ':', '(' or ')'.
    """
    # Blank lines carry no utterance; dropping them up front keeps the raw
    # lines and the resolved utterances aligned by position.
    original_lines = [line.strip() for line in lines if line.strip()]
    if not original_lines:
        return []

    texts = ""
    for line in original_lines:
        texts += line[line.index(':') + 2:].strip() + '\n'

    # Split without stripping the whole blob so an empty utterance keeps its
    # slot; the slice drops the empty item produced by the final newline.
    resolved = coreference(texts).split('\n')[:len(original_lines)]

    dialogues = []
    for turn, (line, resolved_text) in enumerate(zip(original_lines, resolved), start=1):
        speaker = line[:1]
        if speaker == 'A':
            speaker = 'Mr.One'
        elif speaker == 'B':
            speaker = 'Mr.Two'
        act = line[line.index('(') + 1:line.index(')')]
        text = transform_pronoun(speaker, resolved_text)
        doc = nlp(text)
        dialogues.append({
            'speaker': speaker,
            'turn': turn,
            'act': act,
            'text': [t.text for t in doc],
            'tags': [t.tag_ for t in doc],
            'pos': [t.pos_ for t in doc],
        })
    # The result was previously built but never returned, so callers got None.
    return dialogues

In [6]:
def readfile(in_filename, inpath):
    """Read a UTF-8 text file and return its lines.

    Args:
        in_filename: Name of the file inside ``inpath``.
        inpath: Directory that contains the file.

    Returns:
        list[str]: The file's lines as returned by ``readlines()`` (line
        endings are kept).
    """
    # os.path.join picks the right separator for the platform; the previous
    # hard-coded backslash only produced a valid path on Windows.
    path = os.path.abspath(os.path.join(inpath, in_filename))
    with codecs.open(path, "r", 'utf8') as infile:
        return infile.readlines()

In [7]:
def writefile(out_filename, data, outpath):
    """Pickle ``data`` to disk.

    Args:
        out_filename: Target file. If it already contains a directory part it
            is used as given; a bare file name is placed inside ``outpath``.
        data: Any picklable object.
        outpath: Directory used for bare file names.
    """
    # Callers in this notebook pass a name that already includes the output
    # folder, so a path with a directory component must be honoured verbatim.
    if os.path.dirname(out_filename):
        target = out_filename
    else:
        target = os.path.join(outpath, out_filename)
    parent = os.path.dirname(target)
    if parent:
        # Create the folder on demand instead of failing on the first write.
        os.makedirs(parent, exist_ok=True)
    with open(target, 'wb') as outfile:
        pickle.dump(data, outfile)

In [None]:
# Convert every file in the corpus folder into a pickled list of dialogue turns.
inpath = 'corpus'
outpath = 'pos_token2'
# Make sure the destination exists before the first write.
os.makedirs(outpath, exist_ok=True)
# Sorted so the processing order does not depend on the file system.
filenames = sorted(f for f in os.listdir(inpath) if os.path.isfile(os.path.join(inpath, f)))
for in_filename in filenames:
    # splitext removes the extension whatever its length (the old [:-4] slice
    # assumed exactly three characters after the dot).
    stem = os.path.splitext(in_filename)[0]
    out_filename = f'{outpath}/{stem}.pickle'
    lines = readfile(in_filename, inpath)
    dialogues = get_dialogues(lines)
    writefile(out_filename, dialogues, outpath)
    print(out_filename)

pos_token2/0.pickle
pos_token2/1.pickle
pos_token2/10.pickle
pos_token2/100.pickle
pos_token2/1000.pickle
pos_token2/10000.pickle
pos_token2/10001.pickle
pos_token2/10002.pickle
pos_token2/10003.pickle
pos_token2/10004.pickle
pos_token2/10005.pickle
pos_token2/10006.pickle
pos_token2/10007.pickle
pos_token2/10008.pickle
pos_token2/10009.pickle
pos_token2/1001.pickle
pos_token2/10010.pickle
pos_token2/10011.pickle
pos_token2/10012.pickle
pos_token2/10013.pickle
pos_token2/10014.pickle
pos_token2/10015.pickle
pos_token2/10016.pickle
pos_token2/10017.pickle
pos_token2/10018.pickle
pos_token2/10019.pickle
pos_token2/1002.pickle
pos_token2/10020.pickle
pos_token2/10021.pickle
pos_token2/10022.pickle
pos_token2/10023.pickle
pos_token2/10024.pickle
pos_token2/10025.pickle
pos_token2/10026.pickle
pos_token2/10027.pickle
pos_token2/10028.pickle
pos_token2/10029.pickle
pos_token2/1003.pickle
pos_token2/10030.pickle
pos_token2/10031.pickle
pos_token2/10032.pickle
pos_token2/10033.pickle
pos_token