In [2]:
import os
import pickle
import codecs
import spacy
from pattern.en import conjugate
coref_nlp = spacy.load('en_coref_md')
nlp = spacy.load('en_core_web_sm')

In [30]:
def coreference(text):
    pronoun = ['we all','i', 'you', 'we', 'they', 'he', 'she', 'it', 'me', 'your', 'his', 'her', 'him', 'myself', 'our', 'himself', 'ourselves','this guy']
    pronoun.sort(key=len)
    doc = coref_nlp(text)
    if doc._.has_coref:
        for coref in doc._.coref_clusters:
            main = None
            if str(coref.main).lower() in pronoun:
                for mention in coref.mentions:
                    if not str(mention).lower() in pronoun:
                        main = str(mention)
                if main == None:
                    continue
            for mention in coref.mentions:
                text = text.replace(f" {str(mention)} ",f" {str(main)}")
    return text

In [31]:
def transform_verb(token):
    tag = token.tag_
    res = token.text
    if tag == 'VBP' or tag == 'VB':
        res = conjugate(res, '3sg')
    return res

In [32]:
def transform_pronoun(speaker, text):
    if text == "You're welcome ." or text == 'Thank you .' or text == 'The pleasure is mine .':
        return text
    speakers = ["Mr.One", "Mr.Two"]
    you = speakers[(speakers.index(speaker) + 1) % 2]
    text = text.replace('don ’ t', 'do not')
    text = text.replace('your ', f"{you}'s ")
    text = text.replace('Your ', f"{you}'s ")
    text = text.replace(" ’ s ", f"'s ")
    text = text.replace(" ’ Ve ", f" have ")
    text = text.replace(" ’ re ", f" are ")
    doc = nlp(text)
    verb = None
    text_list = []
    for i in range(len(doc)):
        if verb != None:
            verb = None
            continue
        token = doc[i]
        word = str(token.text)
        index = token.i
        if word == "n't":
            word = 'not'
        if word == "'ll":
            word = 'will'
        if word == "'s" and token.pos_ == 'VERB':
            word = 'is'
        if word == "'d":
            word = token.lemma_
        if str(token.pos_) == 'PRON':
            if word.lower() == 'i' or word.lower() == 'me':
                word = f'{speaker}'
            elif word.lower() in ['we', 'us', 'our']:
                    word = f"Mr.A and Mr.B"
            elif word.lower() == 'you':
                is_you_trans = True
                try:
                    is_you_trans = not(doc[index-1].text.lower() == 'thank')
                except:
                    pass
                if is_you_trans:
                    word = f'{you}'
            elif word.lower() == 'my':
                word = f"{speaker}'s"
            elif word.lower() == 'myself':
                word = f'by {speaker}'
            elif word.lower() == 'yourself':
                word = f'by {you}'
            elif word.lower() == 'your':
                word = f"{you}'s"
            if str(token.dep_) == 'nsubj' and str(token.text.lower()) != 'we':
                if index != 0:
                    if doc[index-1].tag_ in ['VBP','VB']:
                        if not ' ' in text_list[-1]:
                            f_verb = transform_verb(doc[index-1])
                            text_list = text_list[:-1]
                            word = f_verb + ' ' + word
#                             print('before',text_list + [word])
                if index != len(doc) - 1:
                    is_question = False
                    if index != 0:
                        is_question = (str(doc[index-1].tag_)== 'MD')
                        is_question = is_question or (str(doc[index-1].text).lower() in ['does', 'is'])
                    if not is_question:
                        if doc[index+1].tag_ in ['VBP','VB']:
                            verb = transform_verb(doc[index+1])
                            word = word + ' ' + verb
                            

        text_list.append(str(word))
    res = " ".join(text_list)
    return res

In [43]:
def get_dialogues(lines):
    turn = 1
    dialogues = []
    texts = ""
    original_lines = lines
    for line in lines:
        line = line.strip()
        text = line[line.index(':')+2:].strip() 
        texts += text + '\n'
    coref_texts = coreference(texts)
    lines = coref_texts.strip().split('\n')
    for i in range(len(lines)):
        line = original_lines[i]
#         print(lines[i])
        speaker = line[:1]
        if speaker == 'A':
            speaker = 'Mr.One'
        if speaker == 'B':
            speaker = 'Mr.Two'
        open_paren_index = line.index('(')
        close_paren_index = line.index(')')
        act = line[open_paren_index+1:close_paren_index]
        text = transform_pronoun(speaker, lines[i])
        doc = nlp(text)
        tokens = [t.text for t in doc]
        pos = [t.pos_ for t in doc]
        tags = [t.tag_ for t in doc]
        dialogue = {
            'speaker': speaker,
            'turn': turn,
            'act': act,
            'text': tokens,
            'tags': tags,
            'pos':pos
        }
        dialogues.append(dialogue)
        turn += 1
    return dialogues

In [26]:
def readfile(in_filename, inpath):
    in_filename = os.path.abspath(f"{inpath}\\{in_filename}")
    with codecs.open(in_filename, "r",'utf8') as infile:
        lines = infile.readlines()
    return lines

In [27]:
def writefile(out_filename, data, outpath):
    in_filename = os.path.abspath(f"{outpath}\\{out_filename}")
    with open(out_filename, 'wb') as outfile:
        pickle.dump(data,outfile)

In [46]:
def main(filenames=None):
    inpath = 'corpus'
    outpath = 'pos_token2'
    if filenames == None:
        filenames = [f for f in os.listdir(inpath) if os.path.isfile(os.path.join(inpath, f))]
#     print(filenames[0])
    counter = 1
    for in_filename in filenames:
        out_filename = f'{outpath}/{in_filename[:-4]}.pickle'
        lines = readfile(in_filename, inpath)
        dialogues = get_dialogues(lines)
#         print(dialogues)
        writefile(out_filename, dialogues, outpath)
        if counter % 100 == 0:
            print(f'{counter}/{len(filenames)}')
        counter += 1
#         print(out_filename)

In [55]:
out_filename= 'pos_token2/1.pickle'
with open(out_filename, 'rb') as outfile:
    data = pickle.load(outfile)
print(data)

[{'speaker': 'Mr.One', 'turn': 1, 'act': 'directive', 'text': ['So', 'None', ',', 'how', 'about', 'getting', 'some', 'coffee', 'for', 'tonight', '?'], 'tags': ['RB', 'NN', ',', 'WRB', 'IN', 'VBG', 'DT', 'NN', 'IN', 'NN', '.'], 'pos': ['ADV', 'NOUN', 'PUNCT', 'ADV', 'ADP', 'VERB', 'DET', 'NOUN', 'ADP', 'NOUN', 'PUNCT']}, {'speaker': 'Mr.Two', 'turn': 2, 'act': 'commissive', 'text': ['Coffee', '?', 'Mr', '.', 'Two', 'does', 'not', 'honestly', 'like', 'that', 'kind', 'of', 'stuff', '.'], 'tags': ['NN', '.', 'NNP', '.', 'CD', 'VBZ', 'RB', 'RB', 'VB', 'DT', 'NN', 'IN', 'NN', '.'], 'pos': ['NOUN', 'PUNCT', 'PROPN', 'PUNCT', 'NUM', 'VERB', 'ADV', 'ADV', 'VERB', 'DET', 'NOUN', 'ADP', 'NOUN', 'PUNCT']}, {'speaker': 'Mr.One', 'turn': 3, 'act': 'directive', 'text': ['Come', 'on', ',', 'Mr', '.', 'Two', 'can', 'at', 'least', 'try', 'a', 'little', ',', 'besides', 'Mr', '.', 'Two', "'s", 'cigarette', '.'], 'tags': ['VB', 'RP', ',', 'NNP', '.', 'CD', 'MD', 'IN', 'JJS', 'VB', 'DT', 'JJ', ',', 'IN', 'N

In [47]:
main()

100/13118
200/13118
300/13118
400/13118
500/13118
600/13118
700/13118
800/13118
900/13118
1000/13118
1100/13118
1200/13118
1300/13118
1400/13118
1500/13118
1600/13118
1700/13118
1800/13118
1900/13118
2000/13118
2100/13118
2200/13118
2300/13118
2400/13118
2500/13118
2600/13118
2700/13118
2800/13118
2900/13118
3000/13118
3100/13118
3200/13118
3300/13118
3400/13118
3500/13118
3600/13118
3700/13118
3800/13118
3900/13118
4000/13118
4100/13118
4200/13118
4300/13118
4400/13118
4500/13118
4600/13118
4700/13118
4800/13118
4900/13118
5000/13118
5100/13118
5200/13118
5300/13118
5400/13118
5500/13118
5600/13118
5700/13118
5800/13118
5900/13118
6000/13118
6100/13118
6200/13118
6300/13118
6400/13118
6500/13118
6600/13118
6700/13118
6800/13118
6900/13118
7000/13118
7100/13118
7200/13118
7300/13118
7400/13118
7500/13118
7600/13118
7700/13118
7800/13118
7900/13118
8000/13118
8100/13118
8200/13118
8300/13118
8400/13118
8500/13118
8600/13118
8700/13118
8800/13118
8900/13118
9000/13118
9100/13118
9200/131

In [49]:
main(['13118.txt'])

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\ku\\4-1\\nlp\\summarization\\corpus\\13118.txt'