In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np


import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import time
import json

from preprocessing_dailydialogue import *

In [2]:
# Notebook-wide constants.

# Compute device: CUDA when torch reports it as available, otherwise the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda") if USE_CUDA else torch.device("cpu")

# Default word tokens: padding for short sentences, start-of-sentence, end-of-sentence.
PAD_token, SOS_token, EOS_token = 0, 1, 2

# Preprocessing limits.
MAX_LENGTH = 50  # Maximum sentence length to consider
MIN_COUNT = 1    # Minimum word count threshold for trimming

# Relative directory built by joining "data" and "save".
save_dir = os.path.join("data", "save")


In [3]:
# Input files: the dialogue text and the per-utterance emotion labels, both
# expected inside DATA_PATH (relative to the notebook's working directory).
DATA_PATH = '../data/ijcnlp_dailydialog'
corpus_name, emotions_file = 'dialogues_text.txt', 'dialogues_emotion.txt'
corpus, emotions = (
    os.path.join(DATA_PATH, file_name)
    for file_name in (corpus_name, emotions_file)
)

# loadPrepareData is not defined in this notebook; presumably it comes from the
# star import of preprocessing_dailydialogue. The cells below rely on:
#   voc.word2count   - mapping indexed by word
#   pairs            - (post, response) pairs of space-separated strings
#   pairs_emotion    - (post_label, response_label) pairs, compared against 0
voc, pairs, pairs_emotion = loadPrepareData(corpus, emotions)

Start preparing training data ...
Reading lines...
Read 89862 sentence pairs
Trimmed to 61642 sentence pairs
Counting words...
Counted words: 12548


In [4]:
# Count, per word, how often it occurs in utterances whose emotion label is
# non-zero, and tally labelled vs. unlabelled utterances along the way.
emotion_count = {}   # word -> occurrences inside utterances with label != 0
non_emotion = 0      # utterances whose label is 0
have_emotion = 0     # utterances whose label is not 0

# The label pair is unpacked directly into post_e/response_e so the loop no
# longer rebinds `emotions`, which holds the emotion-file path set earlier.
for (post, response), (post_e, response_e) in zip(pairs, pairs_emotion):
    # Post and response get identical treatment; post first keeps the
    # dictionary's insertion order the same as before.
    for utterance, label in ((post, post_e), (response, response_e)):
        if label != 0:
            have_emotion += 1
            for word in utterance.split(' '):
                emotion_count[word] = emotion_count.get(word, 0) + 1
        else:
            non_emotion += 1

print('No emotion output {}'.format(non_emotion))
print('Emotion output {}'.format(have_emotion))


No emotion output 98044
Emotion output 25240


In [6]:
# Replace each raw count with count / voc.word2count[word] (presumably the
# word's overall corpus frequency -- confirm against the preprocessing module).
# NOTE: emotion_count is rescaled in place, so this cell is not idempotent;
# re-run the counting cell before executing this one a second time.
for word, emotional_hits in list(emotion_count.items()):
    try:
        corpus_hits = voc.word2count[word]
    except KeyError:
        # No entry in voc.word2count: the raw count is left untouched.
        continue
    emotion_count[word] = emotional_hits / corpus_hits

In [9]:
# Selection parameters for the emotion word list.
min_count_emotion = 3   # a word's voc.word2count must be strictly greater than this
threshold = 0.25        # a word's emotion_count value must be strictly greater than this
ememory = []            # filled with the words that pass both checks

In [10]:
# Select the emotion vocabulary: keep a word when voc.word2count[word] exceeds
# min_count_emotion and its emotion_count value exceeds threshold.
# The list is rebuilt from scratch here so that re-running this cell cannot
# append the same words a second time.
ememory = []
for word, ratio in emotion_count.items():
    try:
        corpus_hits = voc.word2count[word]
    except KeyError:
        # Word has no entry in voc.word2count: skip it.
        continue
    if corpus_hits > min_count_emotion and ratio > threshold:
        ememory.append(word)
print('Total {} words'.format(len(ememory)))
        

Total 2136 words


In [15]:
# Display the number of words currently held in ememory.
len(ememory)

2136

In [16]:
# Write the selected words to ememory.txt in the current working directory,
# one word per line. The encoding is set explicitly so the bytes on disk do
# not depend on the platform's default locale.
with open('ememory.txt', 'w', encoding='utf-8') as f:
    f.writelines(word + '\n' for word in ememory)
    