In [1]:
## Preprocessing-데이터 전처리

In [2]:
import pandas as pd
import numpy as np
import re
import json 

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [3]:
# Directory that holds the input CSV files (later cells also write their outputs here).
DATA_IN_PATH = './data_in/'
# Regex character class listing the punctuation marks removed from every question.
FILTERS = "([~.,!?\"':;)(])"

# Read the training CSV.
train_csv_path = DATA_IN_PATH + 'train.csv'
train_data = pd.read_csv(train_csv_path, encoding='utf-8')

# Compile the punctuation filter once so later cells can reuse it.
change_filter = re.compile(FILTERS)

In [4]:
# Fixed seed so the down-sampling below selects the same rows on every run.
RANDOM_SEED = 42

# Split the training rows by label value (is_duplicate == 1 vs. is_duplicate == 0).
train_pos_data = train_data.loc[train_data['is_duplicate'] == 1]
train_neg_data = train_data.loc[train_data['is_duplicate'] == 0]

# Fraction of the negative rows to keep so both classes end up the same size;
# algebraically this equals len(train_pos_data) / len(train_neg_data).
class_difference = len(train_neg_data) - len(train_pos_data)
sample_frac = 1 - (class_difference / len(train_neg_data))

# Randomly down-sample the negative class; random_state makes the draw repeatable.
train_neg_data = train_neg_data.sample(frac=sample_frac, random_state=RANDOM_SEED)

In [5]:
# Report how many rows each class currently holds.
pos_count = len(train_pos_data)
neg_count = len(train_neg_data)
print("중복 질문 개수: {}".format(pos_count))
print("중복이 아닌 질문 개수: {}".format(neg_count))

중복 질문 개수: 149263
중복이 아닌 질문 개수: 149263


In [6]:
# Merge the two label-specific frames back into a single training frame.
train_data = pd.concat([train_neg_data, train_pos_data])
# Coerce every entry to str so non-string values (e.g. a NaN) cannot break the regex step later.
questions1 = list(map(str, train_data['question1']))
questions2 = list(map(str, train_data['question2']))
# Preview the first 42 raw questions.
questions1[:42]

['How do you cook canned kidney beans?',
 'How can I get the guy I like to like me again?',
 'How can 4.6x30 mm rounds have more Armor Piercing capability than 9x19 mm rounds?',
 'What\'s the difference in meaning between these two sentences, "a minute upon waking up, I can barely feel my feet," and "upon a minute of waking up, I can barely feel my feet"?',
 'Is wikibuy.com legit or a scam?',
 'What is the most pretentious thing you have ever heard someone say?',
 'How good is MIS course at ASU?',
 'What is the best thing someone ever did for you?',
 'What is the future of iot?',
 'Why do people sometimes call Obama POTUS?',
 "For an average workout I max out at 180-210 bpm depending on what I'm doing. Is this safe?",
 'Why alpha position is more active in naphthalene? As most groups attack at Alpha position?',
 'What are some of the finest works/roles of Indian veteran actor Om Puri?',
 'Does true love still exists or its just about sex?',
 'What is high powered money? How can it be u

In [7]:
# Clean both question lists: remove the punctuation matched by change_filter,
# then lowercase what is left. The compiled pattern's own .sub() is used
# directly instead of routing it through re.sub().
filtered_questions1 = [change_filter.sub("", q).lower() for q in questions1]
filtered_questions2 = [change_filter.sub("", q).lower() for q in questions2]

# Show one cleaned example (only the last expression of a cell is displayed).
filtered_questions2[0]

'how do i cook canned beans'

In [8]:
# Preview the first 42 cleaned questions.
filtered_questions1[:42]

['how do you cook canned kidney beans',
 'how can i get the guy i like to like me again',
 'how can 46x30 mm rounds have more armor piercing capability than 9x19 mm rounds',
 'whats the difference in meaning between these two sentences a minute upon waking up i can barely feel my feet and upon a minute of waking up i can barely feel my feet',
 'is wikibuycom legit or a scam',
 'what is the most pretentious thing you have ever heard someone say',
 'how good is mis course at asu',
 'what is the best thing someone ever did for you',
 'what is the future of iot',
 'why do people sometimes call obama potus',
 'for an average workout i max out at 180-210 bpm depending on what im doing is this safe',
 'why alpha position is more active in naphthalene as most groups attack at alpha position',
 'what are some of the finest works/roles of indian veteran actor om puri',
 'does true love still exists or its just about sex',
 'what is high powered money how can it be used as a regulatory mechanism 

In [9]:
# Pair each cleaned question list with the duplicate labels so the text can be stored as a table.
duplicate_labels = train_data['is_duplicate']
clean_train_q1_df = pd.DataFrame({'questions1': filtered_questions1, 'is_duplicate': duplicate_labels})
clean_train_q2_df = pd.DataFrame({'questions2': filtered_questions2, 'is_duplicate': duplicate_labels})


In [10]:
# Fit a single tokenizer on the cleaned text of both question lists combined.
all_filtered_questions = filtered_questions1 + filtered_questions2
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_filtered_questions)

In [11]:
# Run each question list through the fitted tokenizer separately.
questions1_sequence, questions2_sequence = (
    tokenizer.texts_to_sequences(texts)
    for texts in (filtered_questions1, filtered_questions2)
)

In [12]:
# Bring every sequence to one fixed length: sequences longer than the maximum
# are truncated, shorter ones are padded.
MAX_SEQUENCE_LENGTH = 31
pad_options = dict(maxlen=MAX_SEQUENCE_LENGTH, padding='post')
q1_data = pad_sequences(questions1_sequence, **pad_options)
q2_data = pad_sequences(questions2_sequence, **pad_options)

In [13]:

# Copy the tokenizer's word index before adding the padding entry, so the
# tokenizer's own word_index dict is not modified as a side effect.
word_vocab = dict(tokenizer.word_index)
word_vocab["<PAD>"] = 0  # register the padding token under index 0

# Labels as an integer NumPy array.
labels = np.array(train_data['is_duplicate'], dtype=int)

# Sanity-check the sizes of everything produced so far.
print('Shape of question1 data: {}'.format(q1_data.shape))
print('Shape of question2 data:{}'.format(q2_data.shape))
print('Shape of label: {}'.format(labels.shape))
print("Words in index: {}".format(len(word_vocab)))

Shape of question1 data: (298526, 31)
Shape of question2 data:(298526, 31)
Shape of label: (298526,)
Words in index: 76531


In [14]:
# Bundle the vocabulary and its entry count into a single dictionary.
data_configs = {
    'vocab': word_vocab,
    'vocab_size': len(word_vocab),
}

In [15]:
# File names for the preprocessed training artefacts (all written under DATA_IN_PATH).
TRAIN_Q1_DATA = 'train_q1.npy'
TRAIN_Q2_DATA = 'train_q2.npy'
TRAIN_LABEL_DATA = 'train_label.npy'

TRAIN_CLEAN_Q1_DATA = 'train_clean_q1.csv'
TRAIN_CLEAN_Q2_DATA = 'train_clean_q2.csv'

DATA_CONFIGS = 'data_configs.json'


# Save the padded question arrays and the labels in NumPy format. Passing the
# path (instead of an open() handle) lets NumPy open and close the file itself,
# so no file handle is left dangling.
np.save(DATA_IN_PATH + TRAIN_Q1_DATA, q1_data)
np.save(DATA_IN_PATH + TRAIN_Q2_DATA, q2_data)
np.save(DATA_IN_PATH + TRAIN_LABEL_DATA, labels)

# Save the cleaned text as CSV.
clean_train_q1_df.to_csv(DATA_IN_PATH + TRAIN_CLEAN_Q1_DATA, index = False)
clean_train_q2_df.to_csv(DATA_IN_PATH + TRAIN_CLEAN_Q2_DATA, index = False)

# Save the vocabulary configuration as JSON; the with-block guarantees the file
# is flushed and closed once the dump finishes.
with open(DATA_IN_PATH + DATA_CONFIGS, 'w') as config_file:
    json.dump(data_configs, config_file)

In [16]:
# Load the evaluation (test) CSV and preview its first rows.
test_csv_path = DATA_IN_PATH + 'test.csv'
test_data = pd.read_csv(test_csv_path, encoding='utf-8')
test_data.head()


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [17]:
# Keep only the rows whose test_id value is exactly a Python int, then drop
# fully duplicated rows.
valid_ids = [type(row_id) is int for row_id in test_data['test_id']]
test_data = test_data.loc[valid_ids].drop_duplicates()
test_data.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [18]:
# Preprocess the evaluation questions: cast to str, remove the punctuation
# matched by change_filter, and lowercase the result.
test_questions1 = list(map(str, test_data['question1']))
test_questions2 = list(map(str, test_data['question2']))

filtered_test_questions1 = [change_filter.sub("", text).lower() for text in test_questions1]
filtered_test_questions2 = [change_filter.sub("", text).lower() for text in test_questions2]

In [19]:
# Keep the cleaned test text next to its test_id as DataFrames (for TF-IDF use).
test_id_column = test_data['test_id']
clean_test_q1_df = pd.DataFrame({'questions1': filtered_test_questions1, 'test_id': test_id_column})
clean_test_q2_df = pd.DataFrame({'questions2': filtered_test_questions2, 'test_id': test_id_column})

In [20]:
# Tokenize the cleaned test questions with the already-fitted tokenizer ...
test_questions1_sequence, test_questions2_sequence = (
    tokenizer.texts_to_sequences(texts)
    for texts in (filtered_test_questions1, filtered_test_questions2)
)

# ... and pad them to MAX_SEQUENCE_LENGTH with the same settings used for the training data.
test_pad_options = dict(maxlen=MAX_SEQUENCE_LENGTH, padding='post')
test_q1_data = pad_sequences(test_questions1_sequence, **test_pad_options)
test_q2_data = pad_sequences(test_questions2_sequence, **test_pad_options)

In [21]:
# Collect the test ids into a NumPy array.
test_id = np.array(test_data['test_id'])

# Print the shapes of the three test arrays, one per line.
print(
    'Shape of question1 data: {}'.format(test_q1_data.shape),
    'Shape of question2 data: {}'.format(test_q2_data.shape),
    'Shape of ids: {}'.format(test_id.shape),
    sep='\n',
)

Shape of question1 data: (2345796, 31)
Shape of question2 data: (2345796, 31)
Shape of ids: (2345796,)


In [22]:
# File names for the preprocessed test artefacts (all written under DATA_IN_PATH).
TEST_Q1_DATA = 'test_q1.npy'
TEST_Q2_DATA = 'test_q2.npy'

TEST_CLEAN_Q1_DATA = 'test_clean_q1.csv'
TEST_CLEAN_Q2_DATA = 'test_clean_q2.csv'

TEST_ID_DATA = 'test_id.npy'

# Save the padded test arrays and the ids in NumPy format. Passing the path
# (instead of an open() handle) lets NumPy open and close the file itself, so
# no file handle is left dangling.
np.save(DATA_IN_PATH + TEST_Q1_DATA, test_q1_data)
np.save(DATA_IN_PATH + TEST_Q2_DATA, test_q2_data)
np.save(DATA_IN_PATH + TEST_ID_DATA, test_id)

# Save the cleaned test text as CSV.
clean_test_q1_df.to_csv(DATA_IN_PATH + TEST_CLEAN_Q1_DATA, index=False)
clean_test_q2_df.to_csv(DATA_IN_PATH + TEST_CLEAN_Q2_DATA, index=False)