In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from scipy import stats

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nlpaug.augmenter.word
import plotly.figure_factory as ff
import pickle


import warnings
warnings.filterwarnings("ignore")

In [2]:
with open("../data/raw/en2cn-2k.en2nen2cn", "r", encoding = 'utf-8') as file:
    text = file.read()
    
text = text.split('\n')
text.pop()

data = []
for idx in range(0, len(text), 3):
    row = []
    row.append(text[idx])
    row.append(text[idx+1])
    row.append(text[idx+2])
    data.append(row)

df = pd.DataFrame(data, columns = ['Informal', 'Formal English', 'Formal Chinese'])
display(df.head())
df = df[['Informal', 'Formal English']]

Unnamed: 0,Informal,Formal English,Formal Chinese
0,"U wan me to ""chop"" seat 4 u nt?",Do you want me to reserve seat for you or not?,你要我帮你预留坐位吗？
1,Yup. U reaching. We order some durian pastry a...,Yeap. You reaching? We ordered some Durian pas...,对。你要到了吗？我们已经点了一些榴莲糕点。你快点来。
2,They become more ex oredi... Mine is like 25.....,They become more expensive already. Mine is li...,他们变得更贵了。我的是大概25。这么坏然后他们还比我以前做得少。
3,I'm thai. what do u do?,I'm Thai. What do you do?,我是泰国人。你做什么？
4,Hi! How did your week go? Haven heard from you...,Hi! How did your week go? Haven't heard from y...,嗨！你这周过的怎么样？好长时间没听到你的消息了。一切顺利吗？


In [3]:
for text in df['Formal English'].values:
    augmented = pd.DataFrame({"Informal":[nlpaug.augmenter.word.SynonymAug(aug_src = 'wordnet').augment(text)], "Formal English":[text]})
    df = df.append(augmented, ignore_index = True)

for text in df['Formal English'].values:
    augmented = pd.DataFrame({"Informal":[nlpaug.augmenter.word.SpellingAug().augment(text)], "Formal English":[text]})
    df = df.append(augmented, ignore_index = True)

df.tail()

Unnamed: 0,Informal,Formal English
7995,[Hmm. I think me usally book on weekends. It' ...,Hmm. I think I usually book on weekends. It de...
7996,[Can you ask Then wheater they have aslo andy ...,Can you ask them whether they have for any sms...
7997,[WE a near Coca already.],We are near Coca already.
7998,[Hall elleven. Got lectures. And forget ahout ...,Hall eleven. Got lectures. And forget about co...
7999,[I bing for yoou. ai can not promise you 100% ...,I bring for you. I can not promise you 100% to...


In [4]:
preprocessed_df = pd.DataFrame()
preprocessed_df['encoder_input'] = df.apply(lambda row: '<'+str(row['Informal'])+'>', axis=1)
preprocessed_df['decoder_input'] = df.apply(lambda row: '<'+str(row['Formal English']), axis=1)
preprocessed_df['decoder_output'] = df.apply(lambda row: str(row['Formal English']+'>'), axis=1)
preprocessed_df.head()

Unnamed: 0,encoder_input,decoder_input,decoder_output
0,"<U wan me to ""chop"" seat 4 u nt?>",<Do you want me to reserve seat for you or not?,Do you want me to reserve seat for you or not?>
1,<Yup. U reaching. We order some durian pastry ...,<Yeap. You reaching? We ordered some Durian pa...,Yeap. You reaching? We ordered some Durian pas...
2,<They become more ex oredi... Mine is like 25....,<They become more expensive already. Mine is l...,They become more expensive already. Mine is li...
3,<I'm thai. what do u do?>,<I'm Thai. What do you do?,I'm Thai. What do you do?>
4,<Hi! How did your week go? Haven heard from yo...,<Hi! How did your week go? Haven't heard from ...,Hi! How did your week go? Haven't heard from y...


In [5]:
fig = ff.create_distplot([preprocessed_df['encoder_input'].apply(len).values], ['Count'])
fig.update_layout(title= 'Length of Encoder Input (Informal text)')
fig.show()

display(stats.describe(preprocessed_df['encoder_input'].apply(len)))

DescribeResult(nobs=8000, minmax=(4, 296), mean=79.750375, variance=2162.048568430429, skewness=0.9376381803285172, kurtosis=0.4122105004678076)

In [6]:
fig = ff.create_distplot([preprocessed_df['decoder_input'].apply(len).values], ['Count'])
fig.update_layout(title= 'Length of Decoder Input (Normalized text)')
fig.show()

display(stats.describe(preprocessed_df['decoder_input'].apply(len)))

DescribeResult(nobs=8000, minmax=(4, 282), mean=72.438, variance=1953.5483495436931, skewness=0.9469251210577967, kurtosis=0.40664669799959263)

In [7]:
fig = ff.create_distplot([preprocessed_df['decoder_output'].apply(len).values], ['Count'])
fig.update_layout(title= 'Length of Decoder Output (Normalized text)')
fig.show()

display(stats.describe(preprocessed_df['decoder_output'].apply(len)))

DescribeResult(nobs=8000, minmax=(4, 282), mean=72.438, variance=1953.5483495436931, skewness=0.9469251210577967, kurtosis=0.40664669799959263)

In [8]:
preprocessed_df = preprocessed_df[(preprocessed_df['encoder_input'].apply(len) <= 200) & (preprocessed_df['decoder_input'].apply(len) <= 200) & (preprocessed_df['decoder_output'].apply(len) <= 200)]
print('Total samples after filtering are:', preprocessed_df.shape[0])

Total samples after filtering are: 7878


In [9]:
train, validation = train_test_split(preprocessed_df, train_size=0.9, random_state = 42)
validation, test = train_test_split(validation, test_size = 0.5, random_state = 42)
train.reset_index(inplace=True, drop=True)
validation.reset_index(inplace=True, drop=True)
test.reset_index(inplace=True, drop=True)
train.to_csv('../data/processed/train.csv')
validation.to_csv('../data/processed/validation.csv')
test.to_csv('../data/processed/test.csv')
print('Shape of Training set:', train.shape)
print('Shape of Validation set:', validation.shape)
print('Shape of Test set:', test.shape)

Shape of Training set: (7090, 3)
Shape of Validation set: (394, 3)
Shape of Test set: (394, 3)


In [10]:
tokenizer_informal = Tokenizer(filters = '"#$%&()*+-/=@[\\]^_`{|}~\t\n', lower = False, char_level = True)
tokenizer_informal.fit_on_texts(train['encoder_input'].values)

tokenizer_normalized = Tokenizer(filters = '"#$%&()*+-/=@[\\]^_`{|}~\t\n', lower = False, char_level = True)
tokenizer_normalized.fit_on_texts(train['decoder_input'].values)

tokenizer_hashmap = {'informal': tokenizer_informal, 'normalized': tokenizer_normalized}

with open('../model/tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer_hashmap, file, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
print('Vocab size of Informal text:', len(tokenizer_informal.word_index.keys()))
print('Vocab size of Normalized text:', len(tokenizer_normalized.word_index.keys()))

Vocab size of Informal text: 121
Vocab size of Normalized text: 91


In [12]:
padded_encoder_input_sequence = pad_sequences(tokenizer_informal.texts_to_sequences(train['encoder_input'].values), maxlen = 200, dtype='int32', padding='post')
padded_decoder_input_sequence = pad_sequences(tokenizer_normalized.texts_to_sequences(train['decoder_input'].values), maxlen = 200, dtype='int32', padding='post')
padded_decoder_output_sequence = pad_sequences(tokenizer_normalized.texts_to_sequences(train['decoder_output'].values), maxlen = 200, dtype='int32', padding='post')

print('Original sentence:')
print(train['encoder_input'][0], '\n')

print('Tokenized and padded input sentence:')
print(padded_encoder_input_sequence[0], '\n')

print('Length of tokenized and padded input sentence:', padded_encoder_input_sequence.shape[1])

Original sentence:
<Hope so... call ya when i'm better :)> 

Tokenized and padded input sentence:
[22 33  3 24  2  1  9  3 11 11 11  1 19  5 12 12  1 14  5  1 20 10  2  6
  1  7 17 16  1 28  2  4  4  2  8  1 67 77 21  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0] 

Length of tokenized and padded input sentence: 200
