In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the two training CSVs (the `en` file first, then the `tcn` file).
df_train_en, df_train_tcn = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_en.csv', 'data/train_tcn.csv')
)

In [46]:
# Load the test set plus the dev pair (dev_tcn / dev_en) that the note below
# earmarks for BLEU scoring.
df_test, df_dev_tcn, df_dev_en = (
    pd.read_csv(csv_path)
    for csv_path in ('data/test_tcn.csv', 'data/dev_tcn.csv', 'data/dev_en.csv')
)

<b>Note:</b> Use dev_en and dev_tcn as a parallel (matching) translation set to compute the BLEU score of the trained model.

## Pre-Processing

In [5]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [6]:
# Preview the first five rows of the `tcn` training frame.
df_train_tcn.head(n=5)

Unnamed: 0,product_title,category
0,Gucci Gucci Guilty Pour Femme Stud Edition 罪愛女...,Health & Beauty
1,（二手）PS4 GTA 5 俠盜獵車手5 Grand Theif Auto V繁體 中文版,Game Kingdom
2,百獸卡,Life & Entertainment
3,nac nac活氧全效柔衣素,Mother & Baby
4,#Nike耐吉官方F.C. 男子足球長褲新款標準型 拒水 拉鏈褲腳\nCD0557,Men's Apparel


## Create Monolingual Embeddings

Tokenize Chinese text (after converting Traditional characters to Simplified with OpenCC)

In [70]:
# Display the test frame; pandas truncates the middle rows in the rendered output.
df_test

Unnamed: 0,text,split
0,【PolarStar】美麗諾羊毛保暖襪『淺灰』P18634,public
1,甜蜜水晶~天然水晶五行珠手鍊10mm手鍊,public
2,粉晶六角柱純銀項鍊,public
3,3M SCOTCH VHB 超強力雙面膠-戶外專用 V1808,public
4,燈專屬優惠 *4盒,public
...,...,...
9995,RAINS BACKPACK 後背包 公事包 筆電包 丹麥品牌 白色,private
9996,Airwalk 凉鞋 童鞋 深蓝色 中童 A823230180 no002,private
9997,NORNS 【迪士尼5000mAh行動電源(泰瑞鴨)】迪士尼 鴨嘴獸泰瑞 充電,private
9998,"DERWENT 達爾文設計製圖鉛筆 鐵盒12支(6B-4H),設計*34214",private


In [71]:
!pip install opencc

Collecting opencc
  Downloading OpenCC-1.1.1-py2.py3-none-win_amd64.whl (726 kB)
Installing collected packages: opencc
Successfully installed opencc-1.1.1


In [50]:
import opencc
# Traditional -> Simplified converter ('t2s.json' is OpenCC's
# Traditional-to-Simplified configuration).
# NOTE(review): the name `converter` is rebound to an 's2t.json' converter in a
# later cell; re-running the conversion cell after that point would silently
# convert in the opposite direction.
converter = opencc.OpenCC('t2s.json')

In [52]:
# Run every test title through `converter`; the 'text' column is overwritten with the result.
df_test['text'] = df_test['text'].map(converter.convert)

Tokenize text

In [108]:
import jieba
# Switch on jieba's paddle-based segmentation backend.
# NOTE(review): `tokenize` below calls `jieba.cut(s)` without `use_paddle=True`,
# so the paddle model does not appear to be used for segmentation — confirm
# whether this call is still needed.
jieba.enable_paddle()

Paddle enabled successfully......


In [112]:
def tokenize(s):
    """Segment `s` with jieba and return the non-whitespace tokens as a list.

    `jieba.cut` is called with its default arguments (the default is the exact
    mode).  The segments are joined with single spaces and re-split on
    whitespace, so whitespace-only segments are dropped and a segment that
    contains inner whitespace is broken up.
    """
    spaced = " ".join(jieba.cut(s))
    return spaced.split()

# Segment every test title into a list of tokens.
df_test['text_tokenized'] = df_test['text'].map(tokenize)

## Train Model on External Data

Unsupervised NMT proved too difficult here: it requires a complex training setup that is not feasible with the given data.

<b>Pivot 1: </b> Use dictionaries and define a rule-based translation method

In [169]:
# Load the space-separated dictionary file without a header row.
# `keep_default_na=False` together with `dtype=str` keeps every field as text:
# with the pandas defaults, entries such as "NA", "null" or "nan" are parsed as
# float NaN, which later breaks the string concatenation in the translation
# helpers.
zh_dict = pd.read_csv(
    'data/zh-en.txt',
    sep=" ",
    header=None,
    dtype=str,
    keep_default_na=False,
)
# Map each first-column entry to the list of the remaining fields on its row,
# so a translation is looked up as `zh_dict[word][0]`.
zh_dict = zh_dict.set_index(0).T.to_dict('list')

  


In [170]:
# Number of distinct keys in the lookup dictionary.
len(zh_dict)

13786

In [171]:
def translate_zh(line, dictionary=None):
    """Translate `line` one character at a time using a lookup dictionary.

    Each character that is a key of the dictionary is replaced by the first
    entry of its value list; any other character is copied through unchanged.
    The pieces are concatenated with no separator.

    Parameters
    ----------
    line : str
        Text to translate.
    dictionary : mapping, optional
        Maps a source string to a sequence whose first element is the
        translation.  Defaults to the notebook-level `zh_dict`, so existing
        one-argument calls behave as before.

    Returns
    -------
    str
        The character-wise translation of `line`.
    """
    if dictionary is None:
        # Resolved at call time so that a reloaded `zh_dict` is picked up.
        dictionary = zh_dict
    pieces = []
    for char in line:
        try:
            pieces.append(dictionary[char][0])
        except KeyError:
            # Character not covered by the dictionary: keep it as-is.
            pieces.append(char)
    # A single join avoids rebuilding the string on every character.
    return "".join(pieces)

In [172]:
def translate_zh_token(line):
    """Translate a sequence of tokens, returning one output string per token.

    A token that is a key of `zh_dict` is replaced by the first entry of its
    value list.  Any other token is handed to `translate_zh`, which translates
    it character by character.
    """
    return [
        zh_dict[token][0] if token in zh_dict else translate_zh(token)
        for token in line
    ]

In [173]:
# Translate each tokenised title, token by token.
df_test['preds_tokenized'] = df_test['text_tokenized'].map(translate_zh_token)

In [174]:
def un_tokenize(s):
    """Concatenate the items of `s`, appending a single space after each one.

    The result therefore ends with a trailing space whenever `s` is non-empty;
    an empty input yields the empty string.
    """
    return "".join(piece + " " for piece in s)

In [175]:
# Glue each translated token list back into a single string.
df_test['preds'] = df_test['preds_tokenized'].map(un_tokenize)

In [176]:
def isEnglish(s):
    """Return True when `s` consists solely of ASCII characters, else False.

    The text is encoded to UTF-8 bytes and an ASCII decode is attempted; any
    byte outside the ASCII range makes the decode fail.
    """
    utf8_bytes = s.encode(encoding='utf-8')
    try:
        utf8_bytes.decode('ascii')
    except UnicodeDecodeError:
        return False
    return True

# Running tally of every character blanked out by remove_cn, keyed by character.
unrecognized = {}
def remove_cn(s):
    """Blank out every non-ASCII character of `s` except the brackets 【 and 】.

    Characters accepted by `isEnglish`, and the two bracket characters, are
    kept.  Every other character is replaced by a single space and counted in
    the module-level `unrecognized` dict (a side effect that accumulates
    across calls).
    """
    kept_brackets = ['【','】']
    cleaned = []
    for char in s:
        if isEnglish(char) or char in kept_brackets:
            cleaned.append(char)
            continue
        cleaned.append(' ')
        unrecognized[char] = unrecognized.get(char, 0) + 1
    return ''.join(cleaned)

In [177]:
# Blank out the remaining non-ASCII characters in every prediction (this also fills `unrecognized`).
df_test['preds'] = df_test['preds'].map(remove_cn)

In [178]:
# Tabulate the blanked-out characters with their counts, most frequent first.
ur = pd.DataFrame(list(unrecognized.items()), columns=['char', 'count'])
ur = ur.sort_values(by='count', ascending=False)

In [180]:
# Add a 'tcn' column holding each character converted with the 's2t.json'
# (Simplified -> Traditional) configuration.
# NOTE(review): this rebinds the notebook-level `converter`, which an earlier
# cell created with 't2s.json'; cells re-run after this point will see the
# s2t instance.
converter = opencc.OpenCC('s2t.json')
ur['tcn'] = ur['char'].map(converter.convert)

In [181]:
# Show the first 20 rows of the frequency table.
ur.head(20)

Unnamed: 0,char,count,tcn
24,款,396,款
113,套,379,套
29,带,342,帶
148,台,341,臺
147,贴,336,貼
140,长,334,長
152,子,333,子
229,宝,331,寶
311,纹,330,紋
158,爱,330,愛


In [183]:
# Rename the prediction column to 'translation_output' and write that single
# column to disk without the index.
df_test = df_test.rename(columns={'preds': 'translation_output'})
df_test['translation_output'].to_csv('results1.csv', index=False)