In [1]:
import re
import os
import sys
import requests
from pathlib import Path
from itertools import chain

import numpy as np
from bs4 import BeautifulSoup

In [2]:
# Relative locations of the LASER checkout (the notebook's parent directory)
# and of the directory holding the model files.
LASER_PATH = Path("..")
MODEL_PATH = Path("../models")

# Export the absolute checkout path through the LASER environment variable and
# put that same path on sys.path; the import below presumably resolves against
# it, so it has to come after the path tweak rather than in the import cell.
os.environ["LASER"] = str(LASER_PATH.resolve())
sys.path.append(os.environ["LASER"])

from source.shortcuts import lines_to_index

In [3]:
# Local network settings.
# All outbound HTTP(S) requests in this notebook go through this SOCKS proxy.
# The address is specific to the author's network, so it can be overridden
# without editing the notebook: set NOTEBOOK_SOCKS_PROXY to another proxy URL,
# or to an empty string to pass `proxies=None` to requests.
_proxy_url = os.environ.get("NOTEBOOK_SOCKS_PROXY", "socks5h://192.168.199.10:12133")
proxies = {
    'http': _proxy_url,
    'https': _proxy_url,
} if _proxy_url else None

In [4]:
def cjk_detect(texts):
    """Guess which CJK language a string contains, if any.

    The string is tested against one character range per language, in the
    order Korean, Japanese, Chinese; the first range with a match wins.

    Parameters
    ----------
    texts : str
        The string to inspect (a single string, despite the plural name).

    Returns
    -------
    str or None
        "ko", "ja" or "zh", or None when no range matches.
    """
    # Order matters: a string matching several ranges gets the first language.
    script_patterns = (
        ("ko", "[\uac00-\ud7a3]"),
        ("ja", "[\u3040-\u30ff]"),
        ("zh", "[\u4e00-\u9FFF]"),
    )
    for lang_code, pattern in script_patterns:
        if re.search(pattern, texts) is not None:
            return lang_code
    return None

## Collect Data

[Feedly Stream API documentation](https://developer.feedly.com/v3/streams/)

In [5]:
# RSS feeds supplying the English headlines: four section feeds on
# www.nytimes.com, two on feeds.nytimes.com, and the opinion collection feed.
english_feeds = [
    f"https://www.nytimes.com/services/xml/rss/nyt/{section}.xml"
    for section in ("HomePage", "AsiaPacific", "Europe", "Politics")
] + [
    f"http://feeds.nytimes.com/nyt/rss/{section}"
    for section in ("Business", "Technology")
] + [
    "https://www.nytimes.com/svc/collections/v1/publish/https://www.nytimes.com/section/opinion/rss.xml",
]

In [6]:
def fetch_latest(feed_url, count=300, timeout=30):
    """Fetch the most recent entries of an RSS feed via the Feedly Streams API.

    Parameters
    ----------
    feed_url : str
        URL of the RSS feed. It is sent to Feedly as the stream id
        ``feed/<feed_url>``.
    count : int, optional
        Value of the ``count`` query parameter (default 300).
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    The decoded JSON body of the response; callers in this notebook read its
    ``"items"`` entry.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    res = requests.get(
        'https://cloud.feedly.com/v3/streams/contents',
        # Passing the query through `params` percent-encodes the stream id, so
        # a feed URL containing '?', '&' or another URL cannot corrupt the
        # query string.
        params={'streamId': f'feed/{feed_url}', 'count': count},
        proxies=proxies,
        timeout=timeout,
    )
    # Fail here with the HTTP status rather than later with a KeyError on "items".
    res.raise_for_status()
    return res.json()

In [7]:
# Pull the latest entries of every English feed into one flat list.
english_items = list(chain.from_iterable(
    fetch_latest(feed)["items"] for feed in english_feeds))
# Keep only usable headlines: skip items that carry no title at all (the same
# guard the Chinese feed loop applies), skip briefings, and skip any title
# that contains CJK characters.
english_titles = [
    item["title"] for item in english_items
    if "title" in item
    and "Briefing" not in item["title"]
    and not cjk_detect(item["title"])
]
english_titles[:5]

['Why Crocodiles Are Not Just Living Fossils',
 'Becoming a Digital Grandparent',
 'The Myth of Period Syncing',
 'What Was the Cause of the Excruciating Pain in His Shoulders and Hips?',
 'China Summons Tech Giants to Warn Against Cooperating With Trump Ban']

In [8]:
# Latest 50 entries of the cn.nytimes.com feed.
chinese_items = fetch_latest(feed_url='https://cn.nytimes.com/rss/', count=50)["items"]

In [9]:
# Pair every Chinese headline with its English counterpart, which is read from
# the <h1 class="en-title"> element of the article's "dual/" page.
chinese_titles, translated_titles = [], []
for item in chinese_items:
    # Items without a title and briefings ("简报") are not usable headlines.
    if "title" not in item or "简报" in item["title"]:
        print(f"skipped {item['originId']}")
        continue
    # A timeout keeps one unresponsive page from hanging the whole cell.
    res = requests.get(item["originId"] + "dual/", proxies=proxies, timeout=30)
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser is installed, so results could differ between machines, and it
    # emits a warning.
    soup = BeautifulSoup(res.text, "html.parser")
    en_heading = soup.find("h1", attrs={"class": "en-title"})
    if en_heading is not None:
        # Append to both lists together so they stay index-aligned.
        translated_titles.append(en_heading.text)
        chinese_titles.append(item["title"])
    else:
        print(f"English Title Not Found for {item['title']}")
len(chinese_titles)

English Title Not Found for 75年前，敲响纳粹德国丧钟的那个作战日
skipped https://cn.nytimes.com/morning-brief/20190606/xi-jinping-china-russia-richard-liu-rape-video/
skipped https://cn.nytimes.com/morning-brief/20190605/tiananmen-anniversary-china-food-prices/
English Title Not Found for 后天安门时代的“失忆人民共和国”
skipped https://cn.nytimes.com/morning-brief/20190604/tiananmen-30th-anniversary/
English Title Not Found for 带你去看曼哈顿悬日
English Title Not Found for “六四”前夕部分中国异见者Twitter账号被封
skipped https://cn.nytimes.com/morning-brief/20190603/power-games-tiananmen-crackdown-trade-war/
English Title Not Found for 关于赵小兰家族与中国的密切联系，你应该知道的五个要点
English Title Not Found for 前军官谈“六四”：派军队清场，就不好再出牌了
English Title Not Found for 一场没有“互扔泥巴”的辩论
skipped https://cn.nytimes.com/morning-brief/20190531/june-4th-in-chengdu-trade-war-tariff-manufacturer/
English Title Not Found for 小天安门：美国议员回忆成都“六四”
skipped https://cn.nytimes.com/morning-brief/20190530/huawei-us-lawsuit-anchors-debate-trade/
English Title Not Found for 向前一步，竞选美国总统


35

In [10]:
# Add the scraped English headlines to the candidate pool and drop duplicates.
# dict.fromkeys keeps first-seen order, so each title lands at the same list
# position on every run; a set of strings iterates in an order that can change
# between kernel sessions, which made the index positions irreproducible.
english_titles = list(dict.fromkeys(english_titles + translated_titles))

## Tokenization and Indexing

In [11]:
# Run the English candidate titles through lines_to_index with the 93-language
# model files, keeping both returned values for the evaluation below.
data_en, index_en = lines_to_index(
    "en",
    english_titles,
    str(MODEL_PATH.joinpath("bilstm.93langs.2018-12-26.pt")),
    str(MODEL_PATH.joinpath("93langs.fcodes")),
    use_cpu=False,
    batch_size=20,
)

 - Tokenizer: source in language en  
 - fast BPE: processing token
 - Encoder: bpe to enc
 - Encoder: 1693 sentences in 12s
 - embedding: /tmp/tmpt_f819j9/enc 1693 examples of dim 1024
 - creating FAISS index


In [12]:
# Same call for the Chinese titles, with the same model files and a smaller
# batch size.
data_zh, index_zh = lines_to_index(
    "zh",
    chinese_titles,
    str(MODEL_PATH.joinpath("bilstm.93langs.2018-12-26.pt")),
    str(MODEL_PATH.joinpath("93langs.fcodes")),
    use_cpu=False,
    batch_size=5,
)

 - Tokenizer: source in language zh  
 - fast BPE: processing token
 - Encoder: bpe to enc
 - Encoder: 35 sentences in 0s
 - embedding: /tmp/tmp2fr7uink/enc 35 examples of dim 1024
 - creating FAISS index


## Evaluation

In [13]:
# Top 3 predictions (nearest 3 neighbors): query the English index with the
# Chinese data. Only the second returned value is used; the first goes to a
# named throwaway instead of `_`, which IPython uses for the last cell output.
_distances, matched_indices = index_en.search(data_zh, 3)

In [14]:
# Score the retrieval: a Chinese title counts as a top-1 hit when its first
# neighbour is exactly its English counterpart, and as a top-3 hit when the
# counterpart appears anywhere among the three neighbours.
top1_correct = 0
top3_correct = 0
for row, zh_title in enumerate(chinese_titles):
    expected = translated_titles[row]
    # Look the three neighbour titles up once and reuse them below.
    candidates = [english_titles[matched_indices[row, rank]] for rank in range(3)]
    print(
        f"Chinese:    {zh_title}\n"
        f"Correct:    {expected}\n"
        f"Predict(1): {candidates[0]}\n"
        f"Predict(2): {candidates[1]}\n"
        f"Predict(3): {candidates[2]}\n"
    )
    if candidates[0] == expected:
        top1_correct += 1
    if expected in candidates:
        top3_correct += 1
    print("-" * 20)
print(f"Top 1 Accuracy: {top1_correct / len(chinese_titles) * 100:.2f}%")
print(f"Top 3 Accuracy: {top3_correct / len(chinese_titles) * 100:.2f}%")

Chinese:    一场谋杀，一个破碎的加拿大移民梦
Correct:    A Tale of Murder, Revenge and a Canadian Immigrant Dream Gone Wrong
Predict(1): A Tale of Murder, Revenge and a Canadian Immigrant Dream Gone Wrong
Predict(2): Hong Kong Dispatch: A Trump Bump for Hong Kong’s Last Commercial Cantonese Opera Theater
Predict(3): Gunmen Attack Pakistan Luxury Hotel Used by Chinese

--------------------
Chinese:    刘强东案引发中国强奸文化讨论
Correct:    In China, a Viral Video Sparks a Challenge to Rape Culture
Predict(1): In China, a Viral Video Sparks a Challenge to Rape Culture
Predict(2): In China, a Viral Video Sets Off a Challenge to Rape Culture
Predict(3): Treasury Declines to Label China Currency Manipulator

--------------------
Chinese:    中国反垄断机构处罚福特在华合资公司
Correct:    Ford Is Fined in China as Trade Fight With U.S. Rages
Predict(1): In Push for Trade Deal, Trump Administration Shelves Sanctions Over China’s Crackdown on Uighurs
Predict(2): China Strikes Defiant Stance on Trade Against Trump
Predict(3): Treasury Decl