In [1]:
import os
import sys
import requests
from pathlib import Path
from itertools import chain

import numpy as np
from bs4 import BeautifulSoup

In [2]:
LASER_PATH = ".."
sys.path.append(LASER_PATH + '/source')
sys.path.append(LASER_PATH + '/source/lib')

CACHE_PATH = Path("cache/")
CACHE_PATH.mkdir(exist_ok=True)
MODEL_PATH = Path("../models")

os.environ["LASER"] = LASER_PATH 

In [3]:
# Local network settings
os.environ["http_proxy"] = "http://127.0.0.1:11233"
os.environ["https_proxy"] = "http://127.0.0.1:11233"

In [4]:
from indexing import IndexCreate
from embed import SentenceEncoder, EncodeLoad, EncodeFile
from text_processing import Token, BPEfastApply

In [5]:
!rm cache/title*

rm: cannot remove 'cache/title*': No such file or directory


## Collect Data

[Feedly Stream API documentation](https://developer.feedly.com/v3/streams/)

In [6]:
english_feeds = [
    "https://www.nytimes.com/services/xml/rss/nyt/HomePage.xml",
    "https://www.nytimes.com/services/xml/rss/nyt/AsiaPacific.xml",
    "https://www.nytimes.com/services/xml/rss/nyt/Europe.xml",
    "https://www.nytimes.com/services/xml/rss/nyt/Politics.xml",
    "http://feeds.nytimes.com/nyt/rss/Business",
    "http://feeds.nytimes.com/nyt/rss/Technology",
    "https://www.nytimes.com/svc/collections/v1/publish/https://www.nytimes.com/section/opinion/rss.xml"
]

In [7]:
def fetch_latest(feed_url, count=500):
    res = requests.get(
        'https://cloud.feedly.com//v3/streams/contents'
        f'?streamId=feed/{feed_url}&count={count}')
    return res.json()

In [8]:
english_items = list(chain.from_iterable([
    fetch_latest(x)["items"] for x in english_feeds]))
english_titles = [x["title"] for x in english_items if "Briefing" not in x["title"]]
english_titles[:5]

['Europe Flags American Territories in ‘Dirty Money’ List, Deepening Rift With U.S.',
 'Airbus to Halt Production of A380 Jumbo Jet as Orders Dry Up',
 'Indonesia’s Next Election Is in April. The Islamists Have Already Won.',
 'A New Model to Stop the Next School Shooting',
 'Editorial Observer: Everyone Needs Legal Help. That Doesn’t Mean Everyone Needs a Lawyer.']

In [9]:
chinese_items = fetch_latest('https://cn.nytimes.com/rss/', count=30)["items"]

In [10]:
# collect news title in English
chinese_titles, translated_titles = [], []
for item in chinese_items:
    if "title" not in item or "简报" in item["title"]:
        print(f"skipped {item['originId']}")
        continue
    res = requests.get(item["originId"] + "dual/")
    soup = BeautifulSoup(res.text)
    if soup.find("h1", attrs={"class": "en-title"}):
        translated_titles.append(soup.find("h1", attrs={"class": "en-title"}).text)
        chinese_titles.append(item["title"])
    else:
        print(f"English Title Not Found for {item['title']}")
# for a, b in zip(chinese_titles, translated_titles):
#     print(a, b, sep="\n")
#     print("-" * 20)
len(chinese_titles)

English Title Not Found for 伊斯兰革命40年后的伊朗（漫画）
skipped https://cn.nytimes.com/morning-brief/20190214/trump-china-zhang-yimou-berlin-film/
English Title Not Found for “涂黑脸”是种族歧视吗？
skipped https://cn.nytimes.com/morning-brief/20190213/trump-china-trade-lin-zhao/
English Title Not Found for 狱中血书：中共为何害怕死去的林昭
English Title Not Found for 回应“裸照门”，贝佐斯自创新词？
skipped https://cn.nytimes.com/morning-brief/20190212/china-internet-censorship-trump-border-wall/
English Title Not Found for 澳大利亚取消中国富商黄向墨永久居住权
English Title Not Found for 与机器人谈一场恋爱？
skipped https://cn.nytimes.com/morning-brief/20190211/china-workers-protests-turkey-uighurs/
skipped https://cn.nytimes.com/?utm_source=RSS


19

In [11]:
english_titles = list(set(english_titles + translated_titles))

In [12]:
with open(CACHE_PATH / "title.zh", "w") as fout:
    fout.write("\n".join(chinese_titles))
with open(CACHE_PATH / "title.en", "w") as fout:
    fout.write("\n".join(english_titles))    

## Tokenization

In [13]:
encoder = SentenceEncoder(
    str(MODEL_PATH / "bilstm.93langs.2018-12-26.pt"),
    max_sentences=None,
    max_tokens=10000,
    cpu=False)

In [14]:
bpe_codes = str(MODEL_PATH / "93langs.fcodes")

In [15]:
for lang in ("zh", "en"):
    Token(
        str(CACHE_PATH / f"title.{lang}"),
        str(CACHE_PATH / f"title.{lang}.tok"),
        lang=lang,
        romanize=False,
        lower_case=True, gzip=False,
        verbose=True)
    BPEfastApply(
        str(CACHE_PATH / f"title.{lang}.tok"),
        str(CACHE_PATH / f"title.{lang}.bpe"),
        bpe_codes,
        verbose=True, over_write=True)
    EncodeFile(
        encoder,
        str(CACHE_PATH / f"title.{lang}.bpe"),
        str(CACHE_PATH / f"title.{lang}.enc"),
        verbose=True, over_write=True)    

 - Tokenizer: title.zh in language zh  
 - fast BPE: processing title.zh.tok
 - Encoder: title.zh.bpe to title.zh.enc
 - Encoder: 19 sentences in 0s
 - Tokenizer: title.en in language en  
 - fast BPE: processing title.en.tok
 - Encoder: title.en.bpe to title.en.enc
 - Encoder: 2992 sentences in 0s


## Indexing and Evaluation

In [16]:
data_en, index_en = IndexCreate(
    str(CACHE_PATH / "title.en.enc"), 'FlatL2', verbose=True, save_index=False)
data_zh, index_zh = IndexCreate(
    str(CACHE_PATH / "title.zh.enc"), 'FlatL2', verbose=True, save_index=False)

 - embedding: cache/title.en.enc 2992 examples of dim 1024
 - creating FAISS index
 - embedding: cache/title.zh.enc 19 examples of dim 1024
 - creating FAISS index


In [17]:
# Top 3 predictions (Nearest 3 Neighbors)
_, matched_indices = index_en.search(data_zh, 3)

In [18]:
top1_correct, top3_correct = 0, 0
for i, ztitle in enumerate(chinese_titles):
    print(
        "Chinese:    ", ztitle, "\n",
        "Correct:    ", translated_titles[i], "\n",
        "Predict(1): ", english_titles[matched_indices[i, 0]], "\n",
        "Predict(2): ", english_titles[matched_indices[i, 1]], "\n",
        "Predict(3): ", english_titles[matched_indices[i, 2]], "\n",
        sep=""
    )
    if english_titles[matched_indices[i, 0]] == translated_titles[i]:
        top1_correct += 1
    if translated_titles[i] in (
        english_titles[matched_indices[i, 0]],
        english_titles[matched_indices[i, 1]],
        english_titles[matched_indices[i, 2]]):
        top3_correct += 1
    print("-" * 20)
print(f"Top 1 Accuracy: {top1_correct / len(chinese_titles) * 100:.2f}%")
print(f"Top 3 Accuracy: {top3_correct / len(chinese_titles) * 100:.2f}%")

Chinese:    决定成功的“两种规则”
Correct:    The Two Codes Your Kids Need to Know
Predict(1): A Tale of Two Trumps
Predict(2): The Case Against ‘Border Security’
Predict(3): Personal Stories Behind the ‘Green Book’

--------------------
Chinese:    文革专家马若德：照亮那个难以理解的时代
Correct:    Roderick MacFarquhar, Eminent China Scholar, Dies at 88
Predict(1): Nonfiction: Where Trolls Reigned Free: A New History of Reddit
Predict(2): Walter H. Munk, Scientist-Explorer Who Illuminated the Deep, Dies at 101
Predict(3): Election 2018 Misinformation Roundup: ‘Problematic’ Text Messages and Doctored Mailers

--------------------
Chinese:    张艺谋文革题材新片退出柏林电影节
Correct:    Film Set in China’s Cultural Revolution Is Pulled From Berlin Festival
Predict(1): Film Set in China’s Cultural Revolution Is Pulled From Berlin Festival
Predict(2): Army Issues New Reprimand to Leader of Green Beret Team Ambushed in Niger
Predict(3): Reformed Gang Leader in Denmark Shot Dead Leaving Book Party

--------------------
Chinese:    美国将