In [1]:
import os
import sys
import requests
from pathlib import Path
from itertools import chain

import numpy as np
from bs4 import BeautifulSoup

In [2]:
# The LASER checkout is expected one directory above this notebook. Its
# resolved location is published through the LASER environment variable and
# added to the import path so that the `source` package can be imported.
LASER_PATH = Path("..")
os.environ["LASER"] = str(LASER_PATH.resolve())
sys.path.append(os.environ["LASER"])

# Directory holding the model files passed to lines_to_index.
MODEL_PATH = LASER_PATH / "models"

# This import only resolves after the sys.path change above.
from source.shortcuts import lines_to_index

In [3]:
# Local network settings: both URL schemes are routed through the same
# SOCKS5 proxy listening on localhost.
proxies = {
    scheme: 'socks5h://127.0.0.1:12133'
    for scheme in ('http', 'https')
}

## Collect Data

[Feedly Stream API documentation](https://developer.feedly.com/v3/streams/)

In [4]:
# RSS feeds that supply the English candidate headlines, grouped by host.
english_feeds = [
    # www.nytimes.com section feeds
    "https://www.nytimes.com/services/xml/rss/nyt/HomePage.xml",
    "https://www.nytimes.com/services/xml/rss/nyt/AsiaPacific.xml",
    "https://www.nytimes.com/services/xml/rss/nyt/Europe.xml",
    "https://www.nytimes.com/services/xml/rss/nyt/Politics.xml",
] + [
    # feeds.nytimes.com feeds
    "http://feeds.nytimes.com/nyt/rss/Business",
    "http://feeds.nytimes.com/nyt/rss/Technology",
] + [
    # opinion section feed
    "https://www.nytimes.com/svc/collections/v1/publish/https://www.nytimes.com/section/opinion/rss.xml",
]

In [5]:
def fetch_latest(feed_url, count=300, timeout=30):
    """Fetch the most recent entries of an RSS feed via the Feedly stream API.

    Parameters
    ----------
    feed_url : str
        URL of the RSS feed. It is sent as the stream id ``feed/<feed_url>``.
    count : int, optional
        Value of the ``count`` query parameter (number of entries requested).
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    res = requests.get(
        'https://cloud.feedly.com/v3/streams/contents',
        # Passing the query through `params` lets requests percent-encode the
        # feed URL embedded in the stream id; pasting it raw into the query
        # string leaves its own ':' and '/' (and any '?' or '&') unescaped.
        params={'streamId': f'feed/{feed_url}', 'count': count},
        proxies=proxies,
        timeout=timeout,
    )
    # Surface HTTP errors here instead of letting callers fail later on a
    # missing key in an error payload.
    res.raise_for_status()
    return res.json()

In [6]:
# Gather the entries of every English feed into one flat list.
english_items = list(chain.from_iterable(
    fetch_latest(x)["items"] for x in english_feeds))
# Keep only the headline text. Entries without a "title" key are skipped
# (indexing them would raise KeyError), and headlines containing "Briefing"
# are left out.
english_titles = [
    x["title"] for x in english_items
    if "title" in x and "Briefing" not in x["title"]
]
english_titles[:5]

['Li Rui, a Mao Confidant Who Turned Party Critic, Dies at 101',
 'Nigeria Postpones Election Just Hours Before Polls Open',
 'Mueller’s Office Recommends Paul Manafort Serve Up to 25 Years in Prison',
 'An Intrepid Explorer of Mars Falls Silent',
 'Lens: Martin Parr: 48 Years of Photographing the Quirky and Kitschy in Manchester']

In [7]:
# Request 50 entries of the Chinese-edition feed and keep its "items" list.
chinese_items = fetch_latest(
    feed_url='https://cn.nytimes.com/rss/',
    count=50,
)["items"]

In [8]:
# Pair each Chinese headline with the English headline found on the article's
# "dual/" page. Entries without a title, or whose title contains "简报"
# ("briefing"), are skipped.
chinese_titles, translated_titles = [], []
for item in chinese_items:
    if "title" not in item or "简报" in item["title"]:
        print(f"skipped {item['originId']}")
        continue
    # Normalise the trailing slash so "dual/" is always appended as its own
    # path segment, even when originId does not end with "/".
    dual_url = item["originId"].rstrip("/") + "/dual/"
    res = requests.get(dual_url, proxies=proxies)
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    soup = BeautifulSoup(res.text, "html.parser")
    # Look the heading up once and reuse the result.
    en_heading = soup.find("h1", attrs={"class": "en-title"})
    if en_heading is not None:
        translated_titles.append(en_heading.text)
        chinese_titles.append(item["title"])
    else:
        print(f"English Title Not Found for {item['title']}")
len(chinese_titles)

English Title Not Found for 当创新和言论自由成为一种武器
skipped https://cn.nytimes.com/morning-brief/20190215/gui-minhai-sweden-china-new-zealand-huawei/
English Title Not Found for 伊斯兰革命40年后的伊朗（漫画）
skipped https://cn.nytimes.com/morning-brief/20190214/trump-china-zhang-yimou-berlin-film/
English Title Not Found for “涂黑脸”是种族歧视吗？
skipped https://cn.nytimes.com/morning-brief/20190213/trump-china-trade-lin-zhao/
English Title Not Found for 狱中血书：中共为何害怕死去的林昭
English Title Not Found for 回应“裸照门”，贝佐斯自创新词？
skipped https://cn.nytimes.com/morning-brief/20190212/china-internet-censorship-trump-border-wall/
English Title Not Found for 澳大利亚取消中国富商黄向墨永久居住权
English Title Not Found for 与机器人谈一场恋爱？
skipped https://cn.nytimes.com/morning-brief/20190211/china-workers-protests-turkey-uighurs/
skipped https://cn.nytimes.com/?utm_source=RSS
English Title Not Found for “合法独裁者”（漫画）
English Title Not Found for 什么是“off the record”？
skipped https://cn.nytimes.com/morning-brief/20190201/trump-xi-trade-talk-birth-tourism-crackdow

31

In [9]:
# Add the scraped reference translations to the candidate pool and drop exact
# duplicates. dict.fromkeys keeps first-seen order, so the same inputs always
# give the same row order; list(set(...)) does not guarantee that across
# kernel restarts because string hashing is randomised per process.
english_titles = list(dict.fromkeys(english_titles + translated_titles))

## Tokenization and Indexing

In [10]:
# Embed the English candidate titles and build a search index over them.
# NOTE(review): the two file arguments are presumably the encoder checkpoint
# and the BPE codes, and use_cpu=False presumably runs the encoder on GPU.
# Confirm against source.shortcuts.lines_to_index.
data_en, index_en = lines_to_index(
    "en",
    english_titles,
    str(MODEL_PATH.joinpath("bilstm.93langs.2018-12-26.pt")),
    str(MODEL_PATH.joinpath("93langs.fcodes")),
    use_cpu=False,
)

 - Tokenizer: source in language en  
 - fast BPE: processing token
 - Encoder: bpe to enc
 - Encoder: 1776 sentences in 0s
 - embedding: /tmp/tmp6s2mrot9/enc 1776 examples of dim 1024
 - creating FAISS index


In [11]:
# Embed the Chinese titles with the same model files used for English and
# build a search index over them.
# NOTE(review): the two file arguments are presumably the encoder checkpoint
# and the BPE codes, and use_cpu=False presumably runs the encoder on GPU.
# Confirm against source.shortcuts.lines_to_index.
data_zh, index_zh = lines_to_index(
    "zh",
    chinese_titles,
    str(MODEL_PATH.joinpath("bilstm.93langs.2018-12-26.pt")),
    str(MODEL_PATH.joinpath("93langs.fcodes")),
    use_cpu=False,
)

 - Tokenizer: source in language zh  
 - fast BPE: processing token
 - Encoder: bpe to enc
 - Encoder: 31 sentences in 0s
 - embedding: /tmp/tmps0e8jse5/enc 31 examples of dim 1024
 - creating FAISS index


## Evaluation

In [12]:
# Top 3 predictions (Nearest 3 Neighbors): query the English index with the
# Chinese embeddings. The first returned value is discarded. matched_indices
# is later indexed as [row, rank], one row per Chinese title, and its entries
# are used as positions into english_titles.
# NOTE(review): presumably a FAISS search returning (distances, indices) - confirm.
_, matched_indices = index_en.search(data_zh, 3)

In [13]:
# Score the retrieval. For every Chinese title, print the reference
# translation next to the three nearest English candidates, and count how
# often the reference is the first hit (top 1) or among the three (top 3).
top1_correct, top3_correct = 0, 0
for i, ztitle in enumerate(chinese_titles):
    expected = translated_titles[i]
    # Resolve the three neighbour ids to headlines once and reuse them for
    # both the report and the scoring.
    predictions = [english_titles[matched_indices[i, rank]] for rank in range(3)]
    report = [f"Chinese:    {ztitle}", f"Correct:    {expected}"]
    report.extend(
        f"Predict({rank}): {title}"
        for rank, title in enumerate(predictions, start=1)
    )
    # Each record ends with a blank line before the separator.
    print("\n".join(report), end="\n\n")
    if predictions[0] == expected:
        top1_correct += 1
    if expected in predictions:
        top3_correct += 1
    print("-" * 20)

# Guard the division: with no titles collected, the ratios are undefined and
# the unguarded expression would raise ZeroDivisionError.
n_evaluated = len(chinese_titles)
if n_evaluated:
    print(f"Top 1 Accuracy: {top1_correct / n_evaluated * 100:.2f}%")
    print(f"Top 3 Accuracy: {top3_correct / n_evaluated * 100:.2f}%")
else:
    print("No Chinese titles to evaluate.")

Chinese:    中国游客在澳大利亚：中产新富和文化鸿沟
Correct:    In Tour of Australia, Chinese Admire Clean Air but Bemoan Lack of Hot Drinking Water
Predict(1): Australia Cancels Residency for Wealthy Chinese Donor Linked to Communist Party
Predict(2): Twitter Users in China Face Detention and Threats in New Beijing Crackdown
Predict(3): Xi Jinping’s New Year Tour: Dumplings and Riot Gear

--------------------
Chinese:    孤立、隐形和麻木：是枝裕和电影中的日本阴暗面
Correct:    ‘Shoplifters’ Director Pierces Japan’s Darker Side
Predict(1): Showmanship, and Conflict, as Darts Goes From Smoky Pubs to TV Spectacle
Predict(2): Dumplings, Figure Skating and Riot Gear: Xi Jinping’s New Year Tour
Predict(3): The Saturday Profile: ‘Shoplifters’ Director Pierces Japan’s Darker Side

--------------------
Chinese:    谢天谢地，我们还有加拿大！
Correct:    Thank God for Canada!
Predict(1): Thank God for Canada!
Predict(2): The Women in White: Praise From Trump, and Chants of ‘U.S.A.!’
Predict(3): A Moral Leader ... and Boring? Canadians Respond

-----